# Source code for bolster.data_sources.nisra.disease_prevalence

"""NISRA Disease Prevalence Module.

Provides access to Northern Ireland's disease prevalence statistics from
GP clinical disease registers (Quality & Outcomes Framework, QOF).
Data are released annually after National Prevalence Day.

Data Coverage:
    - Financial years 2017/18 to present (extended annually)
    - NI-level: registered patients per disease register and prevalence
      per 1,000 patients
    - By Local Government District (LGD): same metrics per council
    - By HSC Trust: same metrics per Trust

Disease Registers (17):
    Asthma, Atrial Fibrillation, Cancer, Chronic Kidney Disease,
    Chronic Obstructive Pulmonary Disease, Coronary Heart Disease,
    Dementia, Depression, Diabetes Mellitus, Heart Failure 1,
    Heart Failure 3, Hypertension, Mental Health,
    Non-Diabetic Hyperglycaemia, Osteoporosis, Rheumatoid Arthritis,
    Stroke & TIA

Original data sources:
    https://www.opendatani.gov.uk/dataset/gp-practice-reference-file
    https://www.health-ni.gov.uk/topics/health-statistics/disease-prevalence

Data is fetched from the NISRA PxStat API using the following matrices:
    - DISPREVNI: NI-wide annual prevalence by disease
    - DISPREVLGD: Annual prevalence by LGD and disease
    - DISPREVHSCT: Annual prevalence by HSC Trust and disease

Update Frequency:
    Annual, approximately May of the following calendar year.

Example:
    >>> from bolster.data_sources.nisra import disease_prevalence as dp
    >>> df = dp.get_latest_disease_prevalence()
    >>> 'registered_patients' in df.columns
    True
    >>> 'prevalence_per_1000' in df.columns
    True
"""

import logging

import pandas as pd

from ._base import NISRAValidationError
from .pxstat import read_dataset

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
# PxStat matrix codes _MATRIX_NI = "DISPREVNI" _MATRIX_LGD = "DISPREVLGD" _MATRIX_HSCT = "DISPREVHSCT" # STATISTIC values _STAT_NUMREG = "Numreg" _STAT_PREV = "Rawprevalence1000" def _pivot_prevalence(raw: pd.DataFrame, group_col: str, output_col: str) -> pd.DataFrame: """Pivot a disease prevalence matrix to wide format. Args: raw: Raw DataFrame from read_dataset(). group_col: Column name for the geographic dimension (e.g. 'Disease'). output_col: Name for the geographic dimension in the output. Returns: DataFrame with columns: financial_year, year, {output_col}, disease, registered_patients, prevalence_per_1000. """ fy_col = "Financial Year" disease_col = "Disease" pivot = raw.pivot_table( index=[fy_col, group_col, disease_col], columns="STATISTIC", values="VALUE", aggfunc="first", ).reset_index() pivot.columns.name = None pivot = pivot.rename( columns={ fy_col: "financial_year", group_col: output_col, disease_col: "disease", _STAT_NUMREG: "registered_patients", _STAT_PREV: "prevalence_per_1000", } ) for col in ("registered_patients", "prevalence_per_1000"): if col in pivot.columns: pivot[col] = pd.to_numeric(pivot[col], errors="coerce") pivot["year"] = pivot["financial_year"].apply(lambda fy: int(str(fy).split("/")[0])) col_order = ["financial_year", "year", output_col, "disease", "registered_patients", "prevalence_per_1000"] return ( pivot[[c for c in col_order if c in pivot.columns]] .sort_values(["financial_year", output_col, "disease"]) .reset_index(drop=True) )
def get_ni_prevalence(force_refresh: bool = False) -> pd.DataFrame:
    """Return NI-wide annual disease prevalence from the DISPREVNI matrix.

    Args:
        force_refresh: Accepted for API compatibility but ignored; the
            PxStat API always returns the latest data without caching.

    Returns:
        DataFrame with columns: financial_year, year, disease,
        registered_patients, prevalence_per_1000, sorted by financial_year
        and disease.
    """

    def _start_year(fy):
        # "2017/18" -> 2017: the integer before the slash.
        return int(str(fy).split("/")[0])

    metric_names = ("registered_patients", "prevalence_per_1000")
    renames = {
        "Financial Year": "financial_year",
        "Disease": "disease",
        _STAT_NUMREG: "registered_patients",
        _STAT_PREV: "prevalence_per_1000",
    }

    source = read_dataset(_MATRIX_NI)

    # DISPREVNI carries no geographic dimension, so the pivot index is just
    # financial year and disease.
    wide = source.pivot_table(
        index=["Financial Year", "Disease"],
        columns="STATISTIC",
        values="VALUE",
        aggfunc="first",
    )
    wide = wide.reset_index()
    wide.columns.name = None
    wide = wide.rename(columns=renames)

    # Unparseable values become NaN rather than raising.
    for metric in metric_names:
        if metric in wide.columns:
            wide[metric] = pd.to_numeric(wide[metric], errors="coerce")

    wide["year"] = wide["financial_year"].apply(_start_year)

    wanted = ["financial_year", "year", "disease", *metric_names]
    present = [name for name in wanted if name in wide.columns]
    ordered = wide[present].sort_values(["financial_year", "disease"])
    return ordered.reset_index(drop=True)
def get_lgd_prevalence(force_refresh: bool = False) -> pd.DataFrame:
    """Return annual disease prevalence per Local Government District.

    Reads the DISPREVLGD matrix and pivots it to one row per
    (financial year, LGD, disease).

    Args:
        force_refresh: Accepted for API compatibility but ignored; the
            PxStat API always returns the latest data without caching.

    Returns:
        DataFrame with columns: financial_year, year, lgd, disease,
        registered_patients, prevalence_per_1000.
    """
    lgd_raw = read_dataset(_MATRIX_LGD)
    return _pivot_prevalence(
        lgd_raw,
        group_col="Local Government District",
        output_col="lgd",
    )
def get_hsct_prevalence(force_refresh: bool = False) -> pd.DataFrame:
    """Return annual disease prevalence per HSC Trust.

    Reads the DISPREVHSCT matrix and pivots it to one row per
    (financial year, Trust, disease).

    Args:
        force_refresh: Accepted for API compatibility but ignored; the
            PxStat API always returns the latest data without caching.

    Returns:
        DataFrame with columns: financial_year, year, trust, disease,
        registered_patients, prevalence_per_1000.
    """
    trust_raw = read_dataset(_MATRIX_HSCT)
    return _pivot_prevalence(
        trust_raw,
        group_col="Health and Social Care Trust",
        output_col="trust",
    )
[docs] def get_latest_disease_prevalence( force_refresh: bool = False, level: str = "ni", lcg: str | None = None, ) -> pd.DataFrame: """Get the latest NI disease prevalence data. Fetches data from the NISRA PxStat API. The ``level`` parameter controls geographic granularity; ``lcg`` filters to a specific Local Government District (when level='lgd'). Args: force_refresh: Accepted for API compatibility but ignored; the PxStat API always returns the latest data without caching. level: Geographic level — 'ni' for NI-wide (default), 'lgd' for Local Government District breakdown, or 'trust' for HSC Trust. lcg: Optional LGD name filter (used when level='lgd'). If provided, only rows for that LGD are returned. Returns: DataFrame with columns: financial_year, year, disease, registered_patients, prevalence_per_1000. When level='lgd', also includes an 'lgd' column. When level='trust', also includes a 'trust' column. Raises: ValueError: If level is not one of 'ni', 'lgd', or 'trust'. Example: >>> df = get_latest_disease_prevalence() >>> 'registered_patients' in df.columns True >>> 'prevalence_per_1000' in df.columns True """ if level == "ni": df = get_ni_prevalence() elif level == "lgd": df = get_lgd_prevalence() if lcg is not None: df = df[df["lgd"] == lcg].reset_index(drop=True) elif level == "trust": df = get_hsct_prevalence() else: raise ValueError(f"level must be 'ni', 'lgd', or 'trust', got {level!r}") return df
def validate_disease_prevalence(df: pd.DataFrame, level: str = "ni") -> bool:
    """Validate the disease prevalence DataFrame for internal consistency.

    Checks, in order: required columns are present, the frame is non-empty,
    at least 5 distinct diseases are present, the level-specific minimum
    number of financial years is met, prevalence values lie in [0, 1000],
    and registered patient counts are non-negative. NaN values are skipped
    by the range checks.

    Args:
        df: DataFrame as returned by :func:`get_latest_disease_prevalence`.
            A 'register' column is accepted in place of 'disease' for
            backward compatibility.
        level: Validation mode.

            - 'ni' (default): requires at least 5 distinct financial years.
            - 'lgd' / 'trust': no minimum number of financial years.
            - 'gp': legacy alias kept for backward compatibility; requires
              at least 3 distinct financial years.

    Returns:
        True if all checks pass.

    Raises:
        NISRAValidationError: Describing the first failing check.
        ValueError: If level is not a recognised value.

    Example:
        >>> import pandas as pd
        >>> diseases = ["Asthma", "Cancer", "Dementia", "Depression", "Hypertension"]
        >>> df = pd.DataFrame(
        ...     [
        ...         {
        ...             "year": y,
        ...             "financial_year": f"{y}/{(y + 1) % 100:02d}",
        ...             "disease": d,
        ...             "registered_patients": 1000.0,
        ...             "prevalence_per_1000": 50.0,
        ...         }
        ...         for y in range(2017, 2022)
        ...         for d in diseases
        ...     ]
        ... )
        >>> validate_disease_prevalence(df)
        True
    """
    if level not in ("ni", "lgd", "trust", "gp"):
        raise ValueError(f"level must be 'ni', 'lgd', 'trust', or 'gp', got {level!r}")

    required = {"financial_year", "year", "disease", "registered_patients", "prevalence_per_1000"}

    # Accept 'register' as alias for 'disease' (backward compat with old Excel-based module)
    if "register" in df.columns and "disease" not in df.columns:
        df = df.rename(columns={"register": "disease"})

    missing = required - set(df.columns)
    if missing:
        raise NISRAValidationError(f"Missing required columns: {missing}")

    if df.empty:
        raise NISRAValidationError("DataFrame is empty")

    n_diseases = df["disease"].nunique()
    if n_diseases < 5:
        raise NISRAValidationError(f"Too few disease registers: expected ≥ 5, got {n_diseases}")

    # Minimum distinct financial years per level; 'lgd' and 'trust' have none.
    min_years = {"ni": 5, "gp": 3}.get(level)
    if min_years is not None:
        n_years = df["financial_year"].nunique()
        if n_years < min_years:
            raise NISRAValidationError(f"Too few financial years: expected ≥ {min_years}, got {n_years}")

    # A rate per 1,000 patients must lie within [0, 1000].
    prev = df["prevalence_per_1000"].dropna()
    if (prev < 0).any():
        bad = prev[prev < 0]
        raise NISRAValidationError(f"prevalence_per_1000 has {len(bad)} negative values: {bad.head().tolist()}")
    if (prev > 1000).any():
        bad = prev[prev > 1000]
        raise NISRAValidationError(f"prevalence_per_1000 has {len(bad)} values above 1000: {bad.head().tolist()}")

    patients = df["registered_patients"].dropna()
    if (patients < 0).any():
        bad = patients[patients < 0]
        raise NISRAValidationError(f"registered_patients has {len(bad)} negative values: {bad.head().tolist()}")

    return True