# Source code for bolster.data_sources.nisra.drug_related_deaths

"""NISRA Drug-Related and Drug Misuse Deaths Data Source.

Provides access to NISRA's annual statistics on drug-related deaths and deaths
due to drug misuse in Northern Ireland via the NISRA PxStat API.  Two geographic
breakdowns are available: HSC Trust and Local Government District.

NISRA distinguishes two related measures:

- **Drug-related deaths** (``DRDCOUNT``): All deaths where a drug was implicated,
  including prescription medicines, controlled drugs, and accidental/intentional
  poisonings.
- **Drug misuse deaths** (``MISUSECOUNT``): A narrower subset where the underlying
  cause is drug abuse/dependence or the death involved a controlled substance.

Original data source:
    https://www.nisra.gov.uk/statistics/death-statistics/drug-related-and-drug-misuse-deaths

PxStat matrices used:
    - DTHSDRHSCT — annual deaths by HSC Trust (6 trusts + NI total)
    - DTHSDRLGD  — annual deaths by LGD (11 districts + NI total)

Update Frequency: Annual (typically May)
Geographic Coverage: Northern Ireland

Example:
    >>> from bolster.data_sources.nisra import drug_related_deaths as drd
    >>> df = drd.get_latest_drug_related_deaths()
    >>> {"year", "geography", "statistic", "value"}.issubset(df.columns)
    True
    >>> len(df) > 0
    True
"""

import logging
from typing import Literal

import pandas as pd

from bolster.data_sources.nisra.pxstat import read_dataset

# Module-level logger; validation warnings in this module are emitted through it.
logger = logging.getLogger(__name__)

# PxStat matrix codes
_MATRIX_HSCT = "DTHSDRHSCT"  # annual deaths by HSC Trust
_MATRIX_LGD = "DTHSDRLGD"  # annual deaths by Local Government District

# Allowed geographic-dimension selectors.
# NOTE(review): "all" presumably means both breakdowns; confirm against the callers.
DimensionType = Literal["hsct", "lgd", "all"]
# Map STATISTIC code to a human-readable label
_STATISTIC_LABELS = {
    "DRDCOUNT": "drug_related",
    "MISUSECOUNT": "drug_misuse",
}


def _process_matrix(matrix: str, geo_col: str, geo_name_col: str) -> pd.DataFrame:
    """Fetch and tidy a drug deaths PxStat matrix.

    Args:
        matrix: PxStat matrix code.
        geo_col: Column name for the geography code.
        geo_name_col: Column name for the geography label.

    Returns:
        DataFrame sorted by ``statistic``, ``year`` and ``geography`` (with a
        fresh 0-based index) and with columns:

        - ``year``: Registration year (int)
        - ``geography_code``: geography identifier code
        - ``geography``: geography name label
        - ``statistic``: ``"drug_related"`` or ``"drug_misuse"``
        - ``value``: Count of deaths (int, NaN where suppressed)

        No dtype coercion is applied here; column types are whatever
        ``read_dataset`` returns.
    """
    df = read_dataset(matrix)

    # Normalise the PxStat column names to the module's tidy schema.
    df = df.rename(
        columns={
            geo_col: "geography_code",
            geo_name_col: "geography",
            "Year": "year",
            "VALUE": "value",
        }
    )

    # Translate known statistic codes; unknown codes fall through unchanged
    # instead of becoming NaN.
    df["statistic"] = df["STATISTIC"].map(_STATISTIC_LABELS).fillna(df["STATISTIC"])

    result = df[["year", "geography_code", "geography", "statistic", "value"]].copy()
    return result.sort_values(["statistic", "year", "geography"]).reset_index(drop=True)
def validate_data(df: pd.DataFrame) -> bool:
    """Validate a parsed drug-related deaths DataFrame.

    The checks run in order and the first failure logs a warning and
    returns False:

    1. The frame is not ``None`` and not empty.
    2. The columns ``year``, ``geography``, ``statistic`` and ``value`` exist.
    3. No non-null ``value`` is negative.
    4. At least five distinct ``year`` values are present.

    Args:
        df: DataFrame from :func:`get_latest_drug_related_deaths` with a single
            geographic dimension.

    Returns:
        True if validation passes, False otherwise.

    Example:
        >>> import pandas as pd
        >>> validate_data(pd.DataFrame())
        False
    """
    if df is None or df.empty:
        logger.warning("Drug deaths data is empty")
        return False

    required_cols = {"year", "geography", "statistic", "value"}
    missing = required_cols - set(df.columns)
    if missing:
        logger.warning("Missing required columns: %s", missing)
        return False

    # Null values are ignored: only the values that are present must be
    # non-negative. ``.any()`` on an empty Series is False, so an all-null
    # column passes this check.
    non_null_values = df["value"].dropna()
    if (non_null_values < 0).any():
        logger.warning("Found negative values in drug deaths data")
        return False

    # Need at least a few years of data to be a useful time series.
    year_count = df["year"].nunique()
    if year_count < 5:
        logger.warning("Too few years of data: %d", year_count)
        return False

    return True