"""NISRA Drug-Related and Drug Misuse Deaths Data Source.
Provides access to NISRA's annual statistics on drug-related deaths and deaths
due to drug misuse in Northern Ireland via the NISRA PxStat API. Two geographic
breakdowns are available: HSC Trust and Local Government District.
NISRA distinguishes two related measures:
- **Drug-related deaths** (``DRDCOUNT``): All deaths where a drug was implicated,
including prescription medicines, controlled drugs, and accidental/intentional
poisonings.
- **Drug misuse deaths** (``MISUSECOUNT``): A narrower subset where the underlying
cause is drug abuse/dependence or the death involved a controlled substance.
Original data source:
https://www.nisra.gov.uk/statistics/death-statistics/drug-related-and-drug-misuse-deaths
PxStat matrices used:
- DTHSDRHSCT — annual deaths by HSC Trust (6 trusts + NI total)
- DTHSDRLGD — annual deaths by LGD (11 districts + NI total)
Update Frequency: Annual (typically May)
Geographic Coverage: Northern Ireland
Example:
>>> from bolster.data_sources.nisra import drug_related_deaths as drd
>>> df = drd.get_latest_drug_related_deaths()
>>> {"year", "geography", "statistic", "value"}.issubset(df.columns)
True
>>> len(df) > 0
True
"""
import logging
from typing import Literal
import pandas as pd
from bolster.data_sources.nisra.pxstat import read_dataset
[docs]
logger = logging.getLogger(__name__)
# PxStat matrix codes
_MATRIX_HSCT = "DTHSDRHSCT"
_MATRIX_LGD = "DTHSDRLGD"
[docs]
DimensionType = Literal["hsct", "lgd", "all"]
# Map STATISTIC code to a human-readable label
_STATISTIC_LABELS = {
"DRDCOUNT": "drug_related",
"MISUSECOUNT": "drug_misuse",
}
def _process_matrix(matrix: str, geo_col: str, geo_name_col: str) -> pd.DataFrame:
"""Fetch and tidy a drug deaths PxStat matrix.
Args:
matrix: PxStat matrix code.
geo_col: Column name for the geography code.
geo_name_col: Column name for the geography label.
Returns:
DataFrame with columns:
- ``year``: Registration year (int)
- ``geography_code``: geography identifier code
- ``geography``: geography name label
- ``statistic``: ``"drug_related"`` or ``"drug_misuse"``
- ``value``: Count of deaths (int, NaN where suppressed)
"""
df = read_dataset(matrix)
df = df.rename(
columns={
geo_col: "geography_code",
geo_name_col: "geography",
"Year": "year",
"VALUE": "value",
}
)
df["statistic"] = df["STATISTIC"].map(_STATISTIC_LABELS).fillna(df["STATISTIC"])
result = df[["year", "geography_code", "geography", "statistic", "value"]].copy()
return result.sort_values(["statistic", "year", "geography"]).reset_index(drop=True)
[docs]
def validate_data(df: pd.DataFrame) -> bool:
"""Validate a parsed drug-related deaths DataFrame.
Args:
df: DataFrame from :func:`get_latest_drug_related_deaths` with a single
geographic dimension.
Returns:
True if validation passes, False otherwise.
Example:
>>> import pandas as pd
>>> validate_data(pd.DataFrame())
False
"""
if df is None or df.empty:
logger.warning("Drug deaths data is empty")
return False
required_cols = {"year", "geography", "statistic", "value"}
if not required_cols.issubset(df.columns):
missing = required_cols - set(df.columns)
logger.warning("Missing required columns: %s", missing)
return False
non_null_values = df["value"].dropna()
if len(non_null_values) > 0 and (non_null_values < 0).any():
logger.warning("Found negative values in drug deaths data")
return False
# Need at least a few years of data to be a useful time series.
if df["year"].nunique() < 5:
logger.warning("Too few years of data: %d", df["year"].nunique())
return False
return True