"""NISRA Claimant Count Statistics Module.
Provides monthly Claimant Count statistics for Northern Ireland via the NISRA
PxStat API. The Claimant Count is an experimental statistic measuring the
number of people claiming benefits principally for the reason of being
unemployed. Data is published monthly and covers Northern Ireland with three
geographic breakdowns: Local Government Districts, Assembly Areas and Super
Output Areas.
Original data source:
https://www.nisra.gov.uk/statistics/labour-market-and-social-welfare/claimant-count
PxStat matrices used:
- CCMLGD — monthly count and rate by LGD (11 districts + NI total)
- CCMAA — monthly count and rate by Assembly Area (18 areas + NI total)
- CCMSOA — monthly count by Super Output Area (~92 k rows, large)
Update Frequency: Monthly, approximately 2–3 weeks after the reference month.
Usage:
>>> from bolster.data_sources.nisra import claimant_count
>>> df = claimant_count.get_latest_claimant_count("lgd")
>>> "claimants_total" in df.columns
True
Example:
>>> from bolster.data_sources.nisra import claimant_count
>>> df = claimant_count.get_latest_claimant_count("lgd")
>>> df[df["geography"] == "Northern Ireland"]["claimants_total"].iloc[0] > 0
True
"""
import logging
import pandas as pd
from bolster.data_sources.nisra.pxstat import read_dataset
[docs]
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# PxStat matrix codes passed to read_dataset, one per geographic breakdown.
_MATRIX_LGD = "CCMLGD"  # Local Government Districts
_MATRIX_AA = "CCMAA"  # Assembly Areas
_MATRIX_SOA = "CCMSOA"  # Super Output Areas
def _parse_month(month_str: str) -> pd.Timestamp:
    """Turn a PxStat month code such as ``'2024M03'`` into a Timestamp.

    Args:
        month_str: Month code shaped like ``'{YYYY}M{MM}'``. The text is split
            on the letter ``M`` and each half is read as an integer.

    Returns:
        pandas Timestamp set to day 1 of the given year and month.

    Raises:
        ValueError: If the code does not split into exactly two parts, or a
            part is not a valid integer.

    Example:
        >>> _parse_month("2024M03")
        Timestamp('2024-03-01 00:00:00')
    """
    pieces = month_str.split("M")
    # Unpacking enforces the two-part shape before any integer conversion.
    year_text, month_text = pieces
    return pd.Timestamp(int(year_text), int(month_text), 1)
def _pivot_geo(df_raw: pd.DataFrame, geo_col: str, geo_name_col: str) -> pd.DataFrame:
    """Reshape a long-format PxStat geography frame into one row per month and area.

    Rows whose ``STATISTIC`` is ``"CCN"`` supply the claimant count and rows
    whose ``STATISTIC`` is ``"CCP"`` supply the claimant rate. The two subsets
    are joined on date and geography code so each statistic becomes a column.
    The input frame is not modified; all work happens on a copy.

    Args:
        df_raw: Raw DataFrame from :func:`~bolster.data_sources.nisra.pxstat.read_dataset`.
            Must contain ``Month``, ``STATISTIC`` and ``VALUE`` columns plus
            the two geography columns named below.
        geo_col: Column holding the geography code (e.g. ``'LGD2014'``).
        geo_name_col: Column holding the geography label
            (e.g. ``'Local Government District'``).

    Returns:
        DataFrame sorted by ``date`` then ``geography`` with a fresh index and
        columns:

        - ``date``: pandas Timestamp (monthly, day=1)
        - ``geography_code``: geography identifier code
        - ``geography``: geography name label
        - ``claimants_total``: ``VALUE`` of the ``CCN`` row
        - ``claimant_rate_total_pct``: ``VALUE`` of the matching ``CCP`` row
    """
    frame = df_raw.copy()
    frame["date"] = frame["Month"].apply(_parse_month)
    statistic = frame["STATISTIC"]

    # Count rows keep the label column; they form the left side of the join.
    counts = frame.loc[statistic == "CCN", ["date", geo_col, geo_name_col, "VALUE"]]
    counts = counts.rename(
        columns={geo_col: "geography_code", geo_name_col: "geography", "VALUE": "claimants_total"}
    )

    # Rate rows only need the join keys and the value itself.
    rates = frame.loc[statistic == "CCP", ["date", geo_col, "VALUE"]]
    rates = rates.rename(columns={geo_col: "geography_code", "VALUE": "claimant_rate_total_pct"})

    # Left join: every count row survives even when no rate row matches.
    combined = counts.merge(rates, on=["date", "geography_code"], how="left")
    ordered = combined.sort_values(["date", "geography"])
    return ordered.reset_index(drop=True)
[docs]
def get_latest_claimant_count(
    breakdown: str = "lgd",
    adjusted: bool = True,
    force_refresh: bool = False,
) -> pd.DataFrame:
    """Fetch NISRA claimant count data for one geographic breakdown.

    The matching PxStat matrix is read through ``read_dataset`` and reshaped.
    No date filtering is applied, so every month present in the dataset is
    returned.

    Note:
        ``adjusted`` and ``force_refresh`` are never read by this function.
        They remain in the signature only so existing callers keep working.

    Args:
        breakdown: Geographic breakdown to return. One of:

            - ``"lgd"`` — Local Government Districts (default)
            - ``"aa"`` — Assembly Areas
            - ``"soa"`` — Super Output Areas (large result)
        adjusted: Ignored. Retained for API compatibility.
        force_refresh: Ignored. Retained for API compatibility.

    Returns:
        For ``"lgd"`` and ``"aa"``: DataFrame with columns:

        - ``date``: pandas Timestamp (monthly, day=1)
        - ``geography_code``: geography identifier code
        - ``geography``: geography name label
        - ``claimants_total``: claimant count
        - ``claimant_rate_total_pct``: claimant rate as percentage

        For ``"soa"``: DataFrame sorted by ``date`` then ``soa_code`` with columns:

        - ``date``: pandas Timestamp (monthly, day=1)
        - ``soa_code``: Super Output Area code
        - ``soa_name``: Super Output Area name
        - ``claimants``: claimant count

    Raises:
        ValueError: If ``breakdown`` is not a supported value.
        IndexError: If the SOA dataset has no recognisable code or name column.

    Example:
        >>> df = get_latest_claimant_count("lgd")
        >>> "claimants_total" in df.columns
        True
    """
    supported = ("lgd", "aa", "soa")
    if breakdown not in supported:
        raise ValueError(f"breakdown must be one of {supported}, got {breakdown!r}")

    # LGD and Assembly Area matrices share a layout and differ only in the
    # matrix code and the names of their geography columns.
    geo_layouts = {
        "lgd": (_MATRIX_LGD, "LGD2014", "Local Government District"),
        "aa": (_MATRIX_AA, "AA", "Assembly Area"),
    }
    if breakdown in geo_layouts:
        matrix_code, code_col, label_col = geo_layouts[breakdown]
        return _pivot_geo(read_dataset(matrix_code), code_col, label_col)

    # Only "soa" remains past this point.
    soa_frame = read_dataset(_MATRIX_SOA).copy()
    soa_frame["date"] = soa_frame["Month"].apply(_parse_month)
    columns = soa_frame.columns

    # Prefer the explicit SOA2001 code column; otherwise take the first column
    # whose name mentions "SOA".
    if "SOA2001" in columns:
        soa_code_col = "SOA2001"
    else:
        soa_like = [name for name in columns if "SOA" in name and name != "STATISTIC"]
        soa_code_col = soa_like[0]

    # The name column is found by elimination: the first column that is not
    # one of the fixed PxStat columns, the code column, or the derived date.
    fixed_columns = {"STATISTIC", "Statistic Label", "TLIST(M1)", "Month", soa_code_col, "UNIT", "VALUE", "date"}
    unclaimed = [name for name in columns if name not in fixed_columns]
    soa_name_col = unclaimed[0]

    selected = soa_frame[["date", soa_code_col, soa_name_col, "VALUE"]]
    renamed = selected.rename(
        columns={soa_code_col: "soa_code", soa_name_col: "soa_name", "VALUE": "claimants"}
    )
    ordered = renamed.sort_values(["date", "soa_code"])
    return ordered.reset_index(drop=True)
[docs]
def validate_claimant_count(df: pd.DataFrame, breakdown: str) -> bool:
    """Run sanity checks on a claimant count DataFrame.

    The checks run in order and stop at the first failure, each logging a
    warning: the frame must be non-empty, ``breakdown`` must be known, every
    column required for that breakdown must be present, counts must not be
    negative and, for ``"lgd"`` / ``"aa"``, non-missing rates must lie in
    ``[0, 100]``.

    Args:
        df: DataFrame returned by :func:`get_latest_claimant_count`.
        breakdown: The breakdown type that produced the DataFrame.
            One of ``"lgd"``, ``"aa"``, or ``"soa"``.

    Returns:
        ``True`` if validation passes, ``False`` otherwise.

    Example:
        >>> import pandas as pd
        >>> validate_claimant_count(pd.DataFrame(), "lgd")
        False
    """
    if df is None or df.empty:
        logger.warning("Claimant count DataFrame is empty (breakdown=%s)", breakdown)
        return False

    # LGD and Assembly Area outputs share the same schema.
    geo_columns = ["date", "geography_code", "geography", "claimants_total", "claimant_rate_total_pct"]
    expected: dict[str, list[str]] = {
        "lgd": geo_columns,
        "aa": geo_columns,
        "soa": ["date", "soa_code", "claimants"],
    }
    if breakdown not in expected:
        logger.warning("Unknown breakdown type: %s", breakdown)
        return False

    absent = [name for name in expected[breakdown] if name not in df.columns]
    if absent:
        logger.warning("Missing columns for %s breakdown: %s", breakdown, absent)
        return False

    if breakdown == "soa":
        # SOA data carries counts only, so there is no rate to check.
        if (df["claimants"] < 0).any():
            logger.warning("Negative claimant counts in SOA data")
            return False
        return True

    # Remaining breakdowns are "lgd" and "aa".
    if (df["claimants_total"] < 0).any():
        logger.warning("Negative claimant counts in %s data", breakdown)
        return False

    # Missing rates are tolerated; only rates that are present are range-checked.
    known_rates = df["claimant_rate_total_pct"].dropna()
    if not known_rates.empty and ((known_rates < 0).any() or (known_rates > 100).any()):
        logger.warning("Claimant rates out of range [0, 100] in %s data", breakdown)
        return False

    return True