# Source code for bolster.data_sources.nisra.claimant_count

"""NISRA Claimant Count Statistics Module.

Provides monthly Claimant Count statistics for Northern Ireland via the NISRA
PxStat API.  The Claimant Count is an experimental statistic measuring the
number of people claiming benefits principally for the reason of being
unemployed.  Data is published monthly and covers Northern Ireland with two
geographic breakdowns: Local Government Districts and Assembly Areas.

Original data source:
    https://www.nisra.gov.uk/statistics/labour-market-and-social-welfare/claimant-count

PxStat matrices used:
    - CCMLGD  — monthly count and rate by LGD (11 districts + NI total)
    - CCMAA   — monthly count and rate by Assembly Area (18 areas + NI total)
    - CCMSOA  — monthly count by Super Output Area (~92 k rows, large)

Update Frequency: Monthly, approximately 2–3 weeks after the reference month.

Usage:
    >>> from bolster.data_sources.nisra import claimant_count
    >>> df = claimant_count.get_latest_claimant_count("lgd")
    >>> "claimants_total" in df.columns
    True

Example:
    >>> from bolster.data_sources.nisra import claimant_count
    >>> df = claimant_count.get_latest_claimant_count("lgd")
    >>> df[df["geography"] == "Northern Ireland"]["claimants_total"].iloc[0] > 0
    True
"""

import logging

import pandas as pd

from bolster.data_sources.nisra.pxstat import read_dataset

# Module-level logger; validate_claimant_count reports failed checks through it.
logger = logging.getLogger(__name__)
# PxStat matrix codes
_MATRIX_LGD = "CCMLGD"
_MATRIX_AA = "CCMAA"
_MATRIX_SOA = "CCMSOA"


def _parse_month(month_str: str) -> pd.Timestamp:
    """Convert a PxStat month code (e.g. ``'2024M03'``) to a Timestamp.

    Args:
        month_str: Month code in the form ``'{YYYY}M{MM}'``.

    Returns:
        pandas Timestamp for the first day of the month.

    Raises:
        ValueError: If ``month_str`` does not split into exactly two
            integer parts around a single ``'M'``.

    Example:
        >>> _parse_month("2024M03")
        Timestamp('2024-03-01 00:00:00')
    """
    year, month = month_str.split("M")
    return pd.Timestamp(year=int(year), month=int(month), day=1)


def _pivot_geo(df_raw: pd.DataFrame, geo_col: str, geo_name_col: str) -> pd.DataFrame:
    """Pivot a raw PxStat geography DataFrame into the standard output shape.

    The API returns CCN (count) and CCP (rate) as separate rows identified by
    the ``STATISTIC`` column.  This function pivots them into columns named
    ``claimants_total`` and ``claimant_rate_total_pct``.  The input frame is
    not modified.

    Args:
        df_raw: Raw DataFrame from
            :func:`~bolster.data_sources.nisra.pxstat.read_dataset`.  Must
            contain the columns ``STATISTIC``, ``Month``, ``VALUE``,
            ``geo_col`` and ``geo_name_col``.
        geo_col: Column name for the geography code (e.g. ``'LGD2014'``).
        geo_name_col: Column name for the geography label
            (e.g. ``'Local Government District'``).

    Returns:
        DataFrame sorted by ``date`` then ``geography``, with columns:

        - ``date``: pandas Timestamp (monthly, day=1)
        - ``geography_code``: geography identifier code
        - ``geography``: geography name label
        - ``claimants_total``: claimant count
        - ``claimant_rate_total_pct``: claimant rate as percentage
    """
    # ``assign`` returns a new frame, so the caller's DataFrame is left untouched.
    df = df_raw.assign(date=df_raw["Month"].map(_parse_month))

    counts = df.loc[df["STATISTIC"] == "CCN", ["date", geo_col, geo_name_col, "VALUE"]].rename(
        columns={geo_col: "geography_code", geo_name_col: "geography", "VALUE": "claimants_total"}
    )
    rates = df.loc[df["STATISTIC"] == "CCP", ["date", geo_col, "VALUE"]].rename(
        columns={geo_col: "geography_code", "VALUE": "claimant_rate_total_pct"}
    )

    # Left join: every count row is kept; a count with no matching rate row
    # gets NaN in ``claimant_rate_total_pct``.
    merged = counts.merge(rates, on=["date", "geography_code"], how="left")
    return merged.sort_values(["date", "geography"]).reset_index(drop=True)
def get_latest_claimant_count(
    breakdown: str = "lgd",
    adjusted: bool = True,
    force_refresh: bool = False,
) -> pd.DataFrame:
    """Download and return the latest NISRA claimant count data.

    Fetches data from the NISRA PxStat API for the chosen geographic
    breakdown.  Returns the full time series available from the API
    (from January 2005).

    Note:
        ``adjusted`` is accepted for API compatibility but is ignored — the
        PxStat API does not distinguish seasonally adjusted from unadjusted
        counts at geographic level.

        ``force_refresh`` is accepted for API compatibility but is ignored —
        the PxStat API is called directly with no local cache layer.

    Args:
        breakdown: Geographic breakdown to return.  One of:

            - ``"lgd"`` — 11 Local Government Districts + NI total (default)
            - ``"aa"`` — 18 Assembly Areas + NI total
            - ``"soa"`` — Super Output Areas (~92 k rows, large)
        adjusted: Ignored.  Retained for API compatibility.
        force_refresh: Ignored.  Retained for API compatibility.

    Returns:
        For ``"lgd"`` and ``"aa"``: DataFrame with columns:

        - ``date``: pandas Timestamp (monthly, day=1)
        - ``geography_code``: geography identifier code
        - ``geography``: geography name label
        - ``claimants_total``: claimant count (float)
        - ``claimant_rate_total_pct``: claimant rate as percentage (float)

        For ``"soa"``: DataFrame with columns:

        - ``date``: pandas Timestamp (monthly, day=1)
        - ``soa_code``: Super Output Area code
        - ``soa_name``: Super Output Area name
        - ``claimants``: claimant count (float)

    Raises:
        ValueError: If ``breakdown`` is not a supported value.
        KeyError: If the SOA matrix does not expose a recognisable SOA code
            or SOA name column.

    Example:
        >>> df = get_latest_claimant_count("lgd")
        >>> "claimants_total" in df.columns
        True
        >>> df_aa = get_latest_claimant_count("aa")
        >>> "claimants_total" in df_aa.columns
        True
    """
    valid = ("lgd", "aa", "soa")
    # Validate before any network call so bad input fails fast.
    if breakdown not in valid:
        raise ValueError(f"breakdown must be one of {valid}, got {breakdown!r}")

    if breakdown == "lgd":
        return _pivot_geo(read_dataset(_MATRIX_LGD), "LGD2014", "Local Government District")

    if breakdown == "aa":
        return _pivot_geo(read_dataset(_MATRIX_AA), "AA", "Assembly Area")

    # Only "soa" remains after the checks above.
    return _tidy_soa(read_dataset(_MATRIX_SOA))


def _tidy_soa(raw: pd.DataFrame) -> pd.DataFrame:
    """Reshape the raw SOA matrix into ``date``/``soa_code``/``soa_name``/``claimants``."""
    # SOA matrix columns: STATISTIC, Statistic Label, TLIST(M1), Month, SOA,
    # <SOA name col>, UNIT, VALUE.  Everything outside this fixed set is a
    # candidate for the SOA code / name columns.
    fixed = {"STATISTIC", "Statistic Label", "TLIST(M1)", "Month", "UNIT", "VALUE", "date"}
    candidates = [c for c in raw.columns if c not in fixed]

    # Prefer the known code column; otherwise take the first column whose
    # name mentions "SOA".
    if "SOA2001" in candidates:
        soa_code_col = "SOA2001"
    else:
        soa_code_col = next((c for c in candidates if "SOA" in c), None)
    if soa_code_col is None:
        raise KeyError(f"No SOA code column found in SOA matrix columns: {list(raw.columns)}")

    # The name column is the first remaining non-fixed column.
    soa_name_col = next((c for c in candidates if c != soa_code_col), None)
    if soa_name_col is None:
        raise KeyError(f"No SOA name column found in SOA matrix columns: {list(raw.columns)}")

    # Build the output from only the needed columns; the input frame is not
    # modified.
    tidy = pd.DataFrame(
        {
            "date": raw["Month"].map(_parse_month),
            "soa_code": raw[soa_code_col],
            "soa_name": raw[soa_name_col],
            "claimants": raw["VALUE"],
        }
    )
    return tidy.sort_values(["date", "soa_code"]).reset_index(drop=True)
def validate_claimant_count(df: pd.DataFrame, breakdown: str) -> bool:
    """Validate the integrity of a claimant count DataFrame.

    Checks that the DataFrame is non-empty, that the columns required for
    the given breakdown are present, that counts are non-negative and, for
    the ``"lgd"`` and ``"aa"`` breakdowns, that non-missing rates lie in
    ``[0, 100]``.  Every failed check is logged as a warning.

    Args:
        df: DataFrame returned by :func:`get_latest_claimant_count`.
        breakdown: The breakdown type that produced the DataFrame.
            One of ``"lgd"``, ``"aa"``, or ``"soa"``.

    Returns:
        ``True`` if validation passes, ``False`` otherwise.

    Example:
        >>> import pandas as pd
        >>> validate_claimant_count(pd.DataFrame(), "lgd")
        False
    """
    if df is None or df.empty:
        logger.warning("Claimant count DataFrame is empty (breakdown=%s)", breakdown)
        return False

    # "lgd" and "aa" share one output shape.
    geo_columns = ["date", "geography_code", "geography", "claimants_total", "claimant_rate_total_pct"]
    required_columns: dict[str, list[str]] = {
        "lgd": geo_columns,
        "aa": geo_columns,
        # soa_name is not required here.
        "soa": ["date", "soa_code", "claimants"],
    }

    if breakdown not in required_columns:
        logger.warning("Unknown breakdown type: %s", breakdown)
        return False

    missing = [c for c in required_columns[breakdown] if c not in df.columns]
    if missing:
        logger.warning("Missing columns for %s breakdown: %s", breakdown, missing)
        return False

    if breakdown == "soa":
        if (df["claimants"] < 0).any():
            logger.warning("Negative claimant counts in SOA data")
            return False
        return True

    # breakdown is "lgd" or "aa" from here on.
    if (df["claimants_total"] < 0).any():
        logger.warning("Negative claimant counts in %s data", breakdown)
        return False

    # Missing rates are tolerated; only present values are range-checked.
    rates = df["claimant_rate_total_pct"].dropna()
    if len(rates) > 0 and ((rates < 0).any() or (rates > 100).any()):
        logger.warning("Claimant rates out of range [0, 100] in %s data", breakdown)
        return False

    return True