Source code for bolster.data_sources.nisra.pxstat

"""NISRA PxStat API client.

Provides access to the NISRA open data API at https://data.nisra.gov.uk/,
powered by the PxStat platform (https://github.com/CSOIreland/PxStat).

The API is publicly accessible without authentication and has no observed
rate limits, making it a more reliable alternative to scraping Excel files
from publication pages.

API endpoint pattern::

    GET https://ws-data.nisra.gov.uk/public/api.restful/
        PxStat.Data.Cube_API.ReadDataset/{MATRIX}/CSV/1.0/en

CSV responses include a UTF-8 BOM — always decode with ``utf-8-sig``.

Example::

    >>> from bolster.data_sources.nisra.pxstat import read_dataset
    >>> df = read_dataset("WDTHS")
    >>> "VALUE" in df.columns
    True
"""

import logging
from io import StringIO

import pandas as pd

from bolster.utils.web import session


[docs]
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


# Fixed prefix of every ReadDataset request; read_dataset appends
# "/{matrix}/CSV/1.0/en" to it.
_BASE_URL = (
    "https://ws-data.nisra.gov.uk/public/api.restful/"
    "PxStat.Data.Cube_API.ReadDataset"
)
# Headers sent with every request issued by read_dataset.
# NOTE(review): presumably the API expects the data portal as Referer — confirm.
_HEADERS = {
    "Referer": "https://data.nisra.gov.uk/",
}



[docs]
class PxStatError(Exception):
    """Signal an unexpected response from the PxStat API.

    ``read_dataset`` raises this when the HTTP status code of the API
    response is not 200. The message carries the status code, the requested
    matrix code and the URL that was fetched.
    """




[docs]
def read_dataset(matrix: str, timeout: int = 30) -> pd.DataFrame:
    """Download one PxStat dataset as CSV and parse it into a DataFrame.

    The request is sent through the shared ``session`` to
    ``{_BASE_URL}/{matrix}/CSV/1.0/en`` with the module's ``_HEADERS``.
    Exceptions raised by ``session.get`` or ``pandas.read_csv`` are not
    caught here and propagate to the caller.

    Args:
        matrix: Dataset matrix code (e.g. ``"WDTHS"`` for weekly deaths). It
            is placed into the URL path exactly as given.
        timeout: HTTP request timeout in seconds, forwarded to
            ``session.get``.

    Returns:
        DataFrame parsed from the response body with the API's column names
        left untouched. The ``VALUE`` column contains the numeric values;
        all other columns are dimension labels and codes.

    Raises:
        PxStatError: If the response status code is anything other than 200.

    Example:
        >>> df = read_dataset("WDTHS")
        >>> "VALUE" in df.columns
        True
    """
    endpoint = f"{_BASE_URL}/{matrix}/CSV/1.0/en"
    reply = session.get(endpoint, headers=_HEADERS, timeout=timeout)

    status = reply.status_code
    if status != 200:
        raise PxStatError(f"PxStat API returned {status} for matrix {matrix!r}: {endpoint}")

    # The CSV payload starts with a UTF-8 BOM; "utf-8-sig" strips it so it
    # does not leak into the first column name.
    csv_text = reply.content.decode("utf-8-sig")
    frame = pd.read_csv(StringIO(csv_text))

    logger.debug(
        "PxStat %s: %d rows, columns: %s",
        matrix,
        len(frame),
        list(frame.columns),
    )
    return frame