Source code for bolster.data_sources.nisra.planning_statistics

"""Northern Ireland Planning Activity Statistics.

Annual planning application statistics for Northern Ireland, published by
the Department for Infrastructure (DfI). Provides counts of planning
applications received, decided, approved and withdrawn, broken down by
Local Government District or NI-wide totals.

Data is fetched from the NISRA PxStat API. Coverage begins 2015/16.

Original data source:
    https://www.infrastructure-ni.gov.uk/articles/planning-activity-statistics

PxStat matrices used:
    - PALGD: Annual planning applications by Local Government District
    - PAPLGD: Annual approved planning applications by LGD and application type
    - PADAA: Annual planning applications by Assembly Area

Update Frequency:
    Annual (financial year April-March).

Geographic Coverage:
    Northern Ireland - whole-country totals plus the 11 local council areas
    (Antrim and Newtownabbey, Ards and North Down, Armagh City, Banbridge and
    Craigavon, Belfast, Causeway Coast and Glens, Derry City and Strabane,
    Fermanagh and Omagh, Lisburn and Castlereagh, Mid and East Antrim,
    Mid Ulster, Newry, Mourne and Down).

Example:
    >>> from bolster.data_sources.nisra import planning_statistics
    >>> df = planning_statistics.get_latest_data()
    >>> 'applications_received' in df.columns
    True
"""

from __future__ import annotations

import logging

import pandas as pd

from ._base import NISRAValidationError
from .pxstat import read_dataset

[docs] logger = logging.getLogger(__name__)
# PxStat matrix codes _MATRIX_LGD = "PALGD" _MATRIX_APPROVED_LGD = "PAPLGD" _MATRIX_ASSEMBLY_AREA = "PADAA" # Application status codes in PALGD _STATUS_RECEIVED = "Applications received" _STATUS_DECIDED = "Applications decided" _STATUS_APPROVED = "Applications approved" _STATUS_WITHDRAWN = "Applications withdrawn" # Metric codes in PADAA (API returns integers after pivot on STATISTIC column) _STAT_RECEIVED = 1 _STAT_DECIDED = 2 _STAT_APPROVED = 3 _STAT_APPROVAL_RATE = 4 _STAT_WITHDRAWN = 5 # NI-wide aggregate code — exclude from council/area outputs _NI_CODE = "N92000002" _VALID_QUARTERS = {"Q1", "Q2", "Q3", "Q4"} def _parse_financial_year_to_date(financial_year: str) -> pd.Timestamp: """Convert a financial year string like '2024/25' to a Timestamp. Returns the first day of the financial year (April 1). Args: financial_year: Financial year string like '2024/25'. Returns: Timestamp for April 1 of the start year. Example: >>> _parse_financial_year_to_date("2024/25") Timestamp('2024-04-01 00:00:00') """ start_year = int(financial_year.split("/")[0]) return pd.Timestamp(year=start_year, month=4, day=1)
[docs] def parse_planning_by_lgd(force_refresh: bool = False) -> pd.DataFrame: """Parse annual planning applications by Local Government District (PALGD). Args: force_refresh: Accepted for API compatibility but ignored. Returns: DataFrame with columns: financial_year, date, year, council, applications_received, applications_decided, applications_approved, applications_withdrawn, approval_rate. The NI-wide aggregate row (council='Northern Ireland') is included. """ raw = read_dataset(_MATRIX_LGD) lgd_col = "Local Government District" fy_col = "Financial year" status_col = "Application status" pivot = raw.pivot_table( index=[fy_col, lgd_col], columns=status_col, values="VALUE", aggfunc="first", ).reset_index() pivot.columns.name = None pivot = pivot.rename( columns={ fy_col: "financial_year", lgd_col: "council", _STATUS_RECEIVED: "applications_received", _STATUS_DECIDED: "applications_decided", _STATUS_APPROVED: "applications_approved", _STATUS_WITHDRAWN: "applications_withdrawn", } ) for col in ("applications_received", "applications_decided", "applications_approved", "applications_withdrawn"): if col in pivot.columns: pivot[col] = pd.to_numeric(pivot[col], errors="coerce") pivot["date"] = pivot["financial_year"].apply(_parse_financial_year_to_date) pivot["year"] = pivot["date"].dt.year # Compute approval rate from available columns if "applications_decided" in pivot.columns and "applications_approved" in pivot.columns: decided = pivot["applications_decided"].replace(0, float("nan")) pivot["approval_rate"] = (pivot["applications_approved"] / decided).round(4) else: pivot["approval_rate"] = float("nan") col_order = [ "financial_year", "date", "year", "council", "applications_received", "applications_decided", "applications_approved", "applications_withdrawn", "approval_rate", ] return ( pivot[[c for c in col_order if c in pivot.columns]] .sort_values(["financial_year", "council"]) .reset_index(drop=True) )
[docs] def parse_planning_by_assembly_area(force_refresh: bool = False) -> pd.DataFrame: """Parse annual planning applications by Assembly Area (PADAA). Args: force_refresh: Accepted for API compatibility but ignored. Returns: DataFrame with columns: financial_year, date, year, assembly_area, applications_received, applications_decided, applications_approved, applications_withdrawn, approval_rate. The NI-wide aggregate row is included. """ raw = read_dataset(_MATRIX_ASSEMBLY_AREA) aa_col = "Assembly Area" fy_col = "Financial year" stat_col = "STATISTIC" pivot = raw.pivot_table( index=[fy_col, aa_col], columns=stat_col, values="VALUE", aggfunc="first", ).reset_index() pivot.columns.name = None rename_map = { fy_col: "financial_year", aa_col: "assembly_area", _STAT_RECEIVED: "applications_received", _STAT_DECIDED: "applications_decided", _STAT_APPROVED: "applications_approved", _STAT_APPROVAL_RATE: "approval_rate_pct", _STAT_WITHDRAWN: "applications_withdrawn", } pivot = pivot.rename(columns={k: v for k, v in rename_map.items() if k in pivot.columns}) for col in ("applications_received", "applications_decided", "applications_approved", "applications_withdrawn"): if col in pivot.columns: pivot[col] = pd.to_numeric(pivot[col], errors="coerce") pivot["date"] = pivot["financial_year"].apply(_parse_financial_year_to_date) pivot["year"] = pivot["date"].dt.year if "approval_rate_pct" in pivot.columns: pivot["approval_rate"] = pd.to_numeric(pivot["approval_rate_pct"], errors="coerce") / 100 else: pivot["approval_rate"] = float("nan") col_order = [ "financial_year", "date", "year", "assembly_area", "applications_received", "applications_decided", "applications_approved", "applications_withdrawn", "approval_rate", ] return ( pivot[[c for c in col_order if c in pivot.columns]] .sort_values(["financial_year", "assembly_area"]) .reset_index(drop=True) )
[docs] def get_latest_data(force_refresh: bool = False) -> pd.DataFrame: """Download and return NI-wide annual planning applications (all LGDs). Args: force_refresh: Accepted for API compatibility but ignored; the PxStat API always returns the latest data without caching. Returns: DataFrame from :func:`parse_planning_by_lgd` with all councils and NI-wide totals. Example: >>> df = get_latest_data() >>> 'applications_received' in df.columns True """ return parse_planning_by_lgd()
[docs] def get_latest_council_data(force_refresh: bool = False) -> pd.DataFrame: """Return council-area planning applications (excludes NI aggregate). Args: force_refresh: Accepted for API compatibility but ignored; the PxStat API always returns the latest data without caching. Returns: DataFrame with one row per (financial_year, council), excluding the NI-wide total row. Example: >>> df = get_latest_council_data() >>> 'council' in df.columns True """ df = parse_planning_by_lgd() return df[df["council"] != "Northern Ireland"].reset_index(drop=True)
[docs] def get_latest_planning_statistics( dimension: str = "ni", financial_year: str | None = None, summary: bool = False, force_refresh: bool = False, ) -> pd.DataFrame: """Get planning application statistics for a given dimension. Args: dimension: Breakdown dimension — 'ni' for NI-wide total, 'council' for LGD breakdown, or 'assembly' for Assembly Area breakdown. financial_year: Optional financial year filter (e.g. '2024/25'). If None, all available years are returned. summary: If True, return a summary aggregated across all financial years for each area. force_refresh: Accepted for API compatibility but ignored; the PxStat API always returns the latest data without caching. Returns: DataFrame with planning application counts and approval rate. Raises: ValueError: If an unsupported dimension is given. """ if dimension == "ni": df = parse_planning_by_lgd() df = df[df["council"] == "Northern Ireland"].reset_index(drop=True) elif dimension == "council": df = get_latest_council_data() elif dimension == "assembly": df = parse_planning_by_assembly_area() df = df[df["assembly_area"] != "Northern Ireland"].reset_index(drop=True) else: raise ValueError(f"Unsupported dimension: {dimension!r}. Use 'ni', 'council', or 'assembly'.") if financial_year is not None: df = df[df["financial_year"] == financial_year].reset_index(drop=True) if summary: df = get_council_summary(df) return df
[docs] def validate_data(df: pd.DataFrame) -> bool: """Validate an annual planning applications DataFrame. Args: df: DataFrame from :func:`get_latest_data` or :func:`parse_planning_by_lgd`. Returns: True if all checks pass. Raises: NISRAValidationError: If the DataFrame is empty, missing required columns, has implausible values, or has too short a time series. Example: >>> df = get_latest_data() >>> validate_data(df) True """ if df is None or df.empty: raise NISRAValidationError("Planning DataFrame is empty") required = { "financial_year", "applications_received", "applications_decided", "applications_approved", "applications_withdrawn", "approval_rate", } missing = required - set(df.columns) if missing: raise NISRAValidationError(f"Missing required columns: {sorted(missing)}") if len(df) < 5: raise NISRAValidationError(f"Too few records ({len(df)}); expected 5+ annual records") for col in ("applications_received", "applications_decided", "applications_approved"): vals = df[col].dropna() if (vals < 0).any(): raise NISRAValidationError(f"Negative values found in {col}") if (vals > 500_000).any(): raise NISRAValidationError(f"Implausibly high values in {col} (>500,000)") rates = df["approval_rate"].dropna() if ((rates < 0) | (rates > 1.0001)).any(): raise NISRAValidationError("approval_rate outside the [0, 1] range") return True
[docs] def get_annual_totals(df: pd.DataFrame) -> pd.DataFrame: """Aggregate a DataFrame to annual (financial-year) totals across all areas. Args: df: DataFrame from :func:`get_latest_data` (may include council breakdown). Returns: DataFrame with one row per financial year and columns: financial_year, applications_received, applications_decided, applications_approved, applications_withdrawn, approval_rate. Example: >>> df = get_latest_data() >>> annual = get_annual_totals(df) >>> 'applications_received' in annual.columns True """ # If NI-wide row is present, use it directly to avoid double-counting if "council" in df.columns and "Northern Ireland" in df["council"].values: ni_df = df[df["council"] == "Northern Ireland"].copy() return ni_df[ [ "financial_year", "applications_received", "applications_decided", "applications_approved", "applications_withdrawn", "approval_rate", ] ].reset_index(drop=True) # Otherwise aggregate grouped = df.groupby("financial_year", sort=True).agg( applications_received=("applications_received", "sum"), applications_decided=("applications_decided", "sum"), applications_approved=("applications_approved", "sum"), applications_withdrawn=("applications_withdrawn", "sum"), ) decided = grouped["applications_decided"].replace(0, float("nan")) grouped["approval_rate"] = (grouped["applications_approved"] / decided).round(4) return grouped.reset_index()[ [ "financial_year", "applications_received", "applications_decided", "applications_approved", "applications_withdrawn", "approval_rate", ] ]
[docs] def get_council_summary(council_df: pd.DataFrame, financial_year: str | None = None) -> pd.DataFrame: """Summarise council-area data by council across all (or one) financial year. Args: council_df: DataFrame from :func:`get_latest_council_data`. financial_year: Optional financial year to filter to (e.g. '2024/25'). If None, summarises across all available years. Returns: DataFrame with one row per council, sorted by applications_received descending. Example: >>> council_df = get_latest_council_data() >>> summary = get_council_summary(council_df, financial_year='2024/25') >>> 'council' in summary.columns True """ df = council_df if financial_year is not None: df = df[df["financial_year"] == financial_year] if "council" not in df.columns: return df grouped = df.groupby("council", sort=False).agg( applications_received=("applications_received", "sum"), applications_decided=("applications_decided", "sum"), applications_approved=("applications_approved", "sum"), applications_withdrawn=("applications_withdrawn", "sum"), ) decided = grouped["applications_decided"].replace(0, pd.NA) grouped["approval_rate"] = (grouped["applications_approved"] / decided).round(4) return grouped.reset_index().sort_values("applications_received", ascending=False).reset_index(drop=True)