Source code for bolster.data_sources.boe_base_rate

"""Bank of England official Bank Rate (base rate).

Wraps the Bank of England's published base-rate spreadsheet to provide
standardised access to the UK policy interest rate:

- A **unified daily rate** series back to 1973, formed by coalescing the
  successive rate regimes the BoE has used (Minimum Lending Rate, Minimum
  Band 1 Dealing Rate, Repo Rate and the current Official Bank Rate) into a
  single continuous level.
- An **event-based history of rate changes** back to 1694, exposed via
  :func:`get_rate_changes`.

The base rate is a *level* (a standing interest rate), not a flow, so when it
is resampled to coarser resolutions the **last observation** in each period is
used rather than a sum or mean.

This is the second of three macroeconomic context modules and emits the same
**fixed output schema** as :mod:`bolster.data_sources.ons_cpi`, so that the
macro series can be joined and resampled against one another:

==========  ==========================================================
Column      Description
==========  ==========================================================
date        ``datetime64[ns]`` period-start (Jan 2024 -> 2024-01-01,
            Q1 2024 -> 2024-01-01, year 2024 -> 2024-01-01,
            a daily observation keeps its own date)
year        ``int`` calendar year
quarter     ``str`` "Q1".."Q4", or ``pd.NA`` for annual/monthly/daily
month       ``int`` 1-12, or ``pd.NA`` for quarterly/annual/daily
resolution  ``str`` "daily" | "monthly" | "quarterly" | "annual"
series      ``str`` always "base_rate"
value       ``float`` rate in per-cent
unit        ``str`` always "%"
geography   ``str`` always "UK"
source      ``str`` always "BoE"
==========  ==========================================================

.. note::
    The Bank of England's dynamic statistics API
    (``/boe-apps/statistics-api/``, ``/boe-apps/sdw/``) returns HTTP 403 for
    automated requests. The static spreadsheet linked below is the only
    reliable programmatic route and is what this module uses.

Source:
    https://www.bankofengland.co.uk/boeapps/database/Bank-Rate.asp

Example:
    >>> from bolster.data_sources import boe_base_rate
    >>> df = boe_base_rate.get_latest_data(resolution="annual")  # doctest: +SKIP
    >>> sorted(df.columns)  # doctest: +SKIP
    ['date', 'geography', 'month', 'quarter', 'resolution', 'series', 'source', 'unit', 'value', 'year']
"""

import io
import logging

import pandas as pd

from bolster.utils.web import session

[docs] logger = logging.getLogger(__name__)
#: Static XLS published by the Bank of England (Excel 97-2003, needs ``xlrd``).
[docs] DATA_URL = "https://www.bankofengland.co.uk/-/media/boe/files/monetary-policy/baserate.xls"
#: Sheet holding the daily series (one column per historical rate regime).
[docs] RAW_DATA_SHEET = "Raw Data"
#: Sheet holding the irregular event-based rate changes since 1694.
[docs] HISTORICAL_SHEET = "HISTORICAL SINCE 1694"
#: Geography for all series exposed by this module.
[docs] GEOGRAPHY = "UK"
#: Data publisher.
[docs] SOURCE = "BoE"
#: Series identifier used across the macroeconomic modules.
[docs] SERIES = "base_rate"
#: Unit of the rate.
[docs] UNIT = "%"
#: Output schema column order shared across all macroeconomic modules.
[docs] SCHEMA_COLUMNS = [ "date", "year", "quarter", "month", "resolution", "series", "value", "unit", "geography", "source", ]
#: Schema for :func:`get_rate_changes` (no period-resolution context).
[docs] RATE_CHANGES_COLUMNS = [ "date", "year", "series", "value", "unit", "geography", "source", ]
#: Supported resolutions and how the period start maps to ``quarter``/``month``.
[docs] RESOLUTIONS = ("daily", "monthly", "quarterly", "annual")
#: Columns on the "Raw Data" sheet that carry a rate level (coalesced L->R). _RATE_COLUMNS = [ "Bank Rate", "Min Lending Rate", "Min Band 1 Dealing Rate", "Repo Rate", "Official Bank Rate", ] #: Three-letter month abbreviation -> month number. _MONTH_ABBR_TO_NUM = { "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6, "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12, } #: Quarter number -> label. _QUARTER_LABELS = {1: "Q1", 2: "Q2", 3: "Q3", 4: "Q4"}
[docs] class BoEDataError(Exception): """Base exception for Bank of England data errors."""
[docs] class BoEValidationError(BoEDataError): """Raised when a DataFrame fails :func:`validate_data`."""
def _download_workbook(force_refresh: bool = False) -> pd.ExcelFile: """Download the BoE base-rate workbook and return it as an ``ExcelFile``. The workbook is a binary ``.xls`` and is therefore not cached by the shared :class:`~bolster.utils.web.CachingSession` (which only caches HTML); ``force_refresh`` is accepted for API parity with the sibling macroeconomic modules but has no effect on this code path. Args: force_refresh: Accepted for API parity; binary downloads are not cached. Returns: A :class:`pandas.ExcelFile` opened with the ``xlrd`` engine. Raises: BoEDataError: If the download fails or the bytes are not a valid xls. """ del force_refresh # Binary responses are not cached; nothing to refresh. try: response = session.get(DATA_URL, timeout=60) response.raise_for_status() return pd.ExcelFile(io.BytesIO(response.content), engine="xlrd") except Exception as e: # noqa: BLE001 - re-wrapped as a domain error raise BoEDataError(f"Failed to fetch BoE base rate workbook from {DATA_URL}: {e}") from e def _coalesce_daily(workbook: pd.ExcelFile) -> pd.DataFrame: """Build the unified daily rate series from the "Raw Data" sheet. The sheet has one column per successive rate regime; on any given day only the column for the regime then in force is populated. The unified rate is the right-most non-null value across the rate columns for each row. Args: workbook: Open BoE workbook. Returns: DataFrame with ``date`` (datetime64) and ``value`` (float) columns, sorted by date with one row per calendar day. Raises: BoEDataError: If the sheet is missing expected columns or yields no rows. """ raw = workbook.parse(sheet_name=RAW_DATA_SHEET, header=1) if "Date" not in raw.columns: raise BoEDataError(f"'{RAW_DATA_SHEET}' sheet is missing a 'Date' column; got {list(raw.columns)}") present = [c for c in _RATE_COLUMNS if c in raw.columns] if not present: raise BoEDataError(f"'{RAW_DATA_SHEET}' sheet has no recognised rate columns; got {list(raw.columns)}") dates = pd.to_datetime(raw["Date"], errors="coerce") # Coerce each regime column to numeric first so the row-wise ffill operates # on float (not object) data, then take the right-most non-null value = the # rate in force that day. rates = raw[present].apply(pd.to_numeric, errors="coerce") value = rates.ffill(axis=1).iloc[:, -1] df = pd.DataFrame({"date": dates, "value": pd.to_numeric(value, errors="coerce")}) df = df.dropna(subset=["date", "value"]).sort_values("date").reset_index(drop=True) if df.empty: raise BoEDataError(f"No usable rows parsed from '{RAW_DATA_SHEET}' sheet") df["value"] = df["value"].astype("float64") return df def _empty_frame() -> pd.DataFrame: """Return an empty DataFrame with the canonical schema and dtypes.""" df = pd.DataFrame(columns=SCHEMA_COLUMNS) df["date"] = pd.to_datetime(df["date"]) df["year"] = df["year"].astype("Int64") df["value"] = df["value"].astype("float64") return df def _attach_schema(df: pd.DataFrame, resolution: str) -> pd.DataFrame: """Attach the shared schema columns to a ``date``/``value`` frame. Args: df: Frame with at least ``date`` and ``value`` columns. ``date`` must already be normalised to the period start for the resolution. resolution: One of :data:`RESOLUTIONS`. Returns: DataFrame conforming to :data:`SCHEMA_COLUMNS`. """ out = df.copy() out["date"] = pd.to_datetime(out["date"]) out["year"] = out["date"].dt.year.astype("Int64") if resolution == "monthly": out["month"] = out["date"].dt.month.astype("Int64") out["quarter"] = pd.NA elif resolution == "quarterly": out["quarter"] = out["date"].dt.quarter.map(_QUARTER_LABELS) out["month"] = pd.NA else: # daily and annual carry neither a quarter nor a month label out["quarter"] = pd.NA out["month"] = pd.NA out["resolution"] = resolution out["series"] = SERIES out["value"] = out["value"].astype("float64") out["unit"] = UNIT out["geography"] = GEOGRAPHY out["source"] = SOURCE return out[SCHEMA_COLUMNS].reset_index(drop=True)
[docs] def get_latest_data(resolution: str = "monthly", force_refresh: bool = False) -> pd.DataFrame: """Fetch the unified UK base rate at a given resolution. The base rate is a standing level, so coarser resolutions take the **last observation** in each period (i.e. the rate in force at period end). Args: resolution: "daily", "monthly" (default), "quarterly" or "annual". force_refresh: Accepted for API parity; binary downloads are not cached. Returns: DataFrame conforming to :data:`SCHEMA_COLUMNS`, sorted by ``date``. Raises: ValueError: If ``resolution`` is not recognised. BoEDataError: If the workbook download or parse fails. Example: >>> df = get_latest_data(resolution="annual") # doctest: +SKIP >>> df.iloc[-1][["series", "unit", "geography", "source"]].tolist() # doctest: +SKIP ['base_rate', '%', 'UK', 'BoE'] """ if resolution not in RESOLUTIONS: raise ValueError(f"Unknown resolution {resolution!r}. Valid: {sorted(RESOLUTIONS)}") workbook = _download_workbook(force_refresh=force_refresh) daily = _coalesce_daily(workbook) if daily.empty: return _empty_frame() if resolution == "daily": return _attach_schema(daily, "daily") freq = {"monthly": "MS", "quarterly": "QS", "annual": "YS"}[resolution] # Resample to the period start, taking the last rate observed in the period. resampled = daily.set_index("date")["value"].resample(freq).last().dropna() period = resampled.reset_index() period.columns = ["date", "value"] return _attach_schema(period, resolution)
[docs] def get_rate_changes(force_refresh: bool = False) -> pd.DataFrame: """Fetch the event-based history of base-rate changes back to 1694. Each row is a rate *change* effective on a given date (the value is the new rate). The series is irregular by nature. Where the source records only a year and month (the earliest entries pre-date daily records), the change is dated to the first of that month. Args: force_refresh: Accepted for API parity; binary downloads are not cached. Returns: DataFrame with columns :data:`RATE_CHANGES_COLUMNS`, sorted by ``date``. Raises: BoEDataError: If the workbook download or parse fails, or no changes could be parsed. Example: >>> df = get_rate_changes() # doctest: +SKIP >>> df.iloc[0][["date", "value"]].tolist() # doctest: +SKIP [Timestamp('1694-10-01 00:00:00'), 6.0] """ workbook = _download_workbook(force_refresh=force_refresh) raw = workbook.parse(sheet_name=HISTORICAL_SHEET, header=None) # Columns: 0=year, 1=day, 2=month abbr, 3=new rate. Year is sparse (only # filled on the first change of each year) so it is forward-filled. cols = raw.iloc[:, [0, 1, 2, 3]].copy() cols.columns = ["year", "day", "month", "value"] cols["year"] = cols["year"].ffill() cols["mnum"] = cols["month"].astype(str).str.strip().str[:3].str.title().map(_MONTH_ABBR_TO_NUM) cols = cols[cols["mnum"].notna()].copy() # drops header rows and footnotes cols["day"] = pd.to_numeric(cols["day"], errors="coerce").fillna(1).astype(int) cols["year"] = pd.to_numeric(cols["year"], errors="coerce") cols["value"] = pd.to_numeric(cols["value"], errors="coerce") cols = cols.dropna(subset=["year", "value"]) cols["date"] = pd.to_datetime( {"year": cols["year"].astype(int), "month": cols["mnum"].astype(int), "day": cols["day"]}, errors="coerce", ) cols = cols.dropna(subset=["date"]).sort_values("date").reset_index(drop=True) if cols.empty: raise BoEDataError(f"No rate changes parsed from '{HISTORICAL_SHEET}' sheet") out = pd.DataFrame( { "date": cols["date"], "year": cols["date"].dt.year.astype("Int64"), "series": SERIES, "value": cols["value"].astype("float64"), "unit": UNIT, "geography": GEOGRAPHY, "source": SOURCE, } ) return out[RATE_CHANGES_COLUMNS].reset_index(drop=True)
[docs] def validate_data(df: pd.DataFrame) -> bool: """Validate that a DataFrame conforms to the macroeconomic schema. Checks performed: - All :data:`SCHEMA_COLUMNS` are present. - At least one row is present. - ``geography`` is exclusively "UK" and ``source`` exclusively "BoE". - ``series`` is exclusively "base_rate" and ``unit`` exclusively "%". - ``resolution`` only contains known values. - ``value`` is numeric, non-null and within a sane 0-25% range. Args: df: DataFrame to validate. Returns: ``True`` if all checks pass. Raises: BoEValidationError: If any check fails. Example: >>> df = get_latest_data(resolution="annual") # doctest: +SKIP >>> validate_data(df) # doctest: +SKIP True """ missing = [c for c in SCHEMA_COLUMNS if c not in df.columns] if missing: raise BoEValidationError(f"Missing required columns: {missing}") if len(df) == 0: raise BoEValidationError("DataFrame is empty") bad_geo = set(df["geography"].unique()) - {GEOGRAPHY} if bad_geo: raise BoEValidationError(f"Unexpected geography values: {bad_geo}") bad_source = set(df["source"].unique()) - {SOURCE} if bad_source: raise BoEValidationError(f"Unexpected source values: {bad_source}") bad_series = set(df["series"].unique()) - {SERIES} if bad_series: raise BoEValidationError(f"Unexpected series values: {bad_series}") bad_unit = set(df["unit"].unique()) - {UNIT} if bad_unit: raise BoEValidationError(f"Unexpected unit values: {bad_unit}") bad_res = set(df["resolution"].unique()) - set(RESOLUTIONS) if bad_res: raise BoEValidationError(f"Unexpected resolution values: {bad_res}") if not pd.api.types.is_numeric_dtype(df["value"]): raise BoEValidationError("Column 'value' must be numeric") if df["value"].isna().any(): raise BoEValidationError("Column 'value' contains nulls") if (df["value"] < 0).any() or (df["value"] > 25).any(): raise BoEValidationError("Column 'value' has rates outside the plausible 0-25% range") return True