From de061fc166edbdbc83ba27931c9a8ad834bba8c3 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Sun, 15 Feb 2026 13:40:07 +0000 Subject: [PATCH 1/6] Replace ad-hoc targets with structured registry and source modules The old system had ~700 lines of hardcoded targets scattered across loss.py with no provenance. This replaces it with a targets registry (sources.yaml + per-source Python scrapers) that produces 570 national targets with source URLs, and factors local area data loading into dedicated source modules. National: pydantic Target schema, YAML registry, build_loss_matrix.py computes household columns from registry. Local: 4 source modules (local_age, local_income, local_uc, local_la_extras) replace inline file reads in constituencies/loss.py and local_authorities/loss.py. Both calibrate.py files updated to import create_national_target_matrix from the new location. Full pipeline runs end-to-end with zero skipped targets. Co-Authored-By: Claude Opus 4 --- .../datasets/create_datasets.py | 4 +- .../local_areas/constituencies/calibrate.py | 4 +- .../local_areas/constituencies/loss.py | 144 ++- .../local_authorities/calibrate.py | 4 +- .../local_areas/local_authorities/loss.py | 424 +++------ policyengine_uk_data/targets/__init__.py | 15 + .../targets/build_loss_matrix.py | 880 ++++++++++++++++++ policyengine_uk_data/targets/registry.py | 69 ++ policyengine_uk_data/targets/schema.py | 49 + policyengine_uk_data/targets/sources.yaml | 43 + .../targets/sources/__init__.py | 4 + policyengine_uk_data/targets/sources/dwp.py | 265 ++++++ .../targets/sources/hmrc_salary_sacrifice.py | 135 +++ .../targets/sources/hmrc_spi.py | 297 ++++++ .../targets/sources/housing.py | 37 + .../targets/sources/local_age.py | 100 ++ .../targets/sources/local_income.py | 96 ++ .../targets/sources/local_la_extras.py | 129 +++ .../targets/sources/local_uc.py | 42 + .../targets/sources/nts_vehicles.py | 49 + policyengine_uk_data/targets/sources/obr.py | 504 ++++++++++ 
.../targets/sources/ons_demographics.py | 331 +++++++ .../targets/sources/ons_households.py | 114 +++ .../targets/sources/ons_savings.py | 72 ++ .../targets/sources/ons_tenure.py | 119 +++ .../targets/sources/scottish_government.py | 37 + .../targets/sources/voa_council_tax.py | 63 ++ .../tests/test_target_registry.py | 103 ++ .../tests/test_vehicle_ownership.py | 2 +- policyengine_uk_data/utils/loss.py | 692 +------------- pyproject.toml | 2 + 31 files changed, 3754 insertions(+), 1075 deletions(-) create mode 100644 policyengine_uk_data/targets/__init__.py create mode 100644 policyengine_uk_data/targets/build_loss_matrix.py create mode 100644 policyengine_uk_data/targets/registry.py create mode 100644 policyengine_uk_data/targets/schema.py create mode 100644 policyengine_uk_data/targets/sources.yaml create mode 100644 policyengine_uk_data/targets/sources/__init__.py create mode 100644 policyengine_uk_data/targets/sources/dwp.py create mode 100644 policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py create mode 100644 policyengine_uk_data/targets/sources/hmrc_spi.py create mode 100644 policyengine_uk_data/targets/sources/housing.py create mode 100644 policyengine_uk_data/targets/sources/local_age.py create mode 100644 policyengine_uk_data/targets/sources/local_income.py create mode 100644 policyengine_uk_data/targets/sources/local_la_extras.py create mode 100644 policyengine_uk_data/targets/sources/local_uc.py create mode 100644 policyengine_uk_data/targets/sources/nts_vehicles.py create mode 100644 policyengine_uk_data/targets/sources/obr.py create mode 100644 policyengine_uk_data/targets/sources/ons_demographics.py create mode 100644 policyengine_uk_data/targets/sources/ons_households.py create mode 100644 policyengine_uk_data/targets/sources/ons_savings.py create mode 100644 policyengine_uk_data/targets/sources/ons_tenure.py create mode 100644 policyengine_uk_data/targets/sources/scottish_government.py create mode 100644 
policyengine_uk_data/targets/sources/voa_council_tax.py create mode 100644 policyengine_uk_data/tests/test_target_registry.py diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 641644a4..ed969d07 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -119,7 +119,9 @@ def main(): ) from policyengine_uk_data.datasets.local_areas.constituencies.loss import ( create_constituency_target_matrix, - create_national_target_matrix, + ) + from policyengine_uk_data.targets.build_loss_matrix import ( + create_target_matrix as create_national_target_matrix, ) from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import ( get_performance, diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py b/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py index b264559b..6ea99677 100644 --- a/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py @@ -2,7 +2,9 @@ from policyengine_uk_data.utils.calibrate import calibrate_local_areas from policyengine_uk_data.datasets.local_areas.constituencies.loss import ( create_constituency_target_matrix, - create_national_target_matrix, +) +from policyengine_uk_data.targets.build_loss_matrix import ( + create_target_matrix as create_national_target_matrix, ) from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py index 0397f0d9..0cbd291d 100644 --- a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py +++ b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py @@ -1,19 +1,36 @@ +"""Constituency-level calibration target matrix. 
+ +Constructs the (matrix, y, country_mask) triple for calibrating +household weights across 650 parliamentary constituencies. Target +data is loaded from source modules in the targets system. + +Sources: +- Age: ONS mid-year population estimates +- Income: HMRC SPI table 3.15 +- UC: DWP Stat-Xplore +""" + from policyengine_uk import Microsimulation import pandas as pd import numpy as np -from pathlib import Path -from policyengine_uk_data.utils.loss import ( - create_target_matrix as create_national_target_matrix, -) +from policyengine_uk.data import UKSingleYearDataset from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk_data.datasets.local_areas.constituencies.boundary_changes.mapping_matrix import ( mapping_matrix, ) -from policyengine_uk.data import UKSingleYearDataset -from policyengine_uk_data.utils.uc_data import uc_pc_households - -FOLDER = Path(__file__).parent +from policyengine_uk_data.targets.sources.local_age import ( + get_constituency_age_targets, + get_uk_total_population, +) +from policyengine_uk_data.targets.sources.local_income import ( + get_constituency_income_targets, + get_national_income_projections, + INCOME_VARIABLES, +) +from policyengine_uk_data.targets.sources.local_uc import ( + get_constituency_uc_targets, +) def create_constituency_target_matrix( @@ -23,26 +40,18 @@ def create_constituency_target_matrix( ): if time_period is None: time_period = dataset.time_period - ages = pd.read_csv(FOLDER / "targets" / "age.csv") - national_demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") - incomes = pd.read_csv(FOLDER / "targets" / "spi_by_constituency.csv") sim = Microsimulation(dataset=dataset, reform=reform) sim.default_calculation_period = dataset.time_period - national_incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv") - national_incomes = national_incomes[ - national_incomes.year - == max(national_incomes.year.min(), int(dataset.time_period)) - ] - matrix = pd.DataFrame() y = 
pd.DataFrame() - INCOME_VARIABLES = [ - "self_employment_income", - "employment_income", - ] + # ── Income targets ───────────────────────────────────────────── + incomes = get_constituency_income_targets() + national_incomes = get_national_income_projections( + int(dataset.time_period) + ) for income_variable in INCOME_VARIABLES: income_values = sim.calculate(income_variable).values @@ -56,84 +65,50 @@ def create_constituency_target_matrix( (national_incomes.total_income_lower_bound == 12_570) & (national_incomes.total_income_upper_bound == np.inf) ][income_variable + "_amount"].iloc[0] - national_consistency_adjustment_factor = ( - national_target / local_target_sum - ) - y[f"hmrc/{income_variable}/amount"] = ( - local_targets * national_consistency_adjustment_factor - ) + adjustment = national_target / local_target_sum + y[f"hmrc/{income_variable}/amount"] = local_targets * adjustment + matrix[f"hmrc/{income_variable}/count"] = sim.map_result( (income_values != 0) * in_spi_frame, "person", "household" ) - local_targets = incomes[f"{income_variable}_count"].values - local_target_sum = local_targets.sum() - national_target = national_incomes[ - (national_incomes.total_income_lower_bound == 12_570) - & (national_incomes.total_income_upper_bound == np.inf) - ][income_variable + "_count"].iloc[0] y[f"hmrc/{income_variable}/count"] = ( - incomes[f"{income_variable}_count"].values - * national_consistency_adjustment_factor + incomes[f"{income_variable}_count"].values * adjustment ) - uk_total_population = ( - national_demographics[national_demographics.name == "uk_population"][ - str(time_period) - ].values[0] - * 1e6 - ) + # ── Age targets ──────────────────────────────────────────────── + age_targets = get_constituency_age_targets() + uk_total_population = get_uk_total_population(int(time_period)) age = sim.calculate("age").values targets_total_pop = 0 - for lower_age in range(0, 80, 10): - upper_age = lower_age + 10 - - in_age_band = (age >= lower_age) & (age < 
upper_age) - - age_str = f"{lower_age}_{upper_age}" - matrix[f"age/{age_str}"] = sim.map_result( - in_age_band, "person", "household" - ) - - age_count = ages[ - [str(age) for age in range(lower_age, upper_age)] - ].sum(axis=1) - - age_str = f"{lower_age}_{upper_age}" - y[f"age/{age_str}"] = age_count.values - targets_total_pop += age_count.values.sum() - - # Adjust for consistency - for lower_age in range(0, 80, 10): - upper_age = lower_age + 10 - - in_age_band = (age >= lower_age) & (age < upper_age) - - age_str = f"{lower_age}_{upper_age}" - y[f"age/{age_str}"] *= uk_total_population / targets_total_pop * 0.9 - - # UC household count by constituency - y["uc_households"] = uc_pc_households.household_count.values + age_cols = [c for c in age_targets.columns if c.startswith("age/")] + for col in age_cols: + lower, upper = col.removeprefix("age/").split("_") + lower, upper = int(lower), int(upper) + in_band = (age >= lower) & (age < upper) + matrix[col] = sim.map_result(in_band, "person", "household") + y[col] = age_targets[col].values + targets_total_pop += age_targets[col].values.sum() + + # National consistency adjustment + for col in age_cols: + y[col] *= uk_total_population / targets_total_pop * 0.9 + + # ── UC targets ───────────────────────────────────────────────── + y["uc_households"] = get_constituency_uc_targets().values matrix["uc_households"] = sim.map_result( (sim.calculate("universal_credit").values > 0).astype(int), "benunit", "household", ) + # ── Boundary mapping (2010 → 2024) ──────────────────────────── const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv") - const_2010 = pd.read_csv(STORAGE_FOLDER / "constituencies_2010.csv") - - y_2010 = y.copy() - y_2010["name"] = const_2010["name"].values y_columns = list(y.columns) - y_values = mapping_matrix @ y.values # Transform to 2024 constituencies - + y_values = mapping_matrix @ y.values y = pd.DataFrame(y_values, columns=y_columns) - y_2024 = y.copy() - y_2024["name"] = 
const_2024["name"].values - country_mask = create_country_mask( household_countries=sim.calculate("country").values, codes=const_2024.code, @@ -144,10 +119,8 @@ def create_constituency_target_matrix( def create_country_mask( household_countries: np.ndarray, codes: pd.Series ) -> np.ndarray: - # Create a matrix R to accompany the loss matrix M s.t. (W x M) x R = Y_ - # where Y_ is the target matrix for the country where no target is constructed from weights from a different country. - - constituency_countries = codes.apply(lambda code: code[0]).map( + """Country mask: R[i,j] = 1 iff household j is in same country as area i.""" + area_countries = codes.apply(lambda code: code[0]).map( { "E": "ENGLAND", "W": "WALES", @@ -155,10 +128,7 @@ def create_country_mask( "N": "NORTHERN_IRELAND", } ) - r = np.zeros((len(codes), len(household_countries))) - for i in range(len(codes)): - r[i] = household_countries == constituency_countries[i] - + r[i] = household_countries == area_countries.iloc[i] return r diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py index f6a8d7dc..588f2955 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py @@ -2,7 +2,9 @@ from policyengine_uk_data.utils.calibrate import calibrate_local_areas from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, - create_national_target_matrix, +) +from policyengine_uk_data.targets.build_loss_matrix import ( + create_target_matrix as create_national_target_matrix, ) from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index 
26e58a6e..177b2883 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -1,60 +1,44 @@ +"""Local authority calibration target matrix. + +Constructs the (matrix, y, country_mask) triple for calibrating +household weights across 360 local authorities. Target data is +loaded from source modules in the targets system. + +Sources: +- Age: ONS mid-year population estimates +- Income: HMRC SPI table 3.15 +- UC: DWP Stat-Xplore +- ONS income: ONS small area income estimates +- Tenure: English Housing Survey +- Private rent: VOA/ONS private rental market statistics +""" + from policyengine_uk import Microsimulation import pandas as pd import numpy as np -from pathlib import Path -from policyengine_uk_data.utils.loss import ( - create_target_matrix as create_national_target_matrix, -) -from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset -from policyengine_uk_data.utils.uc_data import uc_la_households - -FOLDER = Path(__file__).parent - -# Uprating factors from FYE 2020 to 2025 (OBR Nov 2025 EFO) -# RHDI index: 1985.1 (2025-26) / 1467.6 (2020-21) = 1.352 -UPRATING_NET_INCOME_BHC_2020_TO_2025 = 1985.1 / 1467.6 -# House price index: 103.5 (2025-26) / 84.9 (2020-21) = 1.219 -UPRATING_HOUSING_COSTS_2020_TO_2025 = 103.5 / 84.9 - - -def load_ons_la_income_targets() -> pd.DataFrame: - """Load ONS income estimates by local authority. 
- - Returns a DataFrame with columns: la_code, total_income, net_income_bhc, net_income_ahc - (mean income per household, FYE 2020) - """ - xlsx = pd.ExcelFile(STORAGE_FOLDER / "local_authority_ons_income.xlsx") - - def load_sheet(sheet_name: str, value_col: str) -> pd.DataFrame: - df = pd.read_excel(xlsx, sheet_name=sheet_name, header=3) - df.columns = [ - "msoa_code", - "msoa_name", - "la_code", - "la_name", - "region_code", - "region_name", - value_col, - "upper_ci", - "lower_ci", - "ci_width", - ] - df = df.iloc[1:].dropna(subset=["msoa_code"]) - df[value_col] = pd.to_numeric(df[value_col]) - return df[["la_code", value_col]] - - total = load_sheet("Total annual income", "total_income") - bhc = load_sheet("Net income before housing costs", "net_income_bhc") - ahc = load_sheet("Net income after housing costs", "net_income_ahc") - - # Group by LA to get mean income per household - la_total = total.groupby("la_code")["total_income"].mean().reset_index() - la_bhc = bhc.groupby("la_code")["net_income_bhc"].mean().reset_index() - la_ahc = ahc.groupby("la_code")["net_income_ahc"].mean().reset_index() - - return la_total.merge(la_bhc, on="la_code").merge(la_ahc, on="la_code") +from policyengine_uk_data.storage import STORAGE_FOLDER +from policyengine_uk_data.targets.sources.local_age import ( + get_la_age_targets, + get_uk_total_population, +) +from policyengine_uk_data.targets.sources.local_income import ( + get_la_income_targets, + get_national_income_projections, + INCOME_VARIABLES, +) +from policyengine_uk_data.targets.sources.local_uc import ( + get_la_uc_targets, +) +from policyengine_uk_data.targets.sources.local_la_extras import ( + load_ons_la_income, + load_household_counts, + load_tenure_data, + load_private_rents, + UPRATING_NET_INCOME_BHC_2020_TO_2025, + UPRATING_HOUSING_COSTS_2020_TO_2025, +) def create_local_authority_target_matrix( @@ -64,8 +48,7 @@ def create_local_authority_target_matrix( ): if time_period is None: time_period = dataset.time_period - 
ages = pd.read_csv(FOLDER / "targets" / "age.csv") - incomes = pd.read_csv(FOLDER / "targets" / "spi_by_la.csv") + la_codes = pd.read_csv(STORAGE_FOLDER / "local_authorities_2021.csv") sim = Microsimulation(dataset=dataset, reform=reform) @@ -75,16 +58,11 @@ def create_local_authority_target_matrix( matrix = pd.DataFrame() y = pd.DataFrame() - INCOME_VARIABLES = [ - "self_employment_income", - "employment_income", - ] - - national_incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv") - national_incomes = national_incomes[ - national_incomes.year - == max(national_incomes.year.min(), int(dataset.time_period)) - ] + # ── Income targets ───────────────────────────────────────────── + incomes = get_la_income_targets() + national_incomes = get_national_income_projections( + int(dataset.time_period) + ) for income_variable in INCOME_VARIABLES: income_values = sim.calculate(income_variable).values @@ -98,107 +76,66 @@ def create_local_authority_target_matrix( (national_incomes.total_income_lower_bound == 12_570) & (national_incomes.total_income_upper_bound == np.inf) ][income_variable + "_amount"].iloc[0] - national_consistency_adjustment_factor = ( - national_target / local_target_sum - ) - y[f"hmrc/{income_variable}/amount"] = ( - local_targets * national_consistency_adjustment_factor - ) + adjustment = national_target / local_target_sum + y[f"hmrc/{income_variable}/amount"] = local_targets * adjustment + matrix[f"hmrc/{income_variable}/count"] = sim.map_result( (income_values != 0) * in_spi_frame, "person", "household" ) - local_targets = incomes[f"{income_variable}_count"].values - local_target_sum = local_targets.sum() - national_target = national_incomes[ - (national_incomes.total_income_lower_bound == 12_570) - & (national_incomes.total_income_upper_bound == np.inf) - ][income_variable + "_count"].iloc[0] y[f"hmrc/{income_variable}/count"] = ( - incomes[f"{income_variable}_count"].values - * national_consistency_adjustment_factor + 
incomes[f"{income_variable}_count"].values * adjustment ) - age = sim.calculate("age").values - national_demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") - uk_total_population = ( - national_demographics[national_demographics.name == "uk_population"][ - str(time_period) - ].values[0] - * 1e6 - ) + # ── Age targets ──────────────────────────────────────────────── + age_targets = get_la_age_targets() + uk_total_population = get_uk_total_population(int(time_period)) age = sim.calculate("age").values targets_total_pop = 0 - for lower_age in range(0, 80, 10): - upper_age = lower_age + 10 - - in_age_band = (age >= lower_age) & (age < upper_age) - - age_str = f"{lower_age}_{upper_age}" - matrix[f"age/{age_str}"] = sim.map_result( - in_age_band, "person", "household" - ) - - age_count = ages[ - [str(age) for age in range(lower_age, upper_age)] - ].sum(axis=1) - - age_str = f"{lower_age}_{upper_age}" - y[f"age/{age_str}"] = age_count.values - targets_total_pop += age_count.values.sum() - - # Adjust for consistency - for lower_age in range(0, 80, 10): - upper_age = lower_age + 10 - - in_age_band = (age >= lower_age) & (age < upper_age) - - age_str = f"{lower_age}_{upper_age}" - y[f"age/{age_str}"] *= uk_total_population / targets_total_pop * 0.9 - - # UC household count by local authority - y["uc_households"] = uc_la_households.household_count.values + age_cols = [c for c in age_targets.columns if c.startswith("age/")] + for col in age_cols: + lower, upper = col.removeprefix("age/").split("_") + lower, upper = int(lower), int(upper) + in_band = (age >= lower) & (age < upper) + matrix[col] = sim.map_result(in_band, "person", "household") + y[col] = age_targets[col].values + targets_total_pop += age_targets[col].values.sum() + + for col in age_cols: + y[col] *= uk_total_population / targets_total_pop * 0.9 + + # ── UC targets ───────────────────────────────────────────────── + y["uc_households"] = get_la_uc_targets().values matrix["uc_households"] = 
sim.map_result( (sim.calculate("universal_credit").values > 0).astype(int), "benunit", "household", ) - # ONS income targets by local authority - # ONS definitions: - # total_income (ONS) = household_market_income + household_benefits (PE) - # net_income_bhc (ONS) = hbai_household_net_income (PE) - # net_income_ahc (ONS) = hbai_household_net_income_ahc (PE) - ons_income = load_ons_la_income_targets() - households_by_la = pd.read_excel( - STORAGE_FOLDER / "la_count_households.xlsx", sheet_name="Dataset" - ) - households_by_la.columns = ["la_code", "la_name", "households"] + # ── ONS income targets ───────────────────────────────────────── + ons_income = load_ons_la_income() + households_by_la = load_household_counts() - # Merge ONS income with our LA codes to get targets aligned ons_merged = la_codes.merge( ons_income, left_on="code", right_on="la_code", how="left" ).merge( - households_by_la[["la_code", "households"]], + households_by_la, left_on="code", right_on="la_code", how="left", suffixes=("", "_hh"), ) - # Calculate PE household income variables hbai_net_income = sim.calculate("equiv_hbai_household_net_income").values hbai_net_income_ahc = sim.calculate( "equiv_hbai_household_net_income_ahc" ).values housing_costs = hbai_net_income - hbai_net_income_ahc - # Add to matrix (household-level values, will be summed with weights) matrix["ons/equiv_net_income_bhc"] = hbai_net_income matrix["ons/equiv_net_income_ahc"] = hbai_net_income_ahc matrix["ons/equiv_housing_costs"] = housing_costs - # Calculate LA-level targets: mean income * households, uprated to 2025 ons_merged["equiv_net_income_bhc_target"] = ( ons_merged["net_income_bhc"] * ons_merged["households"] @@ -213,205 +150,93 @@ def create_local_authority_target_matrix( - ons_merged["equiv_housing_costs_target"] ) - country_mask = create_country_mask( - household_countries=sim.calculate("country").values, - codes=la_codes.code, - ) - - # For LAs without ONS data (or without household counts), use national - # 
average scaled by LA household count has_ons_data = ( - ons_merged["net_income_bhc"].notna() & ons_merged["households"].notna() + ons_merged["net_income_bhc"].notna() + & ons_merged["households"].notna() ).values - # For LAs without household data, use equal share (1/360) as fallback total_households = ons_merged["households"].sum() - equal_share = 1 / len(la_codes) la_household_share = np.where( ons_merged["households"].notna(), ons_merged["households"].values / total_households, - equal_share, + 1 / len(la_codes), ) - # National totals (weighted sum across all households) - national_net_income_bhc = (original_weights * hbai_net_income).sum() - national_net_income_ahc = (original_weights * hbai_net_income_ahc).sum() - national_housing_costs = (original_weights * housing_costs).sum() - - # Default targets = national total * LA's share of households - default_net_income_bhc = national_net_income_bhc * la_household_share - default_net_income_ahc = national_net_income_ahc * la_household_share - default_housing_costs = national_housing_costs * la_household_share + national_bhc = (original_weights * hbai_net_income).sum() + national_ahc = (original_weights * hbai_net_income_ahc).sum() + national_hc = (original_weights * housing_costs).sum() y["ons/equiv_net_income_bhc"] = np.where( has_ons_data, ons_merged["equiv_net_income_bhc_target"].values, - default_net_income_bhc, + national_bhc * la_household_share, ) y["ons/equiv_net_income_ahc"] = np.where( has_ons_data, ons_merged["equiv_net_income_ahc_target"].values, - default_net_income_ahc, + national_ahc * la_household_share, ) y["ons/equiv_housing_costs"] = np.where( has_ons_data, ons_merged["equiv_housing_costs_target"].values, - default_housing_costs, + national_hc * la_household_share, ) - # Tenure type targets by local authority - tenure_data = pd.read_excel( - STORAGE_FOLDER / "la_tenure.xlsx", sheet_name="data download" - ) - tenure_data.columns = [ - "region_code", - "region_name", - "la_code", - "la_name", - 
"owned_outright_pct", - "owned_mortgage_pct", - "private_rent_pct", - "social_rent_pct", - ] - - # Merge with LA codes and households + # ── Tenure targets ───────────────────────────────────────────── + tenure_data = load_tenure_data() + tenure_merged = la_codes.merge( - tenure_data[ - [ - "la_code", - "owned_outright_pct", - "owned_mortgage_pct", - "private_rent_pct", - "social_rent_pct", - ] - ], - left_on="code", - right_on="la_code", - how="left", + tenure_data, left_on="code", right_on="la_code", how="left" ).merge( - households_by_la[["la_code", "households"]], + households_by_la, left_on="code", right_on="la_code", how="left", suffixes=("", "_hh"), ) - # Calculate household counts by tenure type tenure_type = sim.calculate("tenure_type").values - - # Matrix columns for tenure (1 if household has that tenure type) - matrix["tenure/owned_outright"] = (tenure_type == "OWNED_OUTRIGHT").astype( - float - ) + matrix["tenure/owned_outright"] = ( + tenure_type == "OWNED_OUTRIGHT" + ).astype(float) matrix["tenure/owned_mortgage"] = ( tenure_type == "OWNED_WITH_MORTGAGE" ).astype(float) - matrix["tenure/private_rent"] = (tenure_type == "RENT_PRIVATELY").astype( - float - ) + matrix["tenure/private_rent"] = ( + tenure_type == "RENT_PRIVATELY" + ).astype(float) matrix["tenure/social_rent"] = ( - (tenure_type == "RENT_FROM_COUNCIL") | (tenure_type == "RENT_FROM_HA") + (tenure_type == "RENT_FROM_COUNCIL") + | (tenure_type == "RENT_FROM_HA") ).astype(float) - # Calculate targets: percentage * households - tenure_merged["owned_outright_target"] = ( - tenure_merged["owned_outright_pct"] / 100 * tenure_merged["households"] - ) - tenure_merged["owned_mortgage_target"] = ( - tenure_merged["owned_mortgage_pct"] / 100 * tenure_merged["households"] - ) - tenure_merged["private_rent_target"] = ( - tenure_merged["private_rent_pct"] / 100 * tenure_merged["households"] - ) - tenure_merged["social_rent_target"] = ( - tenure_merged["social_rent_pct"] / 100 * tenure_merged["households"] 
- ) - - # For LAs without tenure data (or without household counts), use national - # average scaled by LA household count - has_tenure_data = ( + has_tenure = ( tenure_merged["owned_outright_pct"].notna() & tenure_merged["households"].notna() ).values - # National totals for each tenure type - national_owned_outright = ( - original_weights * matrix["tenure/owned_outright"].values - ).sum() - national_owned_mortgage = ( - original_weights * matrix["tenure/owned_mortgage"].values - ).sum() - national_private_rent = ( - original_weights * matrix["tenure/private_rent"].values - ).sum() - national_social_rent = ( - original_weights * matrix["tenure/social_rent"].values - ).sum() - - # Default targets = national total * LA's share of households - default_owned_outright = national_owned_outright * la_household_share - default_owned_mortgage = national_owned_mortgage * la_household_share - default_private_rent = national_private_rent * la_household_share - default_social_rent = national_social_rent * la_household_share - - y["tenure/owned_outright"] = np.where( - has_tenure_data, - tenure_merged["owned_outright_target"].values, - default_owned_outright, - ) - y["tenure/owned_mortgage"] = np.where( - has_tenure_data, - tenure_merged["owned_mortgage_target"].values, - default_owned_mortgage, - ) - y["tenure/private_rent"] = np.where( - has_tenure_data, - tenure_merged["private_rent_target"].values, - default_private_rent, - ) - y["tenure/social_rent"] = np.where( - has_tenure_data, - tenure_merged["social_rent_target"].values, - default_social_rent, - ) + for tenure_key, pct_col in [ + ("owned_outright", "owned_outright_pct"), + ("owned_mortgage", "owned_mortgage_pct"), + ("private_rent", "private_rent_pct"), + ("social_rent", "social_rent_pct"), + ]: + targets = ( + tenure_merged[pct_col] / 100 * tenure_merged["households"] + ) + national = ( + original_weights * matrix[f"tenure/{tenure_key}"].values + ).sum() + y[f"tenure/{tenure_key}"] = np.where( + has_tenure, 
targets.values, national * la_household_share + ) - # Private rent amounts by local authority - rent_data = pd.read_excel( - STORAGE_FOLDER / "la_private_rents_median.xlsx", - sheet_name="Figure 3", - header=5, - ) - rent_data.columns = [ - "col0", - "la_code_old", - "area_code", - "area_name", - "room", - "studio", - "one_bed", - "two_bed", - "three_bed", - "four_plus", - "median_monthly_rent", - ] - # Filter to LA rows and convert to numeric - rent_data = rent_data[ - rent_data["area_code"].astype(str).str.match(r"^E0[6789]") - ] - rent_data["median_monthly_rent"] = pd.to_numeric( - rent_data["median_monthly_rent"], errors="coerce" - ) - # Convert to annual rent - rent_data["median_annual_rent"] = rent_data["median_monthly_rent"] * 12 + # ── Private rent amounts ─────────────────────────────────────── + rent_data = load_private_rents() - # Add rent data to tenure_merged (which already has tenure pcts and households) tenure_merged = tenure_merged.merge( - rent_data[["area_code", "median_annual_rent"]], - left_on="code", - right_on="area_code", - how="left", + rent_data, left_on="code", right_on="area_code", how="left" ) - # Calculate private rent variable for matrix (rent for private renters, 0 otherwise) is_private_renter = (tenure_type == "RENT_PRIVATELY").astype(float) benunit_rent = sim.calculate("benunit_rent").values household_rent = sim.map_result(benunit_rent, "benunit", "household") @@ -419,29 +244,29 @@ def create_local_authority_target_matrix( matrix["rent/private_rent"] = private_rent_amount - # Target = median rent (assumed = mean) * number of private renting households - # Number of private renters = households * private_rent_pct (from tenure data) tenure_merged["private_rent_target"] = ( tenure_merged["median_annual_rent"] - * tenure_merged["private_rent_pct"] - / 100 + * tenure_merged["private_rent_pct"] / 100 * tenure_merged["households"] ) - # For LAs without rent data (need rent, tenure, and household data), use - # national average scaled by LA 
household share - has_rent_data = ( + has_rent = ( tenure_merged["median_annual_rent"].notna() & tenure_merged["private_rent_pct"].notna() & tenure_merged["households"].notna() ).values - national_private_rent = (original_weights * private_rent_amount).sum() - default_private_rent_amount = national_private_rent * la_household_share + national_rent = (original_weights * private_rent_amount).sum() y["rent/private_rent"] = np.where( - has_rent_data, + has_rent, tenure_merged["private_rent_target"].values, - default_private_rent_amount, + national_rent * la_household_share, + ) + + # ── Country mask ─────────────────────────────────────────────── + country_mask = create_country_mask( + household_countries=sim.calculate("country").values, + codes=la_codes.code, ) return matrix, y, country_mask @@ -450,10 +275,8 @@ def create_local_authority_target_matrix( def create_country_mask( household_countries: np.ndarray, codes: pd.Series ) -> np.ndarray: - # Create a matrix R to accompany the loss matrix M s.t. (W x M) x R = Y_ - # where Y_ is the target matrix for the country where no target is constructed from weights from a different country. 
- - constituency_countries = codes.apply(lambda code: code[0]).map( + """Country mask: R[i,j] = 1 iff household j is in same country as area i.""" + area_countries = codes.apply(lambda code: code[0]).map( { "E": "ENGLAND", "W": "WALES", @@ -461,10 +284,7 @@ def create_country_mask( "N": "NORTHERN_IRELAND", } ) - r = np.zeros((len(codes), len(household_countries))) - for i in range(len(codes)): - r[i] = household_countries == constituency_countries[i] - + r[i] = household_countries == area_countries.iloc[i] return r diff --git a/policyengine_uk_data/targets/__init__.py b/policyengine_uk_data/targets/__init__.py new file mode 100644 index 00000000..1b50479a --- /dev/null +++ b/policyengine_uk_data/targets/__init__.py @@ -0,0 +1,15 @@ +"""Targets system: structured, source-traceable calibration targets.""" + +from policyengine_uk_data.targets.registry import get_all_targets +from policyengine_uk_data.targets.schema import ( + GeographicLevel, + Target, + Unit, +) + +__all__ = [ + "get_all_targets", + "GeographicLevel", + "Target", + "Unit", +] diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py new file mode 100644 index 00000000..6b366594 --- /dev/null +++ b/policyengine_uk_data/targets/build_loss_matrix.py @@ -0,0 +1,880 @@ +"""Build calibration loss matrices from the targets registry. + +Bridges the targets system to the calibration pipeline by converting +each Target into a household-level column vector and a scalar target +value, producing the (matrix, targets) pair that the weight optimiser +expects. + +For most targets the column is a straightforward simulation query +(sum a variable, count recipients, filter by region/age/income band). +For targets requiring custom logic (counterfactuals, cross-tabs), the +Target's custom_compute callable is invoked instead. 
+""" + +import logging + +import numpy as np +import pandas as pd + +from policyengine_uk_data.targets import get_all_targets +from policyengine_uk_data.targets.schema import GeographicLevel, Target, Unit + +logger = logging.getLogger(__name__) + + +def create_target_matrix( + dataset, + time_period: str = None, + reform=None, +) -> tuple[pd.DataFrame, pd.Series]: + """Create (matrix, target_values) for calibration. + + Args: + dataset: a UKSingleYearDataset instance. + time_period: calendar year as string; defaults to dataset year. + reform: optional PolicyEngine reform. + + Returns: + (df, targets) where df has one column per target and one row + per household, and targets is a Series of scalar target values + indexed by target name. + """ + from policyengine_uk import Microsimulation + + if time_period is None: + time_period = dataset.time_period + + year = int(time_period) + sim = Microsimulation(dataset=dataset, reform=reform) + sim.default_calculation_period = time_period + + # Helper closures for the simulation + ctx = _SimContext(sim, time_period, dataset, reform) + + # Fetch all targets (no year filter — we resolve values below) + all_targets = [] + seen = set() + for level in ( + GeographicLevel.NATIONAL, + GeographicLevel.REGION, + GeographicLevel.COUNTRY, + ): + for t in get_all_targets(geographic_level=level): + if t.name not in seen: + seen.add(t.name) + all_targets.append(t) + + df = pd.DataFrame() + target_names = [] + target_values = [] + + for target in all_targets: + try: + val = _resolve_value(target, year) + if val is None: + continue + col = _compute_column(target, ctx, year) + if col is None: + continue + df[target.name] = col + target_names.append(target.name) + target_values.append(val) + except Exception as e: + logger.warning("Skipping target %s: %s", target.name, e) + + return df, pd.Series(target_values, index=target_names) + + +def _resolve_value(target: Target, year: int) -> float | None: + """Get the target value for a year, falling 
back to nearest year.""" + if year in target.values: + return target.values[year] + # Use nearest available year + available = sorted(target.values.keys()) + if not available: + return None + closest = min(available, key=lambda y: abs(y - year)) + # Only allow ±3 years of extrapolation + if abs(closest - year) > 3: + return None + return target.values[closest] + + +class _SimContext: + """Holds the simulation and lazily computed intermediate arrays.""" + + def __init__(self, sim, time_period, dataset, reform): + self.sim = sim + self.time_period = time_period + self.dataset = dataset + self.reform = reform + self._cache = {} + + def pe(self, variable: str): + """Calculate variable mapped to household level.""" + key = ("pe", variable) + if key not in self._cache: + self._cache[key] = self.sim.calculate( + variable, map_to="household" + ).values + return self._cache[key] + + def pe_person(self, variable: str): + """Calculate variable at person level.""" + key = ("pe_person", variable) + if key not in self._cache: + self._cache[key] = self.sim.calculate(variable).values + return self._cache[key] + + def pe_count(self, *variables): + """Count people with variable > 0, mapped to household.""" + total = 0 + for variable in variables: + entity = self.sim.tax_benefit_system.variables[variable].entity.key + total += self.sim.map_result( + self.sim.calculate(variable) > 0, + entity, + "household", + ) + return total + + def household_from_person(self, values): + return self.sim.map_result(values, "person", "household") + + def household_from_family(self, values): + return self.sim.map_result(values, "benunit", "household") + + @property + def region(self): + if "region" not in self._cache: + self._cache["region"] = self.sim.calculate( + "region", map_to="person" + ) + return self._cache["region"] + + @property + def household_region(self): + if "household_region" not in self._cache: + self._cache["household_region"] = self.sim.calculate( + "region", map_to="household" + 
).values + return self._cache["household_region"] + + @property + def age(self): + if "age" not in self._cache: + self._cache["age"] = self.sim.calculate("age").values + return self._cache["age"] + + @property + def country(self): + if "country" not in self._cache: + self._cache["country"] = self.sim.calculate("country").values + return self._cache["country"] + + @property + def counterfactual_sim(self): + """Lazily create the salary sacrifice counterfactual simulation.""" + if "counterfactual_sim" not in self._cache: + from policyengine_uk import Microsimulation + + ss = self.sim.calculate( + "pension_contributions_via_salary_sacrifice" + ) + emp = self.sim.calculate("employment_income") + cf_sim = Microsimulation( + dataset=self.dataset, reform=self.reform + ) + cf_sim.set_input( + "pension_contributions_via_salary_sacrifice", + self.time_period, + np.zeros_like(ss), + ) + cf_sim.set_input( + "employment_income", + self.time_period, + emp + ss, + ) + self._cache["counterfactual_sim"] = cf_sim + return self._cache["counterfactual_sim"] + + +# ── Region name mapping ────────────────────────────────────────────── + +_REGION_MAP = { + "NORTH_EAST": "north_east", + "SOUTH_EAST": "south_east", + "EAST_MIDLANDS": "east_midlands", + "WEST_MIDLANDS": "west_midlands", + "YORKSHIRE": "yorkshire_and_the_humber", + "EAST_OF_ENGLAND": "east", + "LONDON": "london", + "SOUTH_WEST": "south_west", + "NORTH_WEST": "north_west", + "WALES": "wales", + "SCOTLAND": "scotland", + "NORTHERN_IRELAND": "northern_ireland", +} + +_REGION_INV = {v: k for k, v in _REGION_MAP.items()} + + +# ── Column computation dispatch ────────────────────────────────────── + +def _compute_column( + target: Target, ctx: _SimContext, year: int +) -> np.ndarray | None: + """Compute the household-level column for a target. + + Returns None if the target can't be computed (e.g. missing + custom_compute for a complex target). 
+ """ + # If the target has a custom compute function, use it + if target.custom_compute is not None: + return target.custom_compute(ctx, target, year) + + # Dispatch by target name patterns and metadata + name = target.name + + # ── Regional age bands ──────────────────────────────────────── + # Names like "ons/north_east_age_0_9" + if name.startswith("ons/") and "_age_" in name: + return _compute_regional_age(target, ctx) + + # ── Gender × age bands ──────────────────────────────────────── + # Names like "ons/female_0_14" + if name.startswith("ons/") and ( + name.startswith("ons/female_") or name.startswith("ons/male_") + ): + return _compute_gender_age(target, ctx) + + # ── UK total population ─────────────────────────────────────── + if name == "ons/uk_population": + return ctx.household_from_person(ctx.age >= 0) + + # ── Scotland-specific demographics ──────────────────────────── + if name == "ons/scotland_children_under_16": + return ctx.household_from_person( + (ctx.region.values == "SCOTLAND") & (ctx.age < 16) + ) + if name == "ons/scotland_babies_under_1": + return ctx.household_from_person( + (ctx.region.values == "SCOTLAND") & (ctx.age < 1) + ) + if name == "ons/scotland_households_3plus_children": + is_child = ctx.pe_person("is_child") + children_per_hh = ctx.household_from_person(is_child) + return ( + (ctx.household_region == "SCOTLAND") & (children_per_hh >= 3) + ).astype(float) + + # ── Household type targets ──────────────────────────────────── + if target.variable == "family_type" and target.is_count: + return _compute_household_type(target, ctx) + + # ── Tenure targets ──────────────────────────────────────────── + if target.variable == "tenure_type" and target.is_count: + return _compute_tenure(target, ctx) + + # ── Income band breakdowns (HMRC SPI) ───────────────────────── + if target.breakdown_variable == "total_income": + return _compute_income_band(target, ctx) + + # ── Council tax bands by region (VOA) ───────────────────────── + if 
name.startswith("voa/council_tax/"): + return _compute_council_tax_band(target, ctx) + + # ── Vehicle ownership (NTS) ─────────────────────────────────── + if name == "nts/households_no_vehicle": + return (ctx.pe("num_vehicles") == 0).astype(float) + if name == "nts/households_one_vehicle": + return (ctx.pe("num_vehicles") == 1).astype(float) + if name == "nts/households_two_plus_vehicles": + return (ctx.pe("num_vehicles") >= 2).astype(float) + + # ── Housing targets ─────────────────────────────────────────── + if name == "housing/total_mortgage": + return ( + ctx.pe("mortgage_capital_repayment") + + ctx.pe("mortgage_interest_repayment") + ) + if name == "housing/rent_private": + tenure = ctx.sim.calculate("tenure_type", map_to="household").values + return ctx.pe("rent") * (tenure == "RENT_PRIVATELY") + + # ── Savings interest (ONS) ──────────────────────────────────── + if name == "ons/savings_interest_income": + savings = ctx.sim.calculate("savings_interest_income") + return ctx.household_from_person(savings) + + # ── Scottish child payment ──────────────────────────────────── + if name == "sss/scottish_child_payment": + scp = ctx.sim.calculate("scottish_child_payment") + return ctx.household_from_person(scp) + + # ── DWP PIP claimant splits ─────────────────────────────────── + if name == "dwp/pip_dl_standard_claimants": + pip_dl = ctx.sim.calculate("pip_dl_category") + return ctx.sim.map_result( + pip_dl == "STANDARD", "person", "household" + ) + if name == "dwp/pip_dl_enhanced_claimants": + pip_dl = ctx.sim.calculate("pip_dl_category") + return ctx.sim.map_result( + pip_dl == "ENHANCED", "person", "household" + ) + + # ── DWP benefit cap ─────────────────────────────────────────── + if name == "dwp/benefit_capped_households": + reduction = ctx.sim.calculate( + "benefit_cap_reduction", map_to="household" + ).values + return (reduction > 0).astype(float) + if name == "dwp/benefit_cap_total_reduction": + return ctx.sim.calculate( + "benefit_cap_reduction", 
map_to="household" + ).values.astype(float) + + # ── DWP Scotland UC + child under 1 ────────────────────────── + if name == "dwp/scotland_uc_households_child_under_1": + uc = ctx.sim.calculate("universal_credit") + on_uc = ctx.household_from_family(uc > 0) > 0 + child_u1 = ctx.pe_person("is_child") & (ctx.age < 1) + has_child_u1 = ctx.household_from_person(child_u1) > 0 + return ( + (ctx.household_region == "SCOTLAND") & on_uc & has_child_u1 + ).astype(float) + + # ── UC claimants by number of children ───────────────────────── + if name.startswith("dwp/uc/claimants_with_") and "_children" in name: + return _compute_uc_by_children(target, ctx) + + # ── UC claimants by family type ────────────────────────────── + if name.startswith("dwp/uc/claimants_") and not name.startswith( + "dwp/uc/claimants_with_" + ): + return _compute_uc_by_family_type(target, ctx) + + # ── UC payment distribution ─────────────────────────────────── + if name.startswith("dwp/uc_payment_dist/"): + return _compute_uc_payment_dist(target, ctx) + + # ── Salary sacrifice IT relief by tax band ──────────────────── + if name.startswith("hmrc/salary_sacrifice_it_relief_"): + return _compute_ss_it_relief(target, ctx) + + # ── Salary sacrifice NI relief ──────────────────────────────── + if name in ( + "hmrc/salary_sacrifice_employee_nics_relief", + "obr/salary_sacrifice_employee_ni_relief", + ): + ni_base = ctx.sim.calculate("ni_employee") + ni_cf = ctx.counterfactual_sim.calculate( + "ni_employee", ctx.time_period + ) + return ctx.household_from_person(ni_cf - ni_base) + if name in ( + "hmrc/salary_sacrifice_employer_nics_relief", + "obr/salary_sacrifice_employer_ni_relief", + ): + ni_base = ctx.sim.calculate("ni_employer") + ni_cf = ctx.counterfactual_sim.calculate( + "ni_employer", ctx.time_period + ) + return ctx.household_from_person(ni_cf - ni_base) + + # ── UC jobseeker / non-jobseeker splits ─────────────────────── + if name in ( + "obr/universal_credit_jobseekers", + 
"obr/universal_credit_non_jobseekers", + "obr/universal_credit_jobseekers_count", + "obr/universal_credit_non_jobseekers_count", + ): + return _compute_uc_jobseeker(target, ctx) + + # ── OBR UC outside benefit cap ──────────────────────────────── + if name == "obr/universal_credit_outside_cap": + uc = ctx.sim.calculate("universal_credit") + uc_hh = ctx.household_from_family(uc) + cap_reduction = ctx.sim.calculate( + "benefit_cap_reduction", map_to="household" + ).values + not_capped = cap_reduction == 0 + return uc_hh * not_capped + + # ── Two-child limit targets ─────────────────────────────────── + if "two_child_limit" in name: + return _compute_two_child_limit(target, ctx) + + # ── OBR council tax by country ──────────────────────────────── + if name.startswith("obr/council_tax"): + return _compute_obr_council_tax(target, ctx) + + # ── Simple GBP sum targets ──────────────────────────────────── + if target.unit == Unit.GBP and not target.is_count: + return _compute_simple_gbp(target, ctx) + + # ── Simple count targets ────────────────────────────────────── + if target.is_count and target.unit == Unit.COUNT: + return _compute_simple_count(target, ctx) + + logger.debug("No compute logic for target %s", name) + return None + + +# ── Compute implementations ────────────────────────────────────────── + +def _compute_simple_gbp(target: Target, ctx: _SimContext) -> np.ndarray: + """Sum a variable at household level.""" + variable = target.variable + try: + entity = ctx.sim.tax_benefit_system.variables[variable].entity.key + except KeyError: + return None + if entity == "household": + return ctx.pe(variable) + elif entity == "person": + return ctx.household_from_person(ctx.sim.calculate(variable)) + elif entity == "benunit": + return ctx.household_from_family(ctx.sim.calculate(variable)) + return None + + +def _compute_simple_count(target: Target, ctx: _SimContext) -> np.ndarray: + """Count recipients of a variable, mapped to household.""" + return 
ctx.pe_count(target.variable) + + +def _compute_regional_age( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute person count in a region × age band.""" + # Parse "ons/{region_name}_age_{lower}_{upper}" from the name + name = target.name.removeprefix("ons/") + # Find the _age_ part + idx = name.index("_age_") + region_name = name[:idx] + age_part = name[idx + 5:] # e.g. "0_9" + lower, upper = age_part.split("_") + lower, upper = int(lower), int(upper) + + pe_region = _REGION_INV.get(region_name) + if pe_region is None: + return None + + person_match = ( + (ctx.region.values == pe_region) + & (ctx.age >= lower) + & (ctx.age <= upper) + ) + return ctx.household_from_person(person_match) + + +def _compute_gender_age( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute person count in a gender × age band.""" + name = target.name.removeprefix("ons/") + # "female_0_14" or "male_75_90" + parts = name.split("_") + sex = parts[0] + lower = int(parts[1]) + upper = int(parts[2]) + + gender = ctx.sim.calculate("gender").values + sex_match = gender == ("FEMALE" if sex == "female" else "MALE") + age_match = (ctx.age >= lower) & (ctx.age <= upper) + return ctx.household_from_person(sex_match & age_match) + + +def _compute_household_type( + target: Target, ctx: _SimContext +) -> np.ndarray | None: + """Compute household type count from ONS families & households categories. + + Maps ONS household categories to PE family_type enum values and + household composition conditions. family_type is a benunit variable + so we map boolean comparisons to household level. 
+ """ + name = target.name.removeprefix("ons/") + ft = ctx.sim.calculate("family_type").values # benunit level + is_child = ctx.pe_person("is_child") + children_per_hh = ctx.household_from_person(is_child) + age_hh_head = ctx.pe("age") # head of household age + + def ft_hh(value): + """Map family_type == value from benunit to household (any).""" + return ctx.household_from_family(ft == value) > 0 + + if name == "lone_households_under_65": + return ( + ft_hh("SINGLE") + & (children_per_hh == 0) + & (age_hh_head < 65) + ).astype(float) + if name == "lone_households_over_65": + return ( + ft_hh("SINGLE") + & (children_per_hh == 0) + & (age_hh_head >= 65) + ).astype(float) + if name == "unrelated_adult_households": + people_per_hh = ctx.household_from_person( + np.ones_like(is_child) + ) + return ( + ft_hh("SINGLE") + & (children_per_hh == 0) + & (people_per_hh > 1) + ).astype(float) + if name == "couple_no_children_households": + return ft_hh("COUPLE_NO_CHILDREN").astype(float) + if name == "couple_under_3_children_households": + return ( + ft_hh("COUPLE_WITH_CHILDREN") + & (children_per_hh >= 1) + & (children_per_hh <= 2) + ).astype(float) + if name == "couple_3_plus_children_households": + return ( + ft_hh("COUPLE_WITH_CHILDREN") + & (children_per_hh >= 3) + ).astype(float) + if name == "couple_non_dependent_children_only_households": + people_per_hh = ctx.household_from_person( + np.ones_like(is_child) + ) + return ( + ft_hh("COUPLE_NO_CHILDREN") + & (people_per_hh > 2) + ).astype(float) + if name == "lone_parent_dependent_children_households": + return ( + ft_hh("LONE_PARENT") + & (children_per_hh > 0) + ).astype(float) + if name == "lone_parent_non_dependent_children_households": + people_per_hh = ctx.household_from_person( + np.ones_like(is_child) + ) + return ( + ft_hh("SINGLE") + & (children_per_hh == 0) + & (people_per_hh > 1) + & (age_hh_head >= 40) + ).astype(float) + if name == "multi_family_households": + n_benunits = ctx.pe("household_num_benunits") + 
return (n_benunits > 1).astype(float) + + return None + + +def _compute_tenure( + target: Target, ctx: _SimContext +) -> np.ndarray | None: + """Compute dwelling count by tenure type.""" + # Map ONS target name suffixes to PE tenure_type enum values + _TENURE_MAP = { + "tenure_england_owned_outright": "OWNED_OUTRIGHT", + "tenure_england_owned_with_mortgage": "OWNED_WITH_MORTGAGE", + "tenure_england_rented_privately": "RENT_PRIVATELY", + "tenure_england_social_rent": ["RENT_FROM_COUNCIL", "RENT_FROM_HA"], + "tenure_england_total": None, # all tenures + } + suffix = target.name.removeprefix("ons/") + pe_values = _TENURE_MAP.get(suffix) + if pe_values is None and suffix == "tenure_england_total": + # Total dwellings in England + return (ctx.country == "ENGLAND").astype(float) + if pe_values is None: + return None + + tenure = ctx.sim.calculate("tenure_type", map_to="household").values + in_england = ctx.country == "ENGLAND" + if isinstance(pe_values, list): + match = np.zeros_like(tenure, dtype=bool) + for v in pe_values: + match = match | (tenure == v) + else: + match = tenure == pe_values + return (match & in_england).astype(float) + + +def _compute_income_band( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute income variable within a total income band.""" + variable = target.variable + lower = target.lower_bound + upper = target.upper_bound + + income_df = ctx.sim.calculate_dataframe( + ["total_income", variable] + ) + in_band = (income_df.total_income >= lower) & ( + income_df.total_income < upper + ) + + if target.is_count: + return ctx.household_from_person( + (income_df[variable] > 0) * in_band + ) + else: + return ctx.household_from_person( + income_df[variable] * in_band + ) + + +def _compute_council_tax_band( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute council tax band count for a region.""" + # "voa/council_tax/{REGION}/{band}" + parts = target.name.split("/") + region = parts[2] + band = parts[3] + + in_region = 
ctx.sim.calculate("region").values == region + + if band == "total": + return in_region.astype(float) + + in_band = ctx.sim.calculate("council_tax_band") == band + return (in_band * in_region).astype(float) + + +def _compute_obr_council_tax( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute OBR council tax receipts, optionally by country.""" + name = target.name + ct = ctx.pe("council_tax") + + if name == "obr/council_tax": + return ct + if name == "obr/council_tax_england": + return ct * (ctx.country == "ENGLAND") + if name == "obr/council_tax_scotland": + return ct * (ctx.country == "SCOTLAND") + if name == "obr/council_tax_wales": + return ct * (ctx.country == "WALES") + return ct + + +def _compute_uc_jobseeker( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute UC jobseeker / non-jobseeker splits.""" + family = ctx.sim.populations["benunit"] + uc = ctx.sim.calculate("universal_credit") + on_uc = uc > 0 + unemployed = family.any( + ctx.sim.calculate("employment_status") == "UNEMPLOYED" + ) + + if "non_jobseekers" in target.name: + mask = on_uc * ~unemployed + else: + mask = on_uc * unemployed + + if "_count" in target.name: + return ctx.household_from_family(mask) + else: + return ctx.household_from_family(uc * mask) + + +def _compute_uc_payment_dist( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute UC payment distribution band × family type.""" + # Parse from name: "dwp/uc_payment_dist/{family_type}_annual_payment_{lower}_to_{upper}" + name = target.name.removeprefix("dwp/uc_payment_dist/") + # Find the _annual_payment_ separator + idx = name.index("_annual_payment_") + family_type = name[:idx] + payment_part = name[idx + 16:] # e.g. 
"0_to_1_000" + lower = target.lower_bound + upper = target.upper_bound + + uc_payments = ctx.sim.calculate( + "universal_credit", map_to="benunit" + ).values + uc_family_type = ctx.sim.calculate( + "family_type", map_to="benunit" + ).values + + in_band = ( + (uc_payments >= lower) + & (uc_payments < upper) + & (uc_family_type == family_type) + ) + return ctx.household_from_family(in_band) + + +def _compute_ss_it_relief( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute salary sacrifice IT relief by tax band.""" + it_base = ctx.sim.calculate("income_tax") + it_cf = ctx.counterfactual_sim.calculate("income_tax", ctx.time_period) + it_relief = it_cf - it_base + + adj_net_income_cf = ctx.counterfactual_sim.calculate( + "adjusted_net_income", ctx.time_period + ) + + params = ctx.sim.tax_benefit_system.parameters.gov.hmrc.income_tax.rates.uk + basic_thresh = params[0].threshold(ctx.time_period) + higher_thresh = params[1].threshold(ctx.time_period) + additional_thresh = params[2].threshold(ctx.time_period) + + name = target.name + if "basic" in name: + mask = (adj_net_income_cf > basic_thresh) & ( + adj_net_income_cf <= higher_thresh + ) + elif "higher" in name: + mask = (adj_net_income_cf > higher_thresh) & ( + adj_net_income_cf <= additional_thresh + ) + elif "additional" in name: + mask = adj_net_income_cf > additional_thresh + else: + # Total — no mask + mask = np.ones_like(it_relief, dtype=bool) + + return ctx.household_from_person(it_relief * mask) + + +def _compute_two_child_limit( + target: Target, ctx: _SimContext +) -> np.ndarray | None: + """Compute two-child limit targets. + + These involve cross-tabulations of UC eligibility, child count, + disability status, etc. Complex enough to need specific logic + per target name. 
+ """ + name = target.name + sim = ctx.sim + + is_child = sim.calculate("is_child").values + child_is_affected = ( + sim.map_result( + sim.calculate("uc_is_child_limit_affected", map_to="household"), + "household", + "person", + ) + > 0 + ) * is_child + child_in_uc = sim.calculate("universal_credit", map_to="person").values > 0 + children_in_capped = sim.map_result( + child_is_affected * child_in_uc, "person", "household" + ) + capped_hh = (children_in_capped > 0) * 1.0 + + if name == "dwp/uc/two_child_limit/households_affected": + return capped_hh + if name == "dwp/uc/two_child_limit/children_affected": + return children_in_capped + if name == "dwp/uc/two_child_limit/children_in_affected_households": + # Total children (not just affected ones) in capped households + total_children = sim.map_result( + is_child * child_in_uc, "person", "household" + ) + return total_children * capped_hh + + # By number of children: "dwp/uc/two_child_limit/{n}_children_households" + if "_children_households_total_children" in name: + n = int(name.split("/")[-1].split("_")[0]) + children_count = sim.map_result(is_child, "person", "household") + return ( + capped_hh * (children_count == n) * children_count + ).astype(float) + if "_children_households" in name and "total" not in name: + n = int(name.split("/")[-1].split("_")[0]) + children_count = sim.map_result(is_child, "person", "household") + match = n if n < 6 else slice(6, None) + if isinstance(match, int): + return (capped_hh * (children_count == n)).astype(float) + else: + return (capped_hh * (children_count >= 6)).astype(float) + + # Disability cross-tabs + if "adult_pip_households" in name: + pip = sim.calculate("pip", map_to="household").values + return (capped_hh * (pip > 0)).astype(float) + if "adult_pip_children" in name: + pip = sim.calculate("pip", map_to="household").values + return (children_in_capped * (pip > 0)).astype(float) + if "disabled_child_element_households" in name: + dce = sim.calculate( + 
"uc_individual_disabled_child_element", map_to="household" + ).values + return (capped_hh * (dce > 0)).astype(float) + if "disabled_child_element_children" in name: + dce = sim.calculate( + "uc_individual_disabled_child_element", map_to="household" + ).values + return (children_in_capped * (dce > 0)).astype(float) + + return None + + +def _compute_uc_by_children( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute UC claimant households filtered by number of dependent children.""" + # Parse "dwp/uc/claimants_with_{n}_children" + name = target.name + n_str = name.split("claimants_with_")[1].split("_children")[0] + + uc = ctx.sim.calculate("universal_credit") + on_uc = ctx.household_from_family(uc > 0) > 0 + + is_child = ctx.pe_person("is_child") + children_per_hh = ctx.household_from_person(is_child) + + if n_str.endswith("+"): + n = int(n_str[:-1]) + match = children_per_hh >= n + else: + n = int(n_str) + match = children_per_hh == n + + return (on_uc & match).astype(float) + + +def _compute_uc_by_family_type( + target: Target, ctx: _SimContext +) -> np.ndarray: + """Compute UC claimant households filtered by family type.""" + name = target.name + ft_str = name.split("dwp/uc/claimants_")[1] + + uc = ctx.sim.calculate("universal_credit") + on_uc = ctx.household_from_family(uc > 0) > 0 + + ft = ctx.sim.calculate("family_type").values # benunit level + + def ft_hh(value): + return ctx.household_from_family(ft == value) > 0 + + is_child = ctx.pe_person("is_child") + children_per_hh = ctx.household_from_person(is_child) + + if ft_str == "single_no_children": + match = ft_hh("SINGLE") & (children_per_hh == 0) + elif ft_str == "single_with_children": + match = (ft_hh("SINGLE") | ft_hh("LONE_PARENT")) & ( + children_per_hh > 0 + ) + elif ft_str == "couple_no_children": + match = ft_hh("COUPLE_NO_CHILDREN") + elif ft_str == "couple_with_children": + match = ft_hh("COUPLE_WITH_CHILDREN") + else: + return None + + return (on_uc & match).astype(float) diff --git 
a/policyengine_uk_data/targets/registry.py b/policyengine_uk_data/targets/registry.py new file mode 100644 index 00000000..909fd85d --- /dev/null +++ b/policyengine_uk_data/targets/registry.py @@ -0,0 +1,69 @@ +"""Target registry: discovers source modules and collects targets.""" + +import importlib +import pkgutil +from pathlib import Path + +import yaml + +from policyengine_uk_data.targets.schema import ( + GeographicLevel, + Target, +) +from policyengine_uk_data.targets import sources as sources_pkg + + +def load_sources_config() -> dict: + """Load the sources.yaml URL configuration.""" + config_path = Path(__file__).parent / "sources.yaml" + with open(config_path) as f: + return yaml.safe_load(f) + + +def discover_source_modules() -> list: + """Import all modules under targets.sources.""" + modules = [] + package_path = Path(sources_pkg.__file__).parent + for importer, modname, ispkg in pkgutil.iter_modules( + [str(package_path)] + ): + mod = importlib.import_module( + f"policyengine_uk_data.targets.sources.{modname}" + ) + if hasattr(mod, "get_targets"): + modules.append(mod) + return modules + + +def get_all_targets( + year: int | None = None, + geographic_level: GeographicLevel | None = None, +) -> list[Target]: + """Collect targets from all source modules. + + Args: + year: if provided, only return targets that have a value for + this year. + geographic_level: if provided, filter to this geographic level. + + Returns: + De-duplicated list of Target objects. 
+ """ + all_targets: list[Target] = [] + seen_names: set[str] = set() + + for mod in discover_source_modules(): + for target in mod.get_targets(): + if target.name in seen_names: + continue + if year is not None and year not in target.values: + continue + if ( + geographic_level is not None + and target.geographic_level != geographic_level + ): + continue + seen_names.add(target.name) + all_targets.append(target) + + return all_targets diff --git a/policyengine_uk_data/targets/schema.py b/policyengine_uk_data/targets/schema.py new file mode 100644 index 00000000..97b81467 --- /dev/null +++ b/policyengine_uk_data/targets/schema.py @@ -0,0 +1,49 @@ +"""Pydantic schema for calibration targets.""" + +from enum import Enum +from typing import Callable +from pydantic import BaseModel, Field + + +class GeographicLevel(str, Enum): + NATIONAL = "national" + COUNTRY = "country" + REGION = "region" + CONSTITUENCY = "constituency" + LOCAL_AUTHORITY = "local_authority" + + +class Unit(str, Enum): + GBP = "gbp" + COUNT = "count" + RATE = "rate" + + +class Target(BaseModel): + """A single calibration target from an official statistical source. + + Each target represents one number that the microsimulation should + reproduce when household weights are correctly calibrated, e.g. + "total income tax receipts in 2025 = £328.4bn". + """ + + name: str + variable: str + source: str + unit: Unit + geographic_level: GeographicLevel = GeographicLevel.NATIONAL + geo_code: str | None = None + geo_name: str | None = None + values: dict[int, float] + breakdown_variable: str | None = None + lower_bound: float | None = None + upper_bound: float | None = None + is_count: bool = False + reference_url: str | None = None + forecast_vintage: str | None = None + + # For targets needing custom simulation logic (UC splits, + # counterfactuals). Excluded from serialisation. 
+ custom_compute: Callable | None = Field(default=None, exclude=True) + + model_config = {"arbitrary_types_allowed": True} diff --git a/policyengine_uk_data/targets/sources.yaml b/policyengine_uk_data/targets/sources.yaml new file mode 100644 index 00000000..8bb87679 --- /dev/null +++ b/policyengine_uk_data/targets/sources.yaml @@ -0,0 +1,43 @@ +# Official source URLs for calibration targets. +# Update these when new vintages are published. + +obr: + efo_receipts: "https://obr.uk/download/november-2025-economic-and-fiscal-outlook-detailed-forecast-tables-receipts/" + efo_expenditure: "https://obr.uk/download/november-2025-economic-and-fiscal-outlook-detailed-forecast-tables-expenditure/" + vintage: "november_2025" + +hmrc: + spi_collated: "https://assets.publishing.service.gov.uk/media/67cabb37ade26736dbf9ffe5/Collated_Tables_3_1_to_3_17_2223.ods" + spi_geography: "https://assets.publishing.service.gov.uk/media/67cabb7f8c1076c796a45bec/Collated_Tables_3_12_to_3_15a_2223.ods" + income_tax_liabilities: "https://www.gov.uk/government/statistics/income-tax-liabilities-statistics-tax-year-2022-to-2023-to-tax-year-2025-to-2026" + salary_sacrifice_table_6: "https://assets.publishing.service.gov.uk/media/687a294e312ee8a5f0806b6d/Tables_6_1_and_6_2.csv" + +dwp: + stat_xplore_api: "https://stat-xplore.dwp.gov.uk/webapi/rest/v1" + two_child_limit: "https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024" + benefit_cap: "https://www.gov.uk/government/statistics/benefit-cap-number-of-households-capped-to-february-2025" + uc_national_payment_dist: "https://stat-xplore.dwp.gov.uk" + uc_pc_households: "https://stat-xplore.dwp.gov.uk" + uc_la_households: "https://stat-xplore.dwp.gov.uk" + ni_uc_stats: "https://www.communities-ni.gov.uk/publications/universal-credit-statistics" + +ons: + population_projections: 
"https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/datasets/z1zippedpopulationprojectionsdatafilesuk" + savings_interest: "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea" + households: "https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/datasets/familiesandhouseholdsfamiliesandhouseholds" + la_income: "https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales" + +voa: + council_tax: "https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2024" + +nts: + vehicle_ownership: "https://www.gov.uk/government/statistics/national-travel-survey-2024" + +nrs: + population_estimates: "https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/population/population-estimates/mid-year-population-estimates" + +scottish_government: + budget: "https://www.gov.scot/publications/scottish-budget-2026-2027/pages/6/" + +nomis: + earnings_by_constituency: "https://www.nomisweb.co.uk/api/v01/" diff --git a/policyengine_uk_data/targets/sources/__init__.py b/policyengine_uk_data/targets/sources/__init__.py new file mode 100644 index 00000000..e161e156 --- /dev/null +++ b/policyengine_uk_data/targets/sources/__init__.py @@ -0,0 +1,4 @@ +"""Target source modules. + +Each module exposes get_targets() -> list[Target]. +""" diff --git a/policyengine_uk_data/targets/sources/dwp.py b/policyengine_uk_data/targets/sources/dwp.py new file mode 100644 index 00000000..67c23c0a --- /dev/null +++ b/policyengine_uk_data/targets/sources/dwp.py @@ -0,0 +1,265 @@ +"""DWP benefit targets. + +PIP daily living standard/enhanced claimant counts, benefit cap, +UC payment distribution, UC claimant counts by children/family type, +two-child limit breakdowns, and Scotland UC households with child under 1. 
def get_targets() -> list[Target]:
    """Assemble the full set of DWP benefit calibration targets.

    Covers PIP daily living claimant counts, the benefit cap, Scotland
    UC households with a child under 1, UC claimant counts by number of
    children and family type, two-child limit breakdowns, and the UC
    payment distribution.
    """
    druk_ref = (
        "https://www.disabilityrightsuk.org/news/90-pip-standard-daily-"
        "living-component-recipients-would-fail-new-green-paper-test"
    )
    cap_ref = (
        "https://www.gov.uk/government/statistics/benefit-cap-number-of-"
        "households-capped-to-february-2025/benefit-cap-number-of-"
        "households-capped-to-february-2025"
    )
    tcl_ref = (
        "https://www.gov.uk/government/statistics/universal-credit-and-"
        "child-tax-credit-claimants-statistics-related-to-the-policy-to-"
        "provide-support-for-a-maximum-of-2-children-april-2024"
    )
    statx_ref = "https://stat-xplore.dwp.gov.uk/"

    targets: list[Target] = []

    def count_target(name, variable, year, value, ref) -> Target:
        # Shorthand for the common COUNT-unit single-year target shape.
        return Target(
            name=name,
            variable=variable,
            source="dwp",
            unit=Unit.COUNT,
            values={year: value},
            is_count=True,
            reference_url=ref,
        )

    # PIP daily living standard and enhanced claimant counts
    # From Disability Rights UK analysis of DWP data
    targets.append(
        count_target(
            "dwp/pip_dl_standard_claimants",
            "pip_dl_category",
            2025,
            1_283_000,
            druk_ref,
        )
    )
    targets.append(
        count_target(
            "dwp/pip_dl_enhanced_claimants",
            "pip_dl_category",
            2025,
            1_608_000,
            druk_ref,
        )
    )

    # Benefit cap: capped household count and total annual reduction
    targets.append(
        count_target(
            "dwp/benefit_capped_households",
            "benefit_cap_reduction",
            2025,
            115_000,
            cap_ref,
        )
    )
    targets.append(
        Target(
            name="dwp/benefit_cap_total_reduction",
            variable="benefit_cap_reduction",
            source="dwp",
            unit=Unit.GBP,
            # £60/week reduction × 52 weeks × capped households
            values={2025: 60 * 52 * 115_000},
            reference_url=cap_ref,
        )
    )

    # Scotland UC households with child under 1
    targets.append(
        count_target(
            "dwp/scotland_uc_households_child_under_1",
            "universal_credit",
            2025,
            14_000,
            statx_ref,
        )
    )

    # UC claimant counts by number of children
    uc_by_children = {
        "1": 1_222_944,
        "2": 1_058_967,
        "3": 473_500,
        "4": 166_790,
        "5+": 74_050 + 1_860,
    }
    for num_children, count in uc_by_children.items():
        targets.append(
            count_target(
                f"dwp/uc/claimants_with_{num_children}_children",
                "universal_credit",
                2025,
                count,
                statx_ref,
            )
        )

    # UC claimant counts by family type (thousands), scaled by the
    # relative undercount before converting to persons.
    uc_by_family_type = {
        "single_no_children": 2868.011,
        "single_with_children": 2156.879,
        "couple_no_children": 231.368,
        "couple_with_children": 839.379,
    }
    undercount_relative = 1.27921 / sum(uc_by_family_type.values())
    for family_type, count_k in uc_by_family_type.items():
        targets.append(
            count_target(
                f"dwp/uc/claimants_{family_type}",
                "universal_credit",
                2025,
                count_k * (1 + undercount_relative) * 1e3,
                statx_ref,
            )
        )

    # Two-child limit headline statistics (2026 data)
    for name, variable, value in [
        (
            "dwp/uc/two_child_limit/households_affected",
            "uc_is_child_limit_affected",
            453_600,
        ),
        (
            "dwp/uc/two_child_limit/children_in_affected_households",
            "is_child",
            1_613_980,
        ),
        (
            "dwp/uc/two_child_limit/children_affected",
            "uc_is_child_limit_affected",
            580_400,
        ),
    ]:
        targets.append(count_target(name, variable, 2026, value, tcl_ref))

    # Two-child limit by number of children: (n, households, children)
    for num_children, households, children in [
        (3, 283_290, 849_860),
        (4, 115_630, 462_520),
        (5, 36_590, 182_940),
        (6, 18_090, 118_670),
    ]:
        targets.append(
            count_target(
                f"dwp/uc/two_child_limit/{num_children}_children_households",
                "uc_is_child_limit_affected",
                2026,
                households,
                tcl_ref,
            )
        )
        targets.append(
            count_target(
                f"dwp/uc/two_child_limit/{num_children}_children_households_total_children",
                "is_child",
                2026,
                children,
                tcl_ref,
            )
        )

    # Two-child limit by disability
    for name, variable, value in [
        ("dwp/uc/two_child_limit/adult_pip_households", "pip", 62_260),
        ("dwp/uc/two_child_limit/adult_pip_children", "is_child", 225_320),
        (
            "dwp/uc/two_child_limit/disabled_child_element_households",
            "uc_individual_disabled_child_element",
            124_560,
        ),
        (
            "dwp/uc/two_child_limit/disabled_child_element_children",
            "is_child",
            462_660,
        ),
    ]:
        targets.append(count_target(name, variable, 2026, value, tcl_ref))

    # UC national payment distribution from xlsx
    targets.extend(_uc_payment_distribution_targets())

    return targets
def _load_config():
    """Read sources.yaml to obtain the download URL."""
    with open(_SOURCES_YAML) as f:
        return yaml.safe_load(f)


def _to_float(val) -> float:
    """Convert CSV value to float, handling suppressed '[z]' etc."""
    try:
        return float(val)
    except (ValueError, TypeError):
        return 0.0


def get_targets() -> list[Target]:
    """Build salary sacrifice IT and NICs relief targets from Table 6.2.

    Downloads the HMRC CSV, keeps salary-sacrificed contribution rows,
    and emits one GBP target per income tax rate band plus one each for
    employee and employer NICs relief, uprated 3% per year from the
    2023-24 base year. Returns whatever was parsed so far (possibly an
    empty list) if the download or parse fails; the failure is logged.
    """
    config = _load_config()
    ref = config["hmrc"]["salary_sacrifice_table_6"]
    targets: list[Target] = []

    def uprated(base: float) -> dict[int, float]:
        # Compound wage growth from the base year through 2031.
        return {
            y: base * _GROWTH ** max(0, y - _BASE_YEAR)
            for y in range(_BASE_YEAR, 2032)
        }

    try:
        response = requests.get(
            ref, headers=_HEADERS, allow_redirects=True, timeout=30
        )
        response.raise_for_status()
        df = pd.read_csv(io.StringIO(response.content.decode("utf-8-sig")))

        ss = df[df["contribution_type"] == "Salary sacrificed contributions"]
        totals = ss[
            (ss["sector_scheme"] == "Total")
            & (ss["scheme_type"] == "Total")
        ]

        # IT relief by tax band
        it_rows = totals[totals["income_tax_nics"] == "Income Tax"]
        for _, row in it_rows.iterrows():
            relief = _to_float(row["value_of_relief"])
            if relief <= 0:
                continue
            rate_key = row["tax_rate"].lower().replace(" ", "_")
            targets.append(
                Target(
                    name=f"hmrc/salary_sacrifice_it_relief_{rate_key}",
                    variable="income_tax",
                    source="hmrc",
                    unit=Unit.GBP,
                    values=uprated(relief * 1e6),
                    reference_url=ref,
                )
            )

        # NICs relief (employee + employer); only the first row seen
        # for each class is kept.
        nics_rows = totals[totals["income_tax_nics"] == "NICs"]
        for _, row in nics_rows.iterrows():
            relief = _to_float(row["value_of_relief"])
            if relief <= 0:
                continue
            nics_class = str(row["nics_relief_class"]).lower()
            if "employee" in nics_class:
                name = "hmrc/salary_sacrifice_employee_nics_relief"
                variable = "ni_employee"
            elif "employer" in nics_class:
                name = "hmrc/salary_sacrifice_employer_nics_relief"
                variable = "ni_employer"
            else:
                continue

            if name in {t.name for t in targets}:
                continue

            targets.append(
                Target(
                    name=name,
                    variable=variable,
                    source="hmrc",
                    unit=Unit.GBP,
                    values=uprated(relief * 1e6),
                    reference_url=ref,
                )
            )

    except Exception as e:
        logger.error(
            "Failed to download/parse HMRC salary sacrifice CSV: %s", e
        )

    return targets
"""HMRC Survey of Personal Incomes targets.

Downloads and parses the SPI ODS (Tables 3.6 and 3.7) to get income
distributions by total income band and income type for 2022-23.

For future year projections, the microsimulation uprates these base
year distributions forward using PolicyEngine's uprating factors.
That projection logic is in utils/incomes_projection.py and is not
part of the target download — it's a simulation step.

Source: https://www.gov.uk/government/statistics/income-tax-summarised-accounts-statistics
"""

import io
import logging
from functools import lru_cache
from pathlib import Path

import pandas as pd
import requests
import yaml

from policyengine_uk_data.targets.schema import Target, Unit

logger = logging.getLogger(__name__)

_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml"
_STORAGE = Path(__file__).parents[2] / "storage"

_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}

# Income bands in the SPI tables (lower bounds)
_BAND_LOWER = [
    12_570,
    15_000,
    20_000,
    30_000,
    40_000,
    50_000,
    70_000,
    100_000,
    150_000,
    200_000,
    300_000,
    500_000,
    1_000_000,
]
_BAND_UPPER = _BAND_LOWER[1:] + [float("inf")]

# SPI year: the ODS is for tax year 2022-23, mapped to calendar 2023
_SPI_YEAR = 2023


def _load_config():
    """Load the sources.yaml URL configuration."""
    with open(_SOURCES_YAML) as f:
        return yaml.safe_load(f)


@lru_cache(maxsize=1)
def _download_ods(url: str) -> bytes:
    """Download an ODS file (cached for the process lifetime)."""
    r = requests.get(url, headers=_HEADERS, allow_redirects=True, timeout=60)
    r.raise_for_status()
    return r.content


def _parse_table_36(ods_bytes: bytes) -> pd.DataFrame:
    """Parse Table 3.6: employment, self-employment, pensions by band.

    Args:
        ods_bytes: raw bytes of the HMRC SPI collated-tables ODS file.

    Returns:
        DataFrame with columns: lower_bound,
        self_employment_income_count/amount, employment_income_count/amount,
        state_pension_count/amount, private_pension_income_count/amount.
    """
    df = pd.read_excel(
        io.BytesIO(ods_bytes),
        sheet_name="Table_3_6",
        engine="odf",
        header=None,
    )
    # Data rows start at row 5 and end at the first non-numeric first
    # column (e.g. the "All ranges" total row). NaN must be rejected
    # explicitly: it IS a float, and int(nan) would raise ValueError,
    # aborting the whole parse via the caller's except clause. Bools
    # are excluded too since bool is a subclass of int.
    data_rows = []
    for i in range(5, len(df)):
        lower = df.iloc[i, 0]
        if (
            not isinstance(lower, (int, float))
            or isinstance(lower, bool)
            or pd.isna(lower)
        ):
            break
        data_rows.append(
            {
                "lower_bound": int(lower),
                "self_employment_income_count": _to_float(df.iloc[i, 1]),
                "self_employment_income_amount": _to_float(df.iloc[i, 2]),
                "employment_income_count": _to_float(df.iloc[i, 4]),
                "employment_income_amount": _to_float(df.iloc[i, 5]),
                "state_pension_count": _to_float(df.iloc[i, 7]),
                "state_pension_amount": _to_float(df.iloc[i, 8]),
                "private_pension_income_count": _to_float(df.iloc[i, 10]),
                "private_pension_income_amount": _to_float(df.iloc[i, 11]),
            }
        )
    return pd.DataFrame(data_rows)
+ """ + df = pd.read_excel( + io.BytesIO(ods_bytes), + sheet_name="Table_3_7", + engine="odf", + header=None, + ) + data_rows = [] + for i in range(5, len(df)): + lower = df.iloc[i, 0] + if not isinstance(lower, (int, float)): + break + data_rows.append( + { + "lower_bound": int(lower), + "property_income_count": _to_float(df.iloc[i, 1]), + "property_income_amount": _to_float(df.iloc[i, 2]), + "savings_interest_income_count": _to_float(df.iloc[i, 4]), + "savings_interest_income_amount": _to_float(df.iloc[i, 5]), + "dividend_income_count": _to_float(df.iloc[i, 7]), + "dividend_income_amount": _to_float(df.iloc[i, 8]), + } + ) + return pd.DataFrame(data_rows) + + +def _to_float(val) -> float: + """Convert cell value to float, handling '[Not available]' etc.""" + if isinstance(val, (int, float)): + return float(val) + return 0.0 + + +INCOME_VARIABLES = [ + "employment_income", + "self_employment_income", + "state_pension", + "private_pension_income", + "property_income", + "dividend_income", +] + + +def get_targets() -> list[Target]: + """Build income-band targets from the live HMRC SPI ODS. + + Also reads incomes_projection.csv if available, which contains + projected future year data generated by the microsimulation. 
+ """ + config = _load_config() + ref = config["hmrc"]["spi_collated"] + targets = [] + + # Parse base year from official ODS + try: + ods_bytes = _download_ods(ref) + t36 = _parse_table_36(ods_bytes) + t37 = _parse_table_37(ods_bytes) + merged = t36.merge(t37, on="lower_bound", how="outer") + + for idx, row in merged.iterrows(): + lower = int(row["lower_bound"]) + upper = ( + _BAND_UPPER[idx] if idx < len(_BAND_UPPER) else float("inf") + ) + band_label = f"{lower:_}_to_{upper:_}" + + for variable in INCOME_VARIABLES: + amount_col = f"{variable}_amount" + count_col = f"{variable}_count" + + if amount_col in row.index and row[amount_col] > 0: + # SPI amounts are in £millions, counts in thousands + targets.append( + Target( + name=f"hmrc/{variable}_income_band_{band_label}", + variable=variable, + source="hmrc_spi", + unit=Unit.GBP, + values={ + _SPI_YEAR: float(row[amount_col]) * 1e6 + }, + breakdown_variable="total_income", + lower_bound=float(lower), + upper_bound=float(upper), + reference_url=ref, + ) + ) + + if count_col in row.index and row[count_col] > 0: + targets.append( + Target( + name=( + f"hmrc/{variable}_count_income_band" + f"_{band_label}" + ), + variable=variable, + source="hmrc_spi", + unit=Unit.COUNT, + values={ + _SPI_YEAR: float(row[count_col]) * 1e3 + }, + is_count=True, + breakdown_variable="total_income", + lower_bound=float(lower), + upper_bound=float(upper), + reference_url=ref, + ) + ) + except Exception as e: + logger.error("Failed to download/parse HMRC SPI ODS: %s", e) + + # Also read projected future years from incomes_projection.csv + # if it exists (generated by utils/incomes_projection.py) + proj_path = _STORAGE / "incomes_projection.csv" + if proj_path.exists(): + targets.extend(_read_projection_csv(proj_path, ref)) + + return targets + + +def _read_projection_csv( + csv_path: Path, ref: str +) -> list[Target]: + """Read projected future year targets from incomes_projection.csv.""" + incomes = pd.read_csv(csv_path) + targets = [] + 
def _read_projection_csv(
    csv_path: Path, ref: str
) -> list[Target]:
    """Read projected future year targets from incomes_projection.csv.

    Rows for the base SPI year are skipped (actuals come from the ODS).
    Targets that share a name across years are merged into one Target
    holding a multi-year values dict; first-occurrence order is kept.
    """
    incomes = pd.read_csv(csv_path)
    merged: dict[str, Target] = {}

    def add(name: str, **kwargs) -> None:
        # Merge the values dict into an existing target of the same
        # name, or register a new one.
        if name in merged:
            merged[name].values.update(kwargs["values"])
        else:
            merged[name] = Target(name=name, **kwargs)

    for year in incomes.year.unique():
        if year <= _SPI_YEAR:
            continue  # Skip base year — we have actuals from ODS

        for _, row in incomes[incomes.year == year].iterrows():
            lower = row.total_income_lower_bound
            upper = row.total_income_upper_bound
            band_label = f"{lower:_.0f}_to_{upper:_.0f}"

            for variable in INCOME_VARIABLES:
                amount_col = f"{variable}_amount"
                count_col = f"{variable}_count"
                common = dict(
                    variable=variable,
                    source="hmrc_spi",
                    breakdown_variable="total_income",
                    lower_bound=float(lower),
                    upper_bound=float(upper),
                    reference_url=ref,
                )

                if amount_col in row.index and pd.notna(row[amount_col]):
                    add(
                        f"hmrc/{variable}_income_band_{band_label}",
                        unit=Unit.GBP,
                        values={int(year): float(row[amount_col])},
                        **common,
                    )

                if count_col in row.index and pd.notna(row[count_col]):
                    add(
                        f"hmrc/{variable}_count_income_band_{band_label}",
                        unit=Unit.COUNT,
                        values={int(year): float(row[count_col])},
                        is_count=True,
                        **common,
                    )

    return list(merged.values())
def get_targets() -> list[Target]:
    """Return total mortgage payment and private rent GBP targets."""
    ref = (
        "https://www.ons.gov.uk/economy/inflationandpriceindices/"
        "bulletins/privaterentandhousepricesuk/january2025"
    )
    spec = [
        (
            "housing/total_mortgage",
            "mortgage_capital_repayment",
            _MORTGAGE_TOTAL,
        ),
        ("housing/rent_private", "rent", _PRIVATE_RENT_TOTAL),
    ]
    return [
        Target(
            name=name,
            variable=variable,
            source="ons",
            unit=Unit.GBP,
            values={2025: total},
            reference_url=ref,
        )
        for name, variable, total in spec
    ]
+ +Source: ONS mid-year population estimates +https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates +""" + +import logging +from pathlib import Path + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +_CONST_DIR = ( + Path(__file__).parents[2] + / "datasets" + / "local_areas" + / "constituencies" + / "targets" +) +_LA_DIR = ( + Path(__file__).parents[2] + / "datasets" + / "local_areas" + / "local_authorities" + / "targets" +) +_STORAGE = Path(__file__).parents[2] / "storage" + +_REF = ( + "https://www.ons.gov.uk/peoplepopulationandcommunity/" + "populationandmigration/populationestimates" +) + +_AGE_BANDS = list(range(0, 80, 10)) # [0, 10, 20, ..., 70] + + +def _load_age_csv(path: Path) -> pd.DataFrame: + """Load age.csv, returning code + single-year columns.""" + if not path.exists(): + logger.warning("Age CSV not found: %s", path) + return pd.DataFrame() + return pd.read_csv(path) + + +def _aggregate_to_bands(ages: pd.DataFrame) -> pd.DataFrame: + """Sum single-year ages into 10-year bands. + + Returns DataFrame with columns: code, name, age/0_10, age/10_20, etc. + """ + result = ages[["code", "name"]].copy() + for lower in _AGE_BANDS: + upper = lower + 10 + cols = [str(a) for a in range(lower, upper) if str(a) in ages.columns] + result[f"age/{lower}_{upper}"] = ages[cols].sum(axis=1) + return result + + +def get_constituency_age_targets() -> pd.DataFrame: + """Age targets for 650 constituencies (2010 boundary codes). + + Returns DataFrame with 650 rows × (code, name, age/0_10, ..., age/70_80). + Caller must apply mapping_matrix to transform to 2024 boundaries. + """ + ages = _load_age_csv(_CONST_DIR / "age.csv") + if ages.empty: + return ages + return _aggregate_to_bands(ages) + + +def get_la_age_targets() -> pd.DataFrame: + """Age targets for 360 local authorities. + + Returns DataFrame with 360 rows × (code, name, age/0_10, ..., age/70_80). 
+ """ + ages = _load_age_csv(_LA_DIR / "age.csv") + if ages.empty: + return ages + return _aggregate_to_bands(ages) + + +def get_uk_total_population(year: int) -> float: + """UK total population from demographics.csv (in persons, not thousands).""" + csv_path = _STORAGE / "demographics.csv" + if not csv_path.exists(): + return 69.9e6 # fallback + demographics = pd.read_csv(csv_path) + row = demographics[demographics.name == "uk_population"] + col = str(year) + if col in row.columns and not row[col].isna().all(): + return float(row[col].values[0]) * 1e6 + return 69.9e6 + + +REFERENCE_URL = _REF diff --git a/policyengine_uk_data/targets/sources/local_income.py b/policyengine_uk_data/targets/sources/local_income.py new file mode 100644 index 00000000..1e418313 --- /dev/null +++ b/policyengine_uk_data/targets/sources/local_income.py @@ -0,0 +1,96 @@ +"""Local area income targets from HMRC SPI table 3.15. + +Reads pre-processed SPI CSV files for constituencies and local authorities, +extracting employment and self-employment income (count + amount) per area. + +National consistency adjustment (scaling local totals to match national SPI +projections) is applied by the caller, not here. 
import logging
from pathlib import Path

import pandas as pd

logger = logging.getLogger(__name__)

_CONST_DIR = (
    Path(__file__).parents[2]
    / "datasets"
    / "local_areas"
    / "constituencies"
    / "targets"
)
_LA_DIR = (
    Path(__file__).parents[2]
    / "datasets"
    / "local_areas"
    / "local_authorities"
    / "targets"
)
_STORAGE = Path(__file__).parents[2] / "storage"

_REF = (
    "https://www.gov.uk/government/statistics/"
    "income-and-tax-by-county-and-region-and-by-parliamentary-constituency"
)

_INCOME_VARIABLES = ["self_employment_income", "employment_income"]


def _load_spi(path: Path) -> pd.DataFrame:
    """Load an SPI CSV, or an empty DataFrame (warned) if missing."""
    if not path.exists():
        logger.warning("SPI CSV not found: %s", path)
        return pd.DataFrame()
    return pd.read_csv(path)


def _income_columns() -> list[str]:
    """Column selection shared by both area types."""
    cols = ["code", "name"]
    for v in _INCOME_VARIABLES:
        cols.extend([f"{v}_count", f"{v}_amount"])
    return cols


def get_constituency_income_targets() -> pd.DataFrame:
    """Income targets for 650 constituencies (2010 codes).

    Returns DataFrame with columns: code, name, and for each income
    variable: {var}_count, {var}_amount. Empty if the CSV is missing.
    """
    spi = _load_spi(_CONST_DIR / "spi_by_constituency.csv")
    if spi.empty:
        return spi
    return spi[_income_columns()]


def get_la_income_targets() -> pd.DataFrame:
    """Income targets for 360 local authorities.

    Returns DataFrame with columns: code, name, and for each income
    variable: {var}_count, {var}_amount. Empty if the CSV is missing.
    """
    spi = _load_spi(_LA_DIR / "spi_by_la.csv")
    if spi.empty:
        return spi
    return spi[_income_columns()]


def get_national_income_projections(year: int) -> pd.DataFrame:
    """National income projections for consistency adjustment.

    Returns the incomes_projection.csv rows for the requested year,
    clamped into the range of years present in the file. (The previous
    implementation only clamped from below, so a year beyond the
    projection horizon silently returned an empty frame.)
    """
    path = _STORAGE / "incomes_projection.csv"
    if not path.exists():
        return pd.DataFrame()
    df = pd.read_csv(path)
    if df.empty:
        return df
    target_year = min(max(year, df.year.min()), df.year.max())
    return df[df.year == target_year]


INCOME_VARIABLES = _INCOME_VARIABLES
REFERENCE_URL = _REF
"privaterentalmarketsummarystatisticsinengland" +) + + +def load_ons_la_income() -> pd.DataFrame: + """Load ONS income estimates by local authority. + + Returns DataFrame with columns: la_code, total_income, net_income_bhc, + net_income_ahc (mean income per household, FYE 2020). + """ + xlsx_path = _STORAGE / "local_authority_ons_income.xlsx" + if not xlsx_path.exists(): + logger.warning("ONS LA income file not found: %s", xlsx_path) + return pd.DataFrame() + + xlsx = pd.ExcelFile(xlsx_path) + + def load_sheet(sheet_name: str, value_col: str) -> pd.DataFrame: + df = pd.read_excel(xlsx, sheet_name=sheet_name, header=3) + df.columns = [ + "msoa_code", "msoa_name", "la_code", "la_name", + "region_code", "region_name", value_col, + "upper_ci", "lower_ci", "ci_width", + ] + df = df.iloc[1:].dropna(subset=["msoa_code"]) + df[value_col] = pd.to_numeric(df[value_col]) + return df[["la_code", value_col]] + + total = load_sheet("Total annual income", "total_income") + bhc = load_sheet("Net income before housing costs", "net_income_bhc") + ahc = load_sheet("Net income after housing costs", "net_income_ahc") + + la_total = total.groupby("la_code")["total_income"].mean().reset_index() + la_bhc = bhc.groupby("la_code")["net_income_bhc"].mean().reset_index() + la_ahc = ahc.groupby("la_code")["net_income_ahc"].mean().reset_index() + + return la_total.merge(la_bhc, on="la_code").merge(la_ahc, on="la_code") + + +def load_household_counts() -> pd.DataFrame: + """Load household counts by LA (Census 2021). + + Returns DataFrame with columns: la_code, households. + """ + path = _STORAGE / "la_count_households.xlsx" + if not path.exists(): + logger.warning("LA household count file not found: %s", path) + return pd.DataFrame() + df = pd.read_excel(path, sheet_name="Dataset") + df.columns = ["la_code", "la_name", "households"] + return df[["la_code", "households"]] + + +def load_tenure_data() -> pd.DataFrame: + """Load tenure percentages by LA. 
+ + Returns DataFrame with columns: la_code, owned_outright_pct, + owned_mortgage_pct, private_rent_pct, social_rent_pct. + """ + path = _STORAGE / "la_tenure.xlsx" + if not path.exists(): + logger.warning("LA tenure file not found: %s", path) + return pd.DataFrame() + df = pd.read_excel(path, sheet_name="data download") + df.columns = [ + "region_code", "region_name", "la_code", "la_name", + "owned_outright_pct", "owned_mortgage_pct", + "private_rent_pct", "social_rent_pct", + ] + return df[["la_code", "owned_outright_pct", "owned_mortgage_pct", + "private_rent_pct", "social_rent_pct"]] + + +def load_private_rents() -> pd.DataFrame: + """Load median monthly private rents by LA. + + Returns DataFrame with columns: area_code, median_annual_rent. + """ + path = _STORAGE / "la_private_rents_median.xlsx" + if not path.exists(): + logger.warning("LA private rent file not found: %s", path) + return pd.DataFrame() + df = pd.read_excel(path, sheet_name="Figure 3", header=5) + df.columns = [ + "col0", "la_code_old", "area_code", "area_name", "room", + "studio", "one_bed", "two_bed", "three_bed", "four_plus", + "median_monthly_rent", + ] + df = df[df["area_code"].astype(str).str.match(r"^E0[6789]")] + df["median_monthly_rent"] = pd.to_numeric( + df["median_monthly_rent"], errors="coerce" + ) + df["median_annual_rent"] = df["median_monthly_rent"] * 12 + return df[["area_code", "median_annual_rent"]] diff --git a/policyengine_uk_data/targets/sources/local_uc.py b/policyengine_uk_data/targets/sources/local_uc.py new file mode 100644 index 00000000..c326498d --- /dev/null +++ b/policyengine_uk_data/targets/sources/local_uc.py @@ -0,0 +1,42 @@ +"""Local area UC household targets from DWP Stat-Xplore. + +UC household counts by parliamentary constituency and local authority, +loaded from pre-downloaded Stat-Xplore exports and scaled to match +national UC payment distribution totals. 
+ +Source: DWP Stat-Xplore +https://stat-xplore.dwp.gov.uk +""" + +import logging + +import pandas as pd + +logger = logging.getLogger(__name__) + +_REF = "https://stat-xplore.dwp.gov.uk" + + +def get_constituency_uc_targets() -> pd.Series: + """UC household counts for 650 constituencies (positional order). + + Returns Series of household_count values, aligned to the same + ordering as the constituency age.csv. + """ + from policyengine_uk_data.utils.uc_data import uc_pc_households + + return uc_pc_households.household_count + + +def get_la_uc_targets() -> pd.Series: + """UC household counts for 360 local authorities (positional order). + + Returns Series of household_count values, aligned to the same + ordering as the LA age.csv. + """ + from policyengine_uk_data.utils.uc_data import uc_la_households + + return uc_la_households.household_count + + +REFERENCE_URL = _REF diff --git a/policyengine_uk_data/targets/sources/nts_vehicles.py b/policyengine_uk_data/targets/sources/nts_vehicles.py new file mode 100644 index 00000000..18fd8df8 --- /dev/null +++ b/policyengine_uk_data/targets/sources/nts_vehicles.py @@ -0,0 +1,49 @@ +"""NTS vehicle ownership targets. + +From the National Travel Survey 2024. 
+Source: https://www.gov.uk/government/statistics/national-travel-survey-2024 +""" + +from policyengine_uk_data.targets.schema import Target, Unit + +_REF = "https://www.gov.uk/government/statistics/national-travel-survey-2024" + +# NTS 2024: 22% no car, 44% one car, 34% two+ cars +NTS_NO_VEHICLE_RATE = 0.22 +NTS_ONE_VEHICLE_RATE = 0.44 +NTS_TWO_PLUS_VEHICLE_RATE = 0.34 + +# ~29.6m total UK households (from VOA/ONS council tax stock 2024) +_TOTAL_HOUSEHOLDS = 29.6e6 + + +def get_targets() -> list[Target]: + return [ + Target( + name="nts/households_no_vehicle", + variable="num_vehicles", + source="nts", + unit=Unit.COUNT, + values={2024: _TOTAL_HOUSEHOLDS * NTS_NO_VEHICLE_RATE}, + is_count=True, + reference_url=_REF, + ), + Target( + name="nts/households_one_vehicle", + variable="num_vehicles", + source="nts", + unit=Unit.COUNT, + values={2024: _TOTAL_HOUSEHOLDS * NTS_ONE_VEHICLE_RATE}, + is_count=True, + reference_url=_REF, + ), + Target( + name="nts/households_two_plus_vehicles", + variable="num_vehicles", + source="nts", + unit=Unit.COUNT, + values={2024: _TOTAL_HOUSEHOLDS * NTS_TWO_PLUS_VEHICLE_RATE}, + is_count=True, + reference_url=_REF, + ), + ] diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py new file mode 100644 index 00000000..6867eb1f --- /dev/null +++ b/policyengine_uk_data/targets/sources/obr.py @@ -0,0 +1,504 @@ +"""OBR Economic and Fiscal Outlook targets. + +Downloads and parses the OBR's detailed supplementary tables (receipts +and expenditure xlsx) to extract tax receipt forecasts, benefit +expenditure, and benefit caseloads. 
+ +Sources: +- Receipts: https://obr.uk/download/november-2025-economic-and-fiscal-outlook-detailed-forecast-tables-receipts/ +- Expenditure: https://obr.uk/download/november-2025-economic-and-fiscal-outlook-detailed-forecast-tables-expenditure/ +""" + +import io +import logging +from functools import lru_cache +from pathlib import Path + +import openpyxl +import requests +import yaml + +from policyengine_uk_data.targets.schema import Target, Unit + +logger = logging.getLogger(__name__) + +_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" + +# Financial year columns in OBR tables: C=2024-25, D=2025-26, ..., I=2030-31 +# PolicyEngine convention: FY 2025-26 → calendar year 2025 (first year) +_FY_COL_TO_YEAR = { + "C": 2024, + "D": 2025, + "E": 2026, + "F": 2027, + "G": 2028, + "H": 2029, + "I": 2030, +} + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36" + ), +} + + +def _load_config(): + with open(_SOURCES_YAML) as f: + return yaml.safe_load(f) + + +@lru_cache(maxsize=1) +def _download_workbook(url: str) -> openpyxl.Workbook: + """Download an xlsx from OBR and return an openpyxl workbook.""" + r = requests.get(url, headers=_HEADERS, allow_redirects=True, timeout=60) + r.raise_for_status() + return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False) + + +def _read_row_values( + ws, row_num: int, col_letters: list[str] +) -> dict[int, float]: + """Read numeric values from a row, mapped to calendar years.""" + result = {} + for col in col_letters: + cell = ws[f"{col}{row_num}"] + val = cell.value + if val is not None and isinstance(val, (int, float)): + result[_FY_COL_TO_YEAR[col]] = float(val) * 1e9 + return result + + +def _find_row(ws, label: str, col: str = "B", max_row: int = 80) -> int: + """Find the row number where a cell starts with label.""" + for row in range(1, max_row + 1): + cell_val = ws[f"{col}{row}"].value + if cell_val and str(cell_val).strip().startswith(label): + return 
row + raise ValueError(f"Row '{label}' not found in sheet") + + +def _parse_receipts(wb: openpyxl.Workbook) -> list[Target]: + """Parse tax receipts from the OBR EFO. + + Income tax uses Table 3.4 (accrued basis) for consistency with + the standard fiscal forecasting convention. Other receipts use + Table 3.9 (cash basis) since they only appear there. + """ + config = _load_config() + vintage = config["obr"]["vintage"] + ref = config["obr"]["efo_receipts"] + cols_34 = list(_FY_COL_TO_YEAR.keys()) + + # Table 3.9 columns are shifted right by one vs 3.4 + cols_39 = ["D", "E", "F", "G", "H", "I", "J"] + fy_39 = { + "D": 2024, + "E": 2025, + "F": 2026, + "G": 2027, + "H": 2028, + "I": 2029, + "J": 2030, + } + + def read_39(ws, row_num: int) -> dict[int, float]: + result = {} + for col in cols_39: + cell = ws[f"{col}{row_num}"] + val = cell.value + if val is not None and isinstance(val, (int, float)): + result[fy_39[col]] = float(val) * 1e9 + return result + + targets = [] + + # Income tax from Table 3.4 (accrued basis) + try: + ws34 = wb["3.4"] + row_num = _find_row(ws34, "Income tax (gross of tax credits)", col="B", max_row=30) + values = _read_row_values(ws34, row_num, cols_34) + if values: + targets.append( + Target( + name="obr/income_tax", + variable="income_tax", + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ) + except ValueError: + logger.warning("OBR receipts: income tax row not found in 3.4") + + # Other receipts from Table 3.9 (cash basis) + ws39 = wb["3.9"] + cash_rows = { + "ni": ("National insurance contributions", "ni_employee"), + "vat": ("Value added tax", "vat"), + "fuel_duties": ("Fuel duties", "fuel_duty"), + "capital_gains_tax": ("Capital gains tax", "capital_gains_tax"), + "sdlt": ("Stamp duty land tax", "stamp_duty_land_tax"), + } + + for name, (label, variable) in cash_rows.items(): + try: + row_num = _find_row(ws39, label, col="B", max_row=80) + values = read_39(ws39, row_num) + if 
values: + targets.append( + Target( + name=f"obr/{name}", + variable=variable, + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ) + except ValueError: + logger.warning("OBR receipts: row '%s' not found", label) + + return targets + + +def _parse_council_tax(wb: openpyxl.Workbook) -> list[Target]: + """Parse Table 4.1 (council tax receipts) from expenditure xlsx.""" + config = _load_config() + vintage = config["obr"]["vintage"] + ref = config["obr"]["efo_expenditure"] + ws = wb["4.1"] + + cols = ["C", "D", "E", "F", "G", "H", "I"] + fy = { + "C": 2024, + "D": 2025, + "E": 2026, + "F": 2027, + "G": 2028, + "H": 2029, + "I": 2030, + } + + def read_41(row_num: int) -> dict[int, float]: + result = {} + for col in cols: + cell = ws[f"{col}{row_num}"] + val = cell.value + if val is not None and isinstance(val, (int, float)): + result[fy[col]] = float(val) * 1e9 + return result + + ct_rows = { + "council_tax": ("Total net council tax receipts", "council_tax"), + "council_tax_england": ( + "England council tax receipts", + "council_tax", + ), + "council_tax_scotland": ( + "Scotland council tax receipts", + "council_tax", + ), + "council_tax_wales": ("Wales council tax receipts", "council_tax"), + "domestic_rates": ("NI domestic rates", "domestic_rates"), + } + + targets = [] + for name, (label, variable) in ct_rows.items(): + try: + row_num = _find_row(ws, label, col="B", max_row=30) + values = read_41(row_num) + if values: + targets.append( + Target( + name=f"obr/{name}", + variable=variable, + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ) + except ValueError: + logger.warning("OBR council tax: row '%s' not found", label) + + return targets + + +def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: + """Parse Table 3.4 (income tax and NICs detail) for employee/employer.""" + config = _load_config() + vintage = config["obr"]["vintage"] + ref = 
config["obr"]["efo_receipts"] + ws = wb["3.4"] + cols = list(_FY_COL_TO_YEAR.keys()) + + nic_rows = { + "ni_employee": ( + "Class 1 Employee NICs", + "ni_employee", + ), + "ni_employer": ( + "Class 1 Employer NICs", + "ni_employer", + ), + } + + targets = [] + for name, (label, variable) in nic_rows.items(): + try: + row_num = _find_row(ws, label, col="B", max_row=30) + values = _read_row_values(ws, row_num, cols) + if values: + targets.append( + Target( + name=f"obr/{name}", + variable=variable, + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ) + except ValueError: + logger.warning("OBR NICs: row '%s' not found", label) + + return targets + + +def _parse_welfare(wb: openpyxl.Workbook) -> list[Target]: + """Parse Table 4.9 (welfare spending) from expenditure xlsx.""" + config = _load_config() + vintage = config["obr"]["vintage"] + ref = config["obr"]["efo_expenditure"] + ws = wb["4.9"] + + cols = ["C", "D", "E", "F", "G", "H", "I"] + fy = { + "C": 2024, + "D": 2025, + "E": 2026, + "F": 2027, + "G": 2028, + "H": 2029, + "I": 2030, + } + + def read_49(row_num: int) -> dict[int, float]: + result = {} + for col in cols: + cell = ws[f"{col}{row_num}"] + val = cell.value + if val is not None and isinstance(val, (int, float)): + result[fy[col]] = float(val) * 1e9 + return result + + benefit_rows = { + "housing_benefit": ( + "Housing benefit (not on JSA)", + "housing_benefit", + ), + "pip": ( + "Disability living allowance and personal independence p", + "pip", + ), + "esa": ("Incapacity benefits", "esa_income"), + "attendance_allowance": ( + "Attendance allowance", + "attendance_allowance", + ), + "pension_credit": ("Pension credit", "pension_credit"), + "carers_allowance": ("Carer's allowance", "carers_allowance"), + "statutory_maternity_pay": ( + "Statutory maternity pay", + "statutory_maternity_pay", + ), + "winter_fuel_allowance": ( + "Winter fuel payment", + "winter_fuel_allowance", + ), + 
"universal_credit_in_cap": ( + "Universal credit", + "universal_credit", + ), + "child_benefit": ("Child benefit", "child_benefit"), + "state_pension": ("State pension", "state_pension"), + "jobseekers_allowance": ( + "Jobseeker's allowance", + "jsa_income", + ), + } + + targets = [] + # Welfare cap section (rows 6-36) + for name, (label, variable) in benefit_rows.items(): + try: + row_num = _find_row(ws, label, col="B", max_row=55) + values = read_49(row_num) + if values: + targets.append( + Target( + name=f"obr/{name}", + variable=variable, + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ) + except ValueError: + logger.warning("OBR welfare: row '%s' not found", label) + + # Universal credit outside cap (row 43) is jobseekers UC + try: + # UC outside cap = predominantly JSA-conditionality UC + uc_outside_row = _find_row( + ws, "Universal credit", col="B", max_row=55 + ) + # Find the second UC row (outside cap section) + for row in range(uc_outside_row + 1, 55): + cell_val = ws[f"B{row}"].value + if cell_val and str(cell_val).strip().startswith( + "Universal credit" + ): + values = read_49(row) + if values: + targets.append( + Target( + name="obr/universal_credit_outside_cap", + variable="universal_credit", + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ) + break + except ValueError: + logger.warning("OBR welfare: UC outside cap not found") + + return targets + + +def _parse_tv_licence(wb: openpyxl.Workbook) -> list[Target]: + """Parse Table 4.19 (BBC) from expenditure xlsx.""" + config = _load_config() + vintage = config["obr"]["vintage"] + ref = config["obr"]["efo_expenditure"] + + try: + ws = wb["4.19"] + cols = ["C", "D", "E", "F", "G", "H", "I"] + fy = { + "C": 2024, + "D": 2025, + "E": 2026, + "F": 2027, + "G": 2028, + "H": 2029, + "I": 2030, + } + + # Find "Licence fee receipts" or "BBC licence fee" + for row_num in range(1, 30): + val = 
ws[f"B{row_num}"].value + if val and "licence fee" in str(val).lower(): + values = {} + for col in cols: + cell = ws[f"{col}{row_num}"] + v = cell.value + if v is not None and isinstance(v, (int, float)): + values[fy[col]] = float(v) * 1e9 + if values: + return [ + Target( + name="obr/tv_licence_fee", + variable="tv_licence", + source="obr", + unit=Unit.GBP, + values=values, + reference_url=ref, + forecast_vintage=vintage, + ) + ] + except Exception: + logger.warning("OBR: TV licence table not found") + + return [] + + +# ISC census: private school students (roughly constant at ~557k) +_PRIVATE_SCHOOL = {y: 557_000 for y in range(2018, 2032)} + +# SPP Review: salary sacrifice NI relief (uprated 3% pa from 2024 base) +_SS_EMPLOYEE_NI = { + y: 1.2e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032) +} +_SS_EMPLOYER_NI = { + y: 2.9e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032) +} + + +def get_targets() -> list[Target]: + config = _load_config() + targets = [] + + try: + receipts_wb = _download_workbook(config["obr"]["efo_receipts"]) + targets.extend(_parse_receipts(receipts_wb)) + targets.extend(_parse_nics(receipts_wb)) + except Exception as e: + logger.error("Failed to download/parse OBR receipts: %s", e) + + try: + expenditure_wb = _download_workbook( + config["obr"]["efo_expenditure"] + ) + targets.extend(_parse_council_tax(expenditure_wb)) + targets.extend(_parse_welfare(expenditure_wb)) + targets.extend(_parse_tv_licence(expenditure_wb)) + except Exception as e: + logger.error("Failed to download/parse OBR expenditure: %s", e) + + # Static targets that don't come from the xlsx + targets.append( + Target( + name="obr/private_school_students", + variable="attends_private_school", + source="obr", + unit=Unit.COUNT, + values=_PRIVATE_SCHOOL, + is_count=True, + reference_url="https://www.isc.co.uk/research/annual-census/", + ) + ) + targets.append( + Target( + name="obr/salary_sacrifice_employee_ni_relief", + variable="ni_employee", + source="obr", + 
unit=Unit.GBP,
+            values=_SS_EMPLOYEE_NI,
+            reference_url="https://assets.publishing.service.gov.uk/media/67ce0e7c08e764d17a5d3c21/2025_SPP_Review.pdf",
+        )
+    )
+    targets.append(
+        Target(
+            name="obr/salary_sacrifice_employer_ni_relief",
+            variable="ni_employer",
+            source="obr",
+            unit=Unit.GBP,
+            values=_SS_EMPLOYER_NI,
+            reference_url="https://assets.publishing.service.gov.uk/media/67ce0e7c08e764d17a5d3c21/2025_SPP_Review.pdf",
+        )
+    )
+
+    return targets
diff --git a/policyengine_uk_data/targets/sources/ons_demographics.py b/policyengine_uk_data/targets/sources/ons_demographics.py
new file mode 100644
index 00000000..1cf0a23b
--- /dev/null
+++ b/policyengine_uk_data/targets/sources/ons_demographics.py
@@ -0,0 +1,331 @@
+"""ONS population projections and demographic targets.
+
+Downloads the ONS 2022-based principal population projection for the
+UK to extract total population and gender × age band targets.
+
+For regional age breakdowns (12 regions × 9 age bands), reads the
+pre-existing demographics.csv which was extracted from ONS subnational
+projections. The subnational projections don't have a stable machine-
+readable download URL, so this is the pragmatic compromise.
+
+Household type and tenure targets are scraped from live ONS downloads
+by the dedicated ons_households.py and ons_tenure.py modules.
+ +Sources: +- UK projections: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/datasets/z1zippedpopulationprojectionsdatafilesuk +- NRS Scotland: https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/population/population-estimates/mid-year-population-estimates +""" + +import io +import logging +import zipfile +from functools import lru_cache +from pathlib import Path + +import pandas as pd +import requests +import yaml + +from policyengine_uk_data.targets.schema import ( + GeographicLevel, + Target, + Unit, +) + +logger = logging.getLogger(__name__) + +_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" +_STORAGE = Path(__file__).parents[2] / "storage" + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36" + ), +} + +_UK_ZIP_URL = ( + "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/" + "populationandmigration/populationprojections/datasets/" + "z1zippedpopulationprojectionsdatafilesuk/2022based/uk.zip" +) + +_REF_REGION = ( + "https://www.ons.gov.uk/peoplepopulationandcommunity/" + "populationandmigration/populationprojections/datasets/" + "z1zippedpopulationprojectionsdatafilesuk" +) +_REF_NRS = ( + "https://www.nrscotland.gov.uk/statistics-and-data/statistics/" + "statistics-by-theme/population/population-estimates/" + "mid-year-population-estimates" +) + +_YEARS = list(range(2022, 2030)) + +# Age band boundaries +_AGE_BANDS = [ + (0, 9), + (10, 19), + (20, 29), + (30, 39), + (40, 49), + (50, 59), + (60, 69), + (70, 79), + (80, 89), +] + +_GENDER_BANDS = [ + (0, 14), + (15, 29), + (30, 44), + (45, 59), + (60, 74), + (75, 90), +] + + +@lru_cache(maxsize=1) +def _download_uk_projection() -> pd.DataFrame: + """Download and parse the UK principal population projection.""" + r = requests.get( + _UK_ZIP_URL, headers=_HEADERS, allow_redirects=True, timeout=120 + ) + r.raise_for_status() + z = 
zipfile.ZipFile(io.BytesIO(r.content)) + with z.open("uk/uk_ppp_machine_readable.xlsx") as f: + df = pd.read_excel( + io.BytesIO(f.read()), + sheet_name="Population", + engine="openpyxl", + ) + return df + + +def _aggregate_ages( + df: pd.DataFrame, sex: str, low: int, high: int, years: list[int] +) -> dict[int, float]: + """Sum population for a sex and age range across years.""" + sex_filter = "Females" if sex == "female" else "Males" + mask = (df["Sex"] == sex_filter) & ( + df["Age"].apply(lambda a: isinstance(a, int) and low <= a <= high) + ) + subset = df[mask] + result = {} + for y in years: + if y in subset.columns: + result[y] = float(subset[y].sum()) + return result + + +def _parse_uk_totals(df: pd.DataFrame) -> list[Target]: + """Extract UK total population and gender × age bands.""" + targets = [] + + # UK total + uk_pop = {} + for y in _YEARS: + if y in df.columns: + uk_pop[y] = float(df[y].sum()) + if uk_pop: + targets.append( + Target( + name="ons/uk_population", + variable="age", + source="ons", + unit=Unit.COUNT, + values=uk_pop, + is_count=True, + reference_url=_REF_REGION, + ) + ) + + # Gender × age bands + for sex in ["female", "male"]: + for low, high in _GENDER_BANDS: + values = _aggregate_ages(df, sex, low, high, _YEARS) + if values: + targets.append( + Target( + name=f"ons/{sex}_{low}_{high}", + variable="age", + source="ons", + unit=Unit.COUNT, + values=values, + is_count=True, + reference_url=_REF_REGION, + ) + ) + + return targets + + +def _parse_regional_from_csv() -> list[Target]: + """Read regional age band targets from demographics.csv. + + This CSV was extracted from ONS subnational projections which + lack a stable machine-readable download URL. 
+ """ + csv_path = _STORAGE / "demographics.csv" + if not csv_path.exists(): + logger.warning("demographics.csv not found, skipping regional") + return [] + + demographics = pd.read_csv(csv_path) + targets = [] + + # Skip rows now handled by dedicated modules (ons_households.py, + # ons_tenure.py) and rows handled elsewhere in this module + _SKIP_PREFIXES = ("tenure_", "scotland_households") + _SKIP_NAMES = { + "couple_3_plus_children_households", + "couple_no_children_households", + "couple_non_dependent_children_only_households", + "couple_under_3_children_households", + "lone_households_over_65", + "lone_households_under_65", + "lone_parent_dependent_children_households", + "lone_parent_non_dependent_children_households", + "multi_family_households", + "unrelated_adult_households", + } + + for _, row in demographics.iterrows(): + name = row["name"] + if name in _SKIP_NAMES or any( + name.startswith(p) for p in _SKIP_PREFIXES + ): + continue + values = {} + for y in _YEARS: + col = str(y) + if col in row.index and pd.notna(row[col]): + # Values in CSV are in thousands + values[y] = float(row[col]) * 1e3 + if values: + targets.append( + Target( + name=f"ons/{name}", + variable="age", + source="ons", + unit=Unit.COUNT, + geographic_level=GeographicLevel.REGION, + values=values, + is_count=True, + reference_url=_REF_REGION, + ) + ) + + return targets + + +# Scotland-specific (from NRS/census — not in ONS projections) +_SCOTLAND_CHILDREN_UNDER_16 = { + y: v * 1e3 + for y, v in { + 2022: 904, + 2023: 900, + 2024: 896, + 2025: 892, + 2026: 888, + 2027: 884, + 2028: 880, + }.items() +} + +_SCOTLAND_BABIES_UNDER_1 = { + y: v * 1e3 + for y, v in { + 2022: 46, + 2023: 46, + 2024: 46, + 2025: 46, + 2026: 46, + 2027: 46, + 2028: 46, + }.items() +} + +_SCOTLAND_HOUSEHOLDS_3PLUS_CHILDREN = { + y: v * 1e3 + for y, v in { + 2022: 82, + 2023: 82, + 2024: 82, + 2025: 82, + 2026: 82, + 2027: 82, + 2028: 82, + }.items() +} + + +# Household types and tenure are now scraped from ONS 
in +# ons_households.py and ons_tenure.py respectively. + + +def get_targets() -> list[Target]: + targets = [] + + # UK total + gender × age from live download + try: + df = _download_uk_projection() + targets.extend(_parse_uk_totals(df)) + except Exception as e: + logger.error("Failed to download ONS UK projections: %s", e) + + # Regional age bands from demographics.csv + targets.extend(_parse_regional_from_csv()) + + # Scotland-specific (NRS/census — small number of static values) + targets.append( + Target( + name="ons/scotland_children_under_16", + variable="age", + source="nrs", + unit=Unit.COUNT, + values=_SCOTLAND_CHILDREN_UNDER_16, + is_count=True, + geographic_level=GeographicLevel.COUNTRY, + geo_code="S", + geo_name="Scotland", + reference_url=_REF_NRS, + ) + ) + targets.append( + Target( + name="ons/scotland_babies_under_1", + variable="age", + source="nrs", + unit=Unit.COUNT, + values=_SCOTLAND_BABIES_UNDER_1, + is_count=True, + geographic_level=GeographicLevel.COUNTRY, + geo_code="S", + geo_name="Scotland", + reference_url=( + "https://www.nrscotland.gov.uk/publications/" + "vital-events-reference-tables-2024/" + ), + ) + ) + targets.append( + Target( + name="ons/scotland_households_3plus_children", + variable="is_child", + source="scotland_census", + unit=Unit.COUNT, + values=_SCOTLAND_HOUSEHOLDS_3PLUS_CHILDREN, + is_count=True, + geographic_level=GeographicLevel.COUNTRY, + geo_code="S", + geo_name="Scotland", + reference_url=( + "https://www.scotlandscensus.gov.uk/census-results/" + "at-a-glance/household-composition/" + ), + ) + ) + + return targets diff --git a/policyengine_uk_data/targets/sources/ons_households.py b/policyengine_uk_data/targets/sources/ons_households.py new file mode 100644 index 00000000..4c68b714 --- /dev/null +++ b/policyengine_uk_data/targets/sources/ons_households.py @@ -0,0 +1,114 @@ +"""ONS families & households targets. 
+ +Downloads Table 7 from the ONS Families and Households dataset to +get household counts by type (one-person, couples, lone parents, etc). + +Source: https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/datasets/familiesandhouseholdsfamiliesandhouseholds +""" + +import io +import logging +from functools import lru_cache + +import openpyxl +import requests + +from policyengine_uk_data.targets.schema import Target, Unit + +logger = logging.getLogger(__name__) + +_URL = ( + "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/" + "birthsdeathsandmarriages/families/datasets/" + "familiesandhouseholdsfamiliesandhouseholds/" + "current/familiesandhouseholdsuk2024.xlsx" +) +_REF = ( + "https://www.ons.gov.uk/peoplepopulationandcommunity/" + "birthsdeathsandmarriages/families/datasets/" + "familiesandhouseholdsfamiliesandhouseholds" +) +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36" + ), +} + +# Table 7 rows: (row_number, target_name) +# Row numbers are 1-indexed in the xlsx +_TABLE7_ROWS = { + 14: "lone_households_under_65", + 15: "lone_households_over_65", + 16: "unrelated_adult_households", + 19: "couple_no_children_households", + 20: "couple_under_3_children_households", + 21: "couple_3_plus_children_households", + 22: "couple_non_dependent_children_only_households", + 24: "lone_parent_dependent_children_households", + 25: "lone_parent_non_dependent_children_households", + 26: "multi_family_households", +} + +# Years we want (columns follow pattern: year_col, cv_col, ci_col, +# repeating every 3 columns from col 2 for year 1996) +_MIN_YEAR = 2018 + + +@lru_cache(maxsize=1) +def _download_workbook() -> openpyxl.Workbook: + r = requests.get( + _URL, headers=_HEADERS, allow_redirects=True, timeout=60 + ) + r.raise_for_status() + return openpyxl.load_workbook( + io.BytesIO(r.content), data_only=True + ) + + +def _find_year_columns(ws) -> dict[int, int]: + """Map 
calendar year -> column index for Estimate columns in Table 7.""" + year_cols = {} + for col in range(2, ws.max_column + 1): + header = ws.cell(row=12, column=col).value + if header and "Estimate" in str(header): + year_str = str(header).split(" ")[0] + try: + year = int(year_str) + if year >= _MIN_YEAR: + year_cols[year] = col + except ValueError: + continue + return year_cols + + +def get_targets() -> list[Target]: + targets = [] + try: + wb = _download_workbook() + ws = wb["7"] + year_cols = _find_year_columns(ws) + + for row_num, name in _TABLE7_ROWS.items(): + values = {} + for year, col in year_cols.items(): + val = ws.cell(row=row_num, column=col).value + if val is not None and isinstance(val, (int, float)): + values[year] = float(val) * 1e3 # thousands → count + if values: + targets.append( + Target( + name=f"ons/{name}", + variable="family_type", + source="ons", + unit=Unit.COUNT, + values=values, + is_count=True, + reference_url=_REF, + ) + ) + + except Exception as e: + logger.error("Failed to download ONS households xlsx: %s", e) + + return targets diff --git a/policyengine_uk_data/targets/sources/ons_savings.py b/policyengine_uk_data/targets/sources/ons_savings.py new file mode 100644 index 00000000..5f49d8c5 --- /dev/null +++ b/policyengine_uk_data/targets/sources/ons_savings.py @@ -0,0 +1,72 @@ +"""ONS savings interest income targets. + +Downloads the HAXV timeseries from the ONS National Accounts: +D.41g — Households (S.14): Interest resources. + +SPI significantly underestimates savings income because it only +captures taxable interest, not tax-free ISAs/NS&I. 
+ +Source: https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea +""" + +import logging + +import requests + +from policyengine_uk_data.targets.schema import Target, Unit + +logger = logging.getLogger(__name__) + +_API_URL = ( + "https://www.ons.gov.uk/economy/grossdomesticproductgdp/" + "timeseries/haxv/ukea/data" +) +_REF = ( + "https://www.ons.gov.uk/economy/grossdomesticproductgdp/" + "timeseries/haxv/ukea" +) +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36" + ), +} + + +def get_targets() -> list[Target]: + try: + r = requests.get( + _API_URL, headers=_HEADERS, allow_redirects=True, timeout=30 + ) + r.raise_for_status() + data = r.json() + + values = {} + for item in data.get("years", []): + year = int(item["year"]) + if 2018 <= year <= 2029: + values[year] = float(item["value"]) * 1e6 + + # Hold flat from last actual year for projections + if values: + last_year = max(values.keys()) + last_val = values[last_year] + for y in range(last_year + 1, 2030): + values[y] = last_val + + if values: + return [ + Target( + name="ons/savings_interest_income", + variable="savings_interest_income", + source="ons", + unit=Unit.GBP, + values=values, + reference_url=_REF, + ) + ] + + except Exception as e: + logger.error("Failed to download ONS savings timeseries: %s", e) + + return [] diff --git a/policyengine_uk_data/targets/sources/ons_tenure.py b/policyengine_uk_data/targets/sources/ons_tenure.py new file mode 100644 index 00000000..841e3f4f --- /dev/null +++ b/policyengine_uk_data/targets/sources/ons_tenure.py @@ -0,0 +1,119 @@ +"""ONS subnational dwelling stock by tenure targets. + +Downloads the ONS SPREE tenure estimates to get England-level tenure +breakdowns (owned outright, owned with mortgage, private rent, social +rent) summed across all local authorities. 
"""ONS subnational dwelling stock by tenure targets.

Downloads the ONS SPREE tenure estimates to get England-level tenure
breakdowns (owned outright, owned with mortgage, private rent, social
rent) summed across all local authorities.

Source: https://www.ons.gov.uk/peoplepopulationandcommunity/housing/datasets/subnationaldwellingstockbytenureestimates
"""

import io
import logging
from functools import lru_cache

import openpyxl
import requests

from policyengine_uk_data.targets.schema import (
    GeographicLevel,
    Target,
    Unit,
)

logger = logging.getLogger(__name__)

_URL = (
    "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/"
    "housing/datasets/subnationaldwellingstockbytenureestimates/"
    "current/subnationaldwellingsbytenure2024.xlsx"
)
_REF = (
    "https://www.ons.gov.uk/peoplepopulationandcommunity/"
    "housing/datasets/subnationaldwellingstockbytenureestimates"
)
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36"
    ),
}

# Tenure categories in the xlsx header → target name suffix
_TENURE_COLS = {
    "Owned Outright": "tenure_england_owned_outright",
    "Owned with Mortgage or Loan": "tenure_england_owned_with_mortgage",
    "Private Rent": "tenure_england_rented_privately",
    "Social Rent": "tenure_england_social_rent",
    "Total Dwellings": "tenure_england_total",
}


@lru_cache(maxsize=1)
def _download_workbook() -> openpyxl.Workbook:
    """Fetch and cache the SPREE tenure workbook (values, not formulas)."""
    response = requests.get(
        _URL, headers=_HEADERS, allow_redirects=True, timeout=60
    )
    response.raise_for_status()
    return openpyxl.load_workbook(
        io.BytesIO(response.content), data_only=True
    )


def _parse_header_columns(ws) -> dict[tuple[int, str], int]:
    """Map (year, tenure_category) → column index from row 4 headers."""
    mapping: dict[tuple[int, str], int] = {}
    for col in range(5, ws.max_column + 1):
        raw_header = ws.cell(row=4, column=col).value
        if not raw_header:
            continue
        header = str(raw_header)
        # Headers look like "<year> <tenure category>"; match on the
        # first configured suffix (dict order), as the original did.
        matched = next(
            (suffix for suffix in _TENURE_COLS if header.endswith(suffix)),
            None,
        )
        if matched is not None:
            mapping[(int(header.split(" ")[0]), matched)] = col
    return mapping


def get_targets() -> list[Target]:
    """Build England-level tenure count targets.

    Sums each (year, tenure) column over all local-authority rows
    (data starts at row 5), then emits one count Target per tenure
    category with a per-year value dict. Logs and returns the partial
    result on any failure.
    """
    targets = []
    try:
        sheet = _download_workbook()["1a"]
        col_map = _parse_header_columns(sheet)

        # Sum across all local authorities for each (year, tenure).
        # NOTE(review): this sums every data row in sheet 1a — assumes
        # the sheet has no subtotal rows that would double-count; verify
        # against the workbook layout.
        totals: dict[tuple[int, str], float] = {}
        for key, col in col_map.items():
            numeric = [
                float(raw)
                for row in range(5, sheet.max_row + 1)
                if isinstance(
                    (raw := sheet.cell(row=row, column=col).value),
                    (int, float),
                )
            ]
            if numeric:
                totals[key] = sum(numeric)

        # Build one target per tenure category.
        for tenure_col, target_name in _TENURE_COLS.items():
            values = {
                year: total
                for (year, tenure), total in totals.items()
                if tenure == tenure_col
            }
            if values:
                targets.append(
                    Target(
                        name=f"ons/{target_name}",
                        variable="tenure_type",
                        source="ons",
                        unit=Unit.COUNT,
                        geographic_level=GeographicLevel.COUNTRY,
                        geo_code="E",
                        geo_name="England",
                        values=values,
                        is_count=True,
                        reference_url=_REF,
                    )
                )

    except Exception as e:
        logger.error("Failed to download ONS tenure data: %s", e)

    return targets
"""Scottish Government targets.

Scottish Child Payment spend from Scottish Budget.

Source: https://www.gov.scot/publications/scottish-budget-2026-2027/pages/6/
"""

from policyengine_uk_data.targets.schema import (
    GeographicLevel,
    Target,
    Unit,
)


def get_targets() -> list[Target]:
    """Return the Scottish Child Payment spend target (GBP, Scotland).

    2024-26 figures are taken from Scottish Budget 2026-27 Table 5.08;
    2027-29 are extrapolated at 3% per year from the 2025 figure.
    """
    # Scottish Child Payment from Scottish Budget 2026-27 Table 5.08
    scp_spend = {
        2024: 455.8e6,
        2025: 471.0e6,
        2026: 484.8e6,
    }
    # Extrapolate other years at 3% growth. The base is deliberately
    # the 2025 figure (not 2026), mirroring the previous loss-matrix
    # extrapolation so target values are unchanged.
    scp_spend.update(
        {
            year: 471.0e6 * (1.03 ** (year - 2025))
            for year in range(2027, 2030)
        }
    )

    return [
        Target(
            name="sss/scottish_child_payment",
            variable="scottish_child_payment",
            source="scottish_government",
            unit=Unit.GBP,
            geographic_level=GeographicLevel.COUNTRY,
            geo_code="S",
            geo_name="Scotland",
            values=scp_spend,
            reference_url="https://www.gov.scot/publications/scottish-budget-2026-2027/pages/6/",
        )
    ]
"""VOA council tax band targets.

Council tax band counts (A-H + total) by region from VOA stock of
properties data.

Source: https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2024
Scotland: https://www.gov.scot/publications/council-tax-datasets/
"""

import pandas as pd
from pathlib import Path

from policyengine_uk_data.targets.schema import (
    GeographicLevel,
    Target,
    Unit,
)

_STORAGE = Path(__file__).parents[2] / "storage"
_REF = "https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2024"


def get_targets() -> list[Target]:
    """Build council tax band targets from the CSV.

    Emits one 2024 count target per (region, band) plus a per-region
    total. Returns an empty list when the CSV has not been downloaded.
    Values are the raw 2024 dwelling counts — presumably uprated for
    other years downstream; verify against the loss-matrix builder.
    """
    csv_path = _STORAGE / "council_tax_bands_2024.csv"
    if not csv_path.exists():
        return []

    band_table = pd.read_csv(csv_path)

    def _band_target(region: str, label: str, count: float) -> Target:
        # One count target per (region, band-or-total) cell.
        return Target(
            name=f"voa/council_tax/{region}/{label}",
            variable="council_tax_band",
            source="voa",
            unit=Unit.COUNT,
            geographic_level=GeographicLevel.REGION,
            geo_name=region,
            values={2024: count},
            is_count=True,
            reference_url=_REF,
        )

    targets = []
    for _, row in band_table.iterrows():
        region = row["Region"]
        # Eight bands first, then the region total — same order as before.
        for band in "ABCDEFGH":
            targets.append(_band_target(region, band, float(row[band])))
        targets.append(_band_target(region, "total", float(row["Total"])))

    return targets
"""Tests for the targets registry.

Verifies that:
1. All source modules load without error
2. No duplicate target names
3. Core targets exist for key years
4. Target values match the current system's hardcoded values
"""

import pytest
from policyengine_uk_data.targets import get_all_targets, Target


def test_registry_loads():
    """All source modules should load and return targets."""
    assert len(get_all_targets()) > 0, "Registry returned no targets"


def test_no_duplicate_names():
    """Target names should be unique across all sources."""
    names = [t.name for t in get_all_targets()]
    duplicates = [n for n in names if names.count(n) > 1]
    assert len(duplicates) == 0, f"Duplicate target names: {set(duplicates)}"


def test_obr_income_tax_exists():
    """OBR income tax target should exist for 2025."""
    assert "obr/income_tax" in {t.name for t in get_all_targets(year=2025)}


def test_obr_income_tax_value():
    """OBR income tax for 2025 should be ~£329bn (Table 3.4 accrued basis)."""
    income_tax = next(
        t for t in get_all_targets(year=2025) if t.name == "obr/income_tax"
    )
    # Table 3.4 D6 = 328.96bn for FY 2025-26 → calendar 2025
    assert abs(income_tax.values[2025] - 329e9) < 1e9


def test_ons_uk_population_exists():
    """UK population target should exist."""
    assert "ons/uk_population" in {
        t.name for t in get_all_targets(year=2025)
    }


def test_hmrc_spi_targets_exist():
    """HMRC SPI income band targets should exist."""
    spi_targets = [
        t for t in get_all_targets(year=2025) if t.source == "hmrc_spi"
    ]
    # 13 bands × 6 income types × 2 (count + amount) = 156 per year
    assert len(spi_targets) >= 100, (
        f"Expected 100+ SPI targets, got {len(spi_targets)}"
    )


def test_dwp_pip_targets():
    """DWP PIP targets should exist."""
    names = {t.name for t in get_all_targets(year=2025)}
    assert "dwp/pip_dl_standard_claimants" in names
    assert "dwp/pip_dl_enhanced_claimants" in names


def test_voa_council_tax_targets():
    """VOA council tax band targets should exist."""
    voa = [t for t in get_all_targets(year=2024) if t.source == "voa"]
    # 11 regions × 9 (8 bands + total) = 99
    assert len(voa) >= 90, f"Expected 90+ VOA targets, got {len(voa)}"


def test_core_target_count():
    """Total target count should be substantial."""
    n_targets = len(get_all_targets(year=2025))
    assert n_targets >= 200, (
        f"Expected 200+ targets for 2025, got {n_targets}"
    )


def test_two_child_limit_targets():
    """Two-child limit targets should exist."""
    names = {t.name for t in get_all_targets(year=2026)}
    assert "dwp/uc/two_child_limit/households_affected" in names
    assert "dwp/uc/two_child_limit/children_affected" in names


def test_scottish_child_payment():
    """Scottish child payment should exist."""
    assert "sss/scottish_child_payment" in {
        t.name for t in get_all_targets(year=2025)
    }


def test_savings_interest():
    """ONS savings interest target should exist."""
    assert "ons/savings_interest_income" in {
        t.name for t in get_all_targets(year=2025)
    }
-This module creates target matrices comparing PolicyEngine UK model outputs -against official statistics from OBR, ONS, HMRC, DWP and other sources. -Used for calibrating household weights to match aggregate targets. +Delegates to the targets registry and build_loss_matrix module +for all target definitions and simulation column construction. """ import numpy as np import pandas as pd -from policyengine_uk_data.storage import STORAGE_FOLDER -from policyengine_uk_data.utils import uprate_values -from policyengine_uk.data import UKSingleYearDataset -from policyengine_uk_data.utils.uc_data import uc_national_payment_dist - -tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") -tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") -demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv") -demographics["name"] = demographics["name"].apply(lambda x: f"ons/{x}") -statistics = pd.concat([tax_benefit, demographics]) -dfs = [] - -MIN_YEAR = 2018 -MAX_YEAR = 2029 - -# NTS 2024 vehicle ownership targets -# https://www.gov.uk/government/statistics/national-travel-survey-2024 -NTS_NO_VEHICLE_RATE = 0.22 -NTS_ONE_VEHICLE_RATE = 0.44 -NTS_TWO_PLUS_VEHICLE_RATE = 0.34 - -for time_period in range(MIN_YEAR, MAX_YEAR + 1): - time_period_df = statistics[ - ["name", "unit", "reference", str(time_period)] - ].rename(columns={str(time_period): "value"}) - time_period_df["time_period"] = time_period - dfs.append(time_period_df) - -statistics = pd.concat(dfs) -statistics = statistics[statistics.value.notnull()] - - -def create_target_matrix( - dataset: UKSingleYearDataset, - time_period: str = None, - reform=None, -) -> np.ndarray: - """ - Create target matrix for calibration against official statistics. 
- - Creates a matrix A such that for household weights w, target vector b - and a perfectly calibrated PolicyEngine UK: A * w = b - - Compares model outputs against: - - OBR tax and benefit aggregates - - ONS demographic and regional statistics - - HMRC income distribution data - - DWP benefit caseload data - - VOA council tax statistics - - Args: - dataset: PolicyEngine UK dataset to analyse. - time_period: Year for target statistics (uses dataset default if None). - reform: Policy reform to apply during analysis. - - Returns: - Tuple of (target_matrix, target_values) for calibration. - """ - - # First- tax-benefit outcomes from the DWP and OBR. - - from policyengine_uk import Microsimulation - - if time_period is None: - time_period = dataset.time_period - - sim = Microsimulation(dataset=dataset, reform=reform) - sim.default_calculation_period = time_period - - family = sim.populations["benunit"] - - pe = lambda variable: sim.calculate(variable, map_to="household").values - - household_from_family = lambda values: sim.map_result( - values, "benunit", "household" - ) - household_from_person = lambda values: sim.map_result( - values, "person", "household" - ) - - def pe_count(*variables): - total = 0 - for variable in variables: - entity = sim.tax_benefit_system.variables[variable].entity.key - total += sim.map_result( - sim.calculate(variable) > 0, - entity, - "household", - ) - - return total - - df = pd.DataFrame() - - df["obr/attendance_allowance"] = pe("attendance_allowance") - df["obr/carers_allowance"] = pe("carers_allowance") - df["obr/dla"] = pe("dla") - df["obr/esa"] = pe("esa_income") + pe("esa_contrib") - df["obr/esa_contrib"] = pe("esa_contrib") - df["obr/esa_income"] = pe("esa_income") - df["obr/housing_benefit"] = pe("housing_benefit") - df["obr/pip"] = pe("pip") - df["obr/statutory_maternity_pay"] = pe("statutory_maternity_pay") - df["obr/attendance_allowance_count"] = pe_count("attendance_allowance") - df["obr/carers_allowance_count"] = 
pe_count("carers_allowance") - df["obr/dla_count"] = pe_count("dla") - df["obr/esa_count"] = pe_count("esa_income", "esa_contrib") - df["obr/housing_benefit_count"] = pe_count("housing_benefit") - df["obr/pension_credit_count"] = pe_count("pension_credit") - df["obr/pip_count"] = pe_count("pip") - - on_uc = sim.calculate("universal_credit") > 0 - unemployed = family.any(sim.calculate("employment_status") == "UNEMPLOYED") - - df["obr/universal_credit_jobseekers_count"] = household_from_family( - on_uc * unemployed - ) - df["obr/universal_credit_non_jobseekers_count"] = household_from_family( - on_uc * ~unemployed - ) - - # df["obr/winter_fuel_allowance_count"] = pe_count("winter_fuel_allowance") - df["obr/capital_gains_tax"] = pe("capital_gains_tax") - df["obr/child_benefit"] = pe("child_benefit") - - country = sim.calculate("country") - ct = pe("council_tax") - df["obr/council_tax"] = ct - df["obr/council_tax_england"] = ct * (country == "ENGLAND") - df["obr/council_tax_scotland"] = ct * (country == "SCOTLAND") - df["obr/council_tax_wales"] = ct * (country == "WALES") - - df["obr/domestic_rates"] = pe("domestic_rates") - df["obr/fuel_duties"] = pe("fuel_duty") - df["obr/income_tax"] = pe("income_tax") - df["obr/jobseekers_allowance"] = pe("jsa_income") + pe("jsa_contrib") - df["obr/pension_credit"] = pe("pension_credit") - df["obr/state_pension"] = pe("state_pension") - # df["obr/tax_credits"] = pe("tax_credits") - df["obr/tv_licence_fee"] = pe("tv_licence") - - uc = sim.calculate("universal_credit") - df["obr/universal_credit"] = household_from_family(uc) - df["obr/universal_credit_jobseekers"] = household_from_family( - uc * unemployed - ) - df["obr/universal_credit_non_jobseekers"] = household_from_family( - uc * ~unemployed - ) - - df["obr/vat"] = pe("vat") - # df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance") - - # Not strictly from the OBR but from the 2024 Independent Schools Council census. OBR will be using that. 
- df["obr/private_school_students"] = pe("attends_private_school") - - # Salary sacrifice NI relief - SPP estimates £4.1bn total (£1.2bn employee + £2.9bn employer) - # Calculate relief via counterfactual: what additional NI would be paid if SS became income - ss_contributions = sim.calculate( - "pension_contributions_via_salary_sacrifice" - ) - employment_income = sim.calculate("employment_income") - - # Run counterfactual simulation with SS converted to employment income - counterfactual_sim = Microsimulation(dataset=dataset, reform=reform) - counterfactual_sim.set_input( - "pension_contributions_via_salary_sacrifice", - time_period, - np.zeros_like(ss_contributions), - ) - counterfactual_sim.set_input( - "employment_income", - time_period, - employment_income + ss_contributions, - ) - - # NI relief = counterfactual NI - baseline NI - ni_employee_baseline = sim.calculate("ni_employee") - ni_employer_baseline = sim.calculate("ni_employer") - ni_employee_cf = counterfactual_sim.calculate("ni_employee", time_period) - ni_employer_cf = counterfactual_sim.calculate("ni_employer", time_period) - - employee_ni_relief = ni_employee_cf - ni_employee_baseline - employer_ni_relief = ni_employer_cf - ni_employer_baseline - - df["obr/salary_sacrifice_employee_ni_relief"] = household_from_person( - employee_ni_relief - ) - df["obr/salary_sacrifice_employer_ni_relief"] = household_from_person( - employer_ni_relief - ) - - # Population statistics from the ONS. 
- - region = sim.calculate("region", map_to="person") - region_to_target_name_map = { - "NORTH_EAST": "north_east", - "SOUTH_EAST": "south_east", - "EAST_MIDLANDS": "east_midlands", - "WEST_MIDLANDS": "west_midlands", - "YORKSHIRE": "yorkshire_and_the_humber", - "EAST_OF_ENGLAND": "east", - "LONDON": "london", - "SOUTH_WEST": "south_west", - "NORTH_WEST": "north_west", - "WALES": "wales", - "SCOTLAND": "scotland", - "NORTHERN_IRELAND": "northern_ireland", - } - age = sim.calculate("age") - - # Ensure local populations are consistent with national population - local_population_total = 0 - for pe_region_name, region_name in region_to_target_name_map.items(): - for lower_age in range(0, 90, 10): - upper_age = lower_age + 10 - name = f"ons/{region_name}_age_{lower_age}_{upper_age - 1}" - local_population_total += ( - demographics[demographics.name == name][ - str(time_period) - ].values[0] - * 1e3 - ) - - population_scaling_factor = ( - demographics[demographics.name == "ons/uk_population"][ - str(time_period) - ].values[0] - * 1e6 - / local_population_total - ) * 0.9 - - for pe_region_name, region_name in region_to_target_name_map.items(): - for lower_age in range(0, 90, 10): - upper_age = lower_age + 10 - name = f"ons/{region_name}_age_{lower_age}_{upper_age - 1}" - statistics.loc[ - (statistics.name == name) - & (statistics.time_period == int(time_period)), - "value", - ] *= population_scaling_factor - - for pe_region_name, region_name in region_to_target_name_map.items(): - for lower_age in range(0, 90, 10): - upper_age = lower_age + 10 - name = f"ons/{region_name}_age_{lower_age}_{upper_age - 1}" - person_in_criteria = ( - (region == pe_region_name) - & (age >= lower_age) - & (age < upper_age) - ) - df[name] = household_from_person(person_in_criteria) - - df["ons/uk_population"] = household_from_person(age >= 0) - - # Scotland-specific calibration targets - # Children under 16 in Scotland - # Source: NRS mid-year population estimates - # 
https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/population/population-estimates/mid-year-population-estimates - scotland_children_under_16 = (region.values == "SCOTLAND") & (age < 16) - df["ons/scotland_children_under_16"] = household_from_person( - scotland_children_under_16 - ) - - # Babies under 1 in Scotland - # Source: NRS Vital Events - births registered in Scotland - # https://www.nrscotland.gov.uk/publications/vital-events-reference-tables-2024/ - # ~46,000 births per year (45,763 in 2024) - scotland_babies_under_1 = (region.values == "SCOTLAND") & (age < 1) - df["ons/scotland_babies_under_1"] = household_from_person( - scotland_babies_under_1 - ) - - # Households with 3+ children in Scotland - # Source: Scotland Census 2022 - Household composition - # https://www.scotlandscensus.gov.uk/census-results/at-a-glance/household-composition/ - # Count children per household, filter to Scotland households with 3+ - is_child = sim.calculate("is_child").values - children_per_household = household_from_person(is_child) - household_region = sim.calculate("region", map_to="household").values - scotland_3plus_children = (household_region == "SCOTLAND") & ( - children_per_household >= 3 - ) - df["ons/scotland_households_3plus_children"] = ( - scotland_3plus_children.astype(float) - ) - - targets = ( - statistics[statistics.time_period == int(time_period)] - .set_index("name") - .loc[df.columns] - ) - - targets.value = np.select( - [ - targets.unit == "gbp-bn", - targets.unit == "person-m", - targets.unit == "person-k", - targets.unit == "benefit-unit-m", - targets.unit == "household-k", - ], - [ - targets.value * 1e9, - targets.value * 1e6, - targets.value * 1e3, - targets.value * 1e6, - targets.value * 1e3, - ], - ) - - # Finally, incomes from HMRC - - target_names = [] - target_values = [] - # Note: savings_interest_income is excluded because SPI significantly - # underestimates it. 
Savings income is calibrated from ONS National - # Accounts D.41g household interest data separately below. - INCOME_VARIABLES = [ - "employment_income", - "self_employment_income", - "state_pension", - "private_pension_income", - "property_income", - "dividend_income", - ] - - income_df = sim.calculate_dataframe(["total_income"] + INCOME_VARIABLES) - - incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv") - incomes = incomes[incomes.year.astype(str) == str(time_period)] - for i, row in incomes.iterrows(): - lower = row.total_income_lower_bound - upper = row.total_income_upper_bound - in_income_band = (income_df.total_income >= lower) & ( - income_df.total_income < upper - ) - for variable in INCOME_VARIABLES: - name_amount = ( - "hmrc/" - + variable - + f"_income_band_{i}_{lower:_.0f}_to_{upper:_.0f}" - ) - df[name_amount] = household_from_person( - income_df[variable] * in_income_band - ) - target_values.append(row[variable + "_amount"]) - target_names.append(name_amount) - name_count = ( - "hmrc/" - + variable - + f"_count_income_band_{i}_{lower:_.0f}_to_{upper:_.0f}" - ) - df[name_count] = household_from_person( - (income_df[variable] > 0) * in_income_band - ) - target_values.append(row[variable + "_count"]) - target_names.append(name_count) - - # Savings interest income from ONS National Accounts D.41 - # Source: ONS HAXV - Households (S.14): Interest (D.41) Resources - # https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea - # SPI significantly underestimates savings income (~£3bn vs £43-98bn actual) - # because it only captures taxable interest, not tax-free ISAs/NS&I - ONS_SAVINGS_INCOME = { - 2020: 16.0e9, - 2021: 19.6e9, - 2022: 43.3e9, - 2023: 86.0e9, - 2024: 98.2e9, - 2025: 98.2e9, # Projected (held flat) - 2026: 98.2e9, - 2027: 98.2e9, - 2028: 98.2e9, - 2029: 98.2e9, - } - savings_income = sim.calculate("savings_interest_income") - df["ons/savings_interest_income"] = household_from_person(savings_income) - 
target_names.append("ons/savings_interest_income") - target_values.append(ONS_SAVINGS_INCOME.get(int(time_period), 55.0e9)) - - # HMRC Table 6.2 - Salary sacrifice income tax relief by tax rate - # This helps calibrate the distribution of SS users by income level - # 2023-24 values (£m): Basic £1,600, Higher £4,400, Additional £1,200 - # Total IT relief from SS: £7,200m - # Use true counterfactual: IT relief = counterfactual IT - baseline IT - income_tax_baseline = sim.calculate("income_tax") - income_tax_cf = counterfactual_sim.calculate("income_tax", time_period) - it_relief = income_tax_cf - income_tax_baseline - - # Get tax band from counterfactual adjusted net income (where SS is wages) - adjusted_net_income_cf = counterfactual_sim.calculate( - "adjusted_net_income", time_period - ) - basic_rate_threshold = ( - sim.tax_benefit_system.parameters.gov.hmrc.income_tax.rates.uk[ - 0 - ].threshold(time_period) - ) - higher_rate_threshold = ( - sim.tax_benefit_system.parameters.gov.hmrc.income_tax.rates.uk[ - 1 - ].threshold(time_period) - ) - additional_rate_threshold = ( - sim.tax_benefit_system.parameters.gov.hmrc.income_tax.rates.uk[ - 2 - ].threshold(time_period) - ) - - # Determine tax band for each person based on counterfactual income - is_basic_rate = (adjusted_net_income_cf > basic_rate_threshold) & ( - adjusted_net_income_cf <= higher_rate_threshold - ) - is_higher_rate = (adjusted_net_income_cf > higher_rate_threshold) & ( - adjusted_net_income_cf <= additional_rate_threshold - ) - is_additional_rate = adjusted_net_income_cf > additional_rate_threshold - - # Allocate the true IT relief to tax bands - ss_it_relief_basic = it_relief * is_basic_rate - ss_it_relief_higher = it_relief * is_higher_rate - ss_it_relief_additional = it_relief * is_additional_rate - - df["hmrc/salary_sacrifice_it_relief_basic"] = household_from_person( - ss_it_relief_basic - ) - df["hmrc/salary_sacrifice_it_relief_higher"] = household_from_person( - ss_it_relief_higher - ) - 
df["hmrc/salary_sacrifice_it_relief_additional"] = household_from_person( - ss_it_relief_additional - ) - - # Total gross salary sacrifice contributions - # This is derived from the IT relief: £7.2bn IT relief at ~30% avg rate - # implies ~£24bn gross contributions (but we target the relief directly) - df["hmrc/salary_sacrifice_contributions"] = household_from_person( - ss_contributions - ) - - # HMRC Table 6.2 - Salary sacrifice income tax relief by tax rate (2023-24) - # https://assets.publishing.service.gov.uk/media/687a294e312ee8a5f0806b6d/Tables_6_1_and_6_2.csv - # Values in £bn - SS_IT_RELIEF_BASIC_2024 = 1.6e9 - SS_IT_RELIEF_HIGHER_2024 = 4.4e9 - SS_IT_RELIEF_ADDITIONAL_2024 = 1.2e9 - SS_CONTRIBUTIONS_2024 = 24e9 # £7.2bn IT relief / 0.30 avg rate - - # Uprate by ~3% per year for wage growth - years_from_2024 = max(0, int(time_period) - 2024) - uprating_factor = 1.03**years_from_2024 - - target_names.append("hmrc/salary_sacrifice_it_relief_basic") - target_values.append(SS_IT_RELIEF_BASIC_2024 * uprating_factor) - - target_names.append("hmrc/salary_sacrifice_it_relief_higher") - target_values.append(SS_IT_RELIEF_HIGHER_2024 * uprating_factor) - - target_names.append("hmrc/salary_sacrifice_it_relief_additional") - target_values.append(SS_IT_RELIEF_ADDITIONAL_2024 * uprating_factor) - - target_names.append("hmrc/salary_sacrifice_contributions") - target_values.append(SS_CONTRIBUTIONS_2024 * uprating_factor) - - # Add two-child limit targets. 
- child_is_affected = ( - sim.map_result( - sim.calculate("uc_is_child_limit_affected", map_to="household"), - "household", - "person", - ) - > 0 - ) * sim.calculate("is_child", map_to="person").values - child_in_uc_household = ( - sim.calculate("universal_credit", map_to="person").values > 0 - ) - children_in_capped_households = sim.map_result( - child_is_affected * child_in_uc_household, "person", "household" - ) - capped_households = (children_in_capped_households > 0) * 1.0 - df["dwp/uc_two_child_limit_affected_child_count"] = ( - children_in_capped_households - ) - target_names.append("dwp/uc_two_child_limit_affected_child_count") - UPRATING_24_25 = 1.12 # https://ifs.org.uk/articles/two-child-limit-poverty-incentives-and-cost, table at the end - target_values.append(1.6e6 * UPRATING_24_25) # DWP statistics for 2024/25 - # https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-two-children-april-2024 - df["dwp/uc_two_child_limit_affected_household_count"] = capped_households - target_names.append("dwp/uc_two_child_limit_affected_household_count") - target_values.append(440e3 * UPRATING_24_25) # DWP statistics for 2024/25 - - # PIP daily living standard and enhanced claimant counts - # https://www.disabilityrightsuk.org/news/90-pip-standard-daily-living-component-recipients-would-fail-new-green-paper-test?srsltid=AfmBOoqSq3cQwtZnQBe-qLN7PT1mUBVtZ0ZINYtoG5bG5O9_ObQ90Y0n - - pip_dl_category = sim.calculate("pip_dl_category") - on_standard = sim.map_result( - pip_dl_category == "STANDARD", "person", "household" - ) - on_enhanced = sim.map_result( - pip_dl_category == "ENHANCED", "person", "household" - ) - - df["dwp/pip_dl_standard_claimants"] = on_standard - target_names.append("dwp/pip_dl_standard_claimants") - 
target_values.append(1_283_000) - - df["dwp/pip_dl_enhanced_claimants"] = on_enhanced - target_names.append("dwp/pip_dl_enhanced_claimants") - target_values.append(1_608_000) - - # Scottish Child Payment total spend - # Source: Scottish Budget 2026-27, Table 5.08 - # https://www.gov.scot/publications/scottish-budget-2026-2027/pages/6/ - scp = sim.calculate("scottish_child_payment") - df["sss/scottish_child_payment"] = household_from_person(scp) - SCP_SPEND = { - 2024: 455.8e6, - 2025: 471.0e6, - 2026: 484.8e6, - } - # Extrapolate for other years using 3% annual growth - scp_target = SCP_SPEND.get( - int(time_period), 471.0e6 * (1.03 ** (int(time_period) - 2025)) - ) - target_names.append("sss/scottish_child_payment") - target_values.append(scp_target) - - # UC households in Scotland with child under 1 - # Source: DWP Stat-Xplore, UC Households dataset, November 2023 - # https://stat-xplore.dwp.gov.uk/ - # Filters: Scotland, Age of Youngest Child = 0 - # ~14,000 households (13,992 in November 2023) - uc_amount = sim.calculate("universal_credit") - on_uc_family = uc_amount > 0 - on_uc_household = household_from_family(on_uc_family) > 0 - - child_under_1 = is_child & (age < 1) - has_child_under_1 = household_from_person(child_under_1) > 0 - - scotland_uc_child_under_1 = ( - (household_region == "SCOTLAND") & on_uc_household & has_child_under_1 - ) - df["dwp/scotland_uc_households_child_under_1"] = ( - scotland_uc_child_under_1.astype(float) - ) - target_names.append("dwp/scotland_uc_households_child_under_1") - target_values.append(14_000) # 13,992 rounded, November 2023 - - # Council Tax band counts - - ct_data = pd.read_csv(STORAGE_FOLDER / "council_tax_bands_2024.csv") - uk_population = ( - sim.tax_benefit_system.parameters.gov.economic_assumptions.indices.ons.population - ) - uprating = uk_population(time_period) / uk_population(2024) - - # England and Wales data from https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2024 - - # Scotland data 
from https://www.gov.scot/publications/council-tax-datasets/ (Number of chargeable dwellings, 2024) - - for i, row in ct_data.iterrows(): - selected_region = row["Region"] - in_region = sim.calculate("region").values == selected_region - for band in ["A", "B", "C", "D", "E", "F", "G", "H"]: - name = f"voa/council_tax/{selected_region}/{band}" - in_band = sim.calculate("council_tax_band") == band - df[name] = (in_band * in_region).astype(float) - target_names.append(name) - target_values.append(float(row[band]) * uprating) - # Add total row - name = f"voa/council_tax/{selected_region}/total" - df[name] = (in_region).astype(float) - target_names.append(name) - target_values.append(float(row["Total"]) * uprating) - - # Benefit cap counts - - benefit_cap_reduction = sim.calculate( - "benefit_cap_reduction", map_to="household" - ).values - df["dwp/benefit_capped_households"] = (benefit_cap_reduction > 0).astype( - float - ) - target_names.append("dwp/benefit_capped_households") - target_values.append( - 115_000 - ) # https://www.gov.uk/government/statistics/benefit-cap-number-of-households-capped-to-february-2025/benefit-cap-number-of-households-capped-to-february-2025 - - df["dwp/benefit_cap_total_reduction"] = benefit_cap_reduction.astype(float) - target_names.append("dwp/benefit_cap_total_reduction") - target_values.append( - 60 * 52 * 115_000 - ) # same source as above, multiply avg cap amount by total capped population - - # UC national payment distribution - - uc_payment_dist = uc_national_payment_dist - uc_payments = sim.calculate("universal_credit", map_to="benunit").values - uc_family_type = sim.calculate("family_type", map_to="benunit").values - - for i, row in uc_payment_dist.iterrows(): - lower = row.uc_annual_payment_min - upper = row.uc_annual_payment_max - family_type = row.family_type - in_band = ( - (uc_payments >= lower) - & (uc_payments < upper) - & (uc_family_type == family_type) - ) - name = 
f"dwp/uc_payment_dist/{family_type}_annual_payment_{lower:_.0f}_to_{upper:_.0f}" - df[name] = household_from_family(in_band) - target_names.append(name) - target_values.append(row.household_count) - - # Vehicle ownership calibration targets - # NTS 2024: 22% no car, 44% one car, 34% two+ cars - # https://www.gov.uk/government/statistics/national-travel-survey-2024 - # Total households (~29.6m) from council tax data (consistent with other calibration) - total_households = ct_data["Total"].sum() * uprating - num_vehicles = pe("num_vehicles") - - df["nts/households_no_vehicle"] = (num_vehicles == 0).astype(float) - target_names.append("nts/households_no_vehicle") - target_values.append(total_households * NTS_NO_VEHICLE_RATE) - - df["nts/households_one_vehicle"] = (num_vehicles == 1).astype(float) - target_names.append("nts/households_one_vehicle") - target_values.append(total_households * NTS_ONE_VEHICLE_RATE) - - df["nts/households_two_plus_vehicles"] = (num_vehicles >= 2).astype(float) - target_names.append("nts/households_two_plus_vehicles") - target_values.append(total_households * NTS_TWO_PLUS_VEHICLE_RATE) - - RENT_ESTIMATE = { - "private_renter": 1_400 - * 12 - * 4.7e6, # https://www.ons.gov.uk/economy/inflationandpriceindices/bulletins/privaterentandhousepricesuk/january2025 - "owner_mortgage": 1_100 * 12 * 7.5e6, - } - - # Housing affordability targets - # Total mortgage payments (capital + interest) - mortgage_capital = pe("mortgage_capital_repayment") - mortgage_interest = pe("mortgage_interest_repayment") - total_mortgage = mortgage_capital + mortgage_interest - df["housing/total_mortgage"] = total_mortgage - target_names.append("housing/total_mortgage") - target_values.append(RENT_ESTIMATE["owner_mortgage"]) - - # Total rent by tenure type - rent = pe("rent") - tenure_type = sim.calculate("tenure_type", map_to="household").values - - df["housing/rent_private"] = rent * (tenure_type == "RENT_PRIVATELY") - target_names.append("housing/rent_private") - 
target_values.append(RENT_ESTIMATE["private_renter"]) - - combined_targets = pd.concat( - [ - targets, - pd.DataFrame( - { - "value": target_values, - }, - index=target_names, - ), - ] - ) - - combined_targets.to_csv("test.csv") - - return df, combined_targets.value +from policyengine_uk_data.targets.build_loss_matrix import ( + create_target_matrix, +) def get_loss_results( dataset, time_period, reform=None, household_weights=None ): - """ - Calculate loss metrics comparing model outputs to targets. + """Calculate loss metrics comparing model outputs to targets. Args: dataset: PolicyEngine UK dataset to evaluate. - time_period: Year for comparison. - reform: Policy reform to apply. - household_weights: Custom weights (uses dataset weights if None). + time_period: year for comparison. + reform: policy reform to apply. + household_weights: custom weights (uses dataset weights if None). Returns: DataFrame with estimate vs target comparisons and error metrics. diff --git a/pyproject.toml b/pyproject.toml index 55255e38..9ce780d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "odfpy", "pandas", "openpyxl", + "pydantic>=2.0", + "pyyaml", ] [project.optional-dependencies] From 482f87d47cffe424b86f28a6a2818cf0561eb9c6 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Sun, 15 Feb 2026 13:42:20 +0000 Subject: [PATCH 2/6] Fix lint errors and add changelog entry Co-Authored-By: Claude Opus 4 --- changelog_entry.yaml | 4 + .../datasets/create_datasets.py | 11 +- .../local_areas/constituencies/calibrate.py | 16 +- .../local_authorities/calibrate.py | 22 +- .../targets/build_loss_matrix.py | 211 +++++------------- policyengine_uk_data/targets/registry.py | 8 +- policyengine_uk_data/targets/sources/dwp.py | 83 ++++--- .../targets/sources/hmrc_salary_sacrifice.py | 11 +- .../targets/sources/hmrc_spi.py | 28 +-- .../targets/sources/local_age.py | 1 - .../targets/sources/local_la_extras.py | 48 +++- policyengine_uk_data/targets/sources/obr.py 
| 31 +-- .../targets/sources/ons_demographics.py | 12 +- .../targets/sources/ons_households.py | 11 +- .../targets/sources/ons_savings.py | 15 +- .../targets/sources/ons_tenure.py | 11 +- policyengine_uk_data/utils/loss.py | 5 +- 17 files changed, 192 insertions(+), 336 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..5a854246 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + changed: + - Replaced ad-hoc calibration targets with structured registry and source modules. diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index ed969d07..ded6210a 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -2,7 +2,6 @@ from policyengine_uk_data.storage import STORAGE_FOLDER import logging import os -from policyengine_uk.data import UKSingleYearDataset from policyengine_uk_data.utils.uprating import uprate_dataset from policyengine_uk_data.utils.progress import ( ProcessingProgress, @@ -44,7 +43,6 @@ def main(): update_dataset, nested_progress, ): - # Create base FRS dataset update_dataset("Create base FRS dataset", "processing") frs = create_frs( @@ -107,9 +105,6 @@ def main(): update_dataset("Uprate to 2025", "completed") # Calibrate constituency weights with nested progress - from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import ( - calibrate, - ) update_dataset("Calibrate constituency weights", "processing") @@ -151,7 +146,7 @@ def main(): ) # Run calibration with verbose progress - frs_calibrated_las = calibrate_local_areas( + calibrate_local_areas( dataset=frs, epochs=epochs, matrix_fn=create_local_authority_target_matrix, @@ -170,9 +165,7 @@ def main(): # Downrate and save update_dataset("Downrate to 2023", "processing") - frs_calibrated = uprate_dataset( - frs_calibrated_constituencies, 2023 - ) + frs_calibrated = 
uprate_dataset(frs_calibrated_constituencies, 2023) update_dataset("Downrate to 2023", "completed") update_dataset("Save final dataset", "processing") diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py b/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py index 6ea99677..24aa3c30 100644 --- a/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py @@ -73,9 +73,9 @@ def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): constituency_target_validation["estimate"] - constituency_target_validation["target"] ) - constituency_target_validation["abs_error"] = ( - constituency_target_validation["error"].abs() - ) + constituency_target_validation["abs_error"] = constituency_target_validation[ + "error" + ].abs() constituency_target_validation["rel_abs_error"] = ( constituency_target_validation["abs_error"] / constituency_target_validation["target"] @@ -91,15 +91,11 @@ def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): national_target_validation["target"] = national_actuals.values national_target_validation["error"] = ( - national_target_validation["estimate"] - - national_target_validation["target"] + national_target_validation["estimate"] - national_target_validation["target"] ) - national_target_validation["abs_error"] = national_target_validation[ - "error" - ].abs() + national_target_validation["abs_error"] = national_target_validation["error"].abs() national_target_validation["rel_abs_error"] = ( - national_target_validation["abs_error"] - / national_target_validation["target"] + national_target_validation["abs_error"] / national_target_validation["target"] ) df = pd.concat( diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py index 588f2955..746d94e7 100644 --- 
a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py @@ -18,12 +18,8 @@ def calibrate( ): return calibrate_local_areas( dataset=dataset, - matrix_fn=lambda ds: create_local_authority_target_matrix( - ds, ds.time_period - ), - national_matrix_fn=lambda ds: create_national_target_matrix( - ds, ds.time_period - ), + matrix_fn=lambda ds: create_local_authority_target_matrix(ds, ds.time_period), + national_matrix_fn=lambda ds: create_national_target_matrix(ds, ds.time_period), area_count=360, weight_file="local_authority_weights.h5", excluded_training_targets=excluded_training_targets, @@ -37,9 +33,7 @@ def calibrate( def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): la_target_matrix, la_actuals = m_c, y_c national_target_matrix, national_actuals = m_n, y_n - local_authorities = pd.read_csv( - STORAGE_FOLDER / "local_authorities_2021.csv" - ) + local_authorities = pd.read_csv(STORAGE_FOLDER / "local_authorities_2021.csv") la_wide = weights @ la_target_matrix la_wide.index = local_authorities.code.values la_wide["name"] = local_authorities.name.values @@ -93,15 +87,11 @@ def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): national_target_validation["target"] = national_actuals.values national_target_validation["error"] = ( - national_target_validation["estimate"] - - national_target_validation["target"] + national_target_validation["estimate"] - national_target_validation["target"] ) - national_target_validation["abs_error"] = national_target_validation[ - "error" - ].abs() + national_target_validation["abs_error"] = national_target_validation["error"].abs() national_target_validation["rel_abs_error"] = ( - national_target_validation["abs_error"] - / national_target_validation["target"] + national_target_validation["abs_error"] / national_target_validation["target"] ) df = pd.concat( diff --git 
a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py index 6b366594..e76ff50c 100644 --- a/policyengine_uk_data/targets/build_loss_matrix.py +++ b/policyengine_uk_data/targets/build_loss_matrix.py @@ -114,9 +114,7 @@ def pe(self, variable: str): """Calculate variable mapped to household level.""" key = ("pe", variable) if key not in self._cache: - self._cache[key] = self.sim.calculate( - variable, map_to="household" - ).values + self._cache[key] = self.sim.calculate(variable, map_to="household").values return self._cache[key] def pe_person(self, variable: str): @@ -147,9 +145,7 @@ def household_from_family(self, values): @property def region(self): if "region" not in self._cache: - self._cache["region"] = self.sim.calculate( - "region", map_to="person" - ) + self._cache["region"] = self.sim.calculate("region", map_to="person") return self._cache["region"] @property @@ -178,13 +174,9 @@ def counterfactual_sim(self): if "counterfactual_sim" not in self._cache: from policyengine_uk import Microsimulation - ss = self.sim.calculate( - "pension_contributions_via_salary_sacrifice" - ) + ss = self.sim.calculate("pension_contributions_via_salary_sacrifice") emp = self.sim.calculate("employment_income") - cf_sim = Microsimulation( - dataset=self.dataset, reform=self.reform - ) + cf_sim = Microsimulation(dataset=self.dataset, reform=self.reform) cf_sim.set_input( "pension_contributions_via_salary_sacrifice", self.time_period, @@ -221,9 +213,8 @@ def counterfactual_sim(self): # ── Column computation dispatch ────────────────────────────────────── -def _compute_column( - target: Target, ctx: _SimContext, year: int -) -> np.ndarray | None: + +def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray | None: """Compute the household-level column for a target. Returns None if the target can't be computed (e.g. 
missing @@ -264,9 +255,9 @@ def _compute_column( if name == "ons/scotland_households_3plus_children": is_child = ctx.pe_person("is_child") children_per_hh = ctx.household_from_person(is_child) - return ( - (ctx.household_region == "SCOTLAND") & (children_per_hh >= 3) - ).astype(float) + return ((ctx.household_region == "SCOTLAND") & (children_per_hh >= 3)).astype( + float + ) # ── Household type targets ──────────────────────────────────── if target.variable == "family_type" and target.is_count: @@ -294,9 +285,8 @@ def _compute_column( # ── Housing targets ─────────────────────────────────────────── if name == "housing/total_mortgage": - return ( - ctx.pe("mortgage_capital_repayment") - + ctx.pe("mortgage_interest_repayment") + return ctx.pe("mortgage_capital_repayment") + ctx.pe( + "mortgage_interest_repayment" ) if name == "housing/rent_private": tenure = ctx.sim.calculate("tenure_type", map_to="household").values @@ -315,14 +305,10 @@ def _compute_column( # ── DWP PIP claimant splits ─────────────────────────────────── if name == "dwp/pip_dl_standard_claimants": pip_dl = ctx.sim.calculate("pip_dl_category") - return ctx.sim.map_result( - pip_dl == "STANDARD", "person", "household" - ) + return ctx.sim.map_result(pip_dl == "STANDARD", "person", "household") if name == "dwp/pip_dl_enhanced_claimants": pip_dl = ctx.sim.calculate("pip_dl_category") - return ctx.sim.map_result( - pip_dl == "ENHANCED", "person", "household" - ) + return ctx.sim.map_result(pip_dl == "ENHANCED", "person", "household") # ── DWP benefit cap ─────────────────────────────────────────── if name == "dwp/benefit_capped_households": @@ -341,9 +327,9 @@ def _compute_column( on_uc = ctx.household_from_family(uc > 0) > 0 child_u1 = ctx.pe_person("is_child") & (ctx.age < 1) has_child_u1 = ctx.household_from_person(child_u1) > 0 - return ( - (ctx.household_region == "SCOTLAND") & on_uc & has_child_u1 - ).astype(float) + return ((ctx.household_region == "SCOTLAND") & on_uc & has_child_u1).astype( + 
float + ) # ── UC claimants by number of children ───────────────────────── if name.startswith("dwp/uc/claimants_with_") and "_children" in name: @@ -369,18 +355,14 @@ def _compute_column( "obr/salary_sacrifice_employee_ni_relief", ): ni_base = ctx.sim.calculate("ni_employee") - ni_cf = ctx.counterfactual_sim.calculate( - "ni_employee", ctx.time_period - ) + ni_cf = ctx.counterfactual_sim.calculate("ni_employee", ctx.time_period) return ctx.household_from_person(ni_cf - ni_base) if name in ( "hmrc/salary_sacrifice_employer_nics_relief", "obr/salary_sacrifice_employer_ni_relief", ): ni_base = ctx.sim.calculate("ni_employer") - ni_cf = ctx.counterfactual_sim.calculate( - "ni_employer", ctx.time_period - ) + ni_cf = ctx.counterfactual_sim.calculate("ni_employer", ctx.time_period) return ctx.household_from_person(ni_cf - ni_base) # ── UC jobseeker / non-jobseeker splits ─────────────────────── @@ -424,6 +406,7 @@ def _compute_column( # ── Compute implementations ────────────────────────────────────────── + def _compute_simple_gbp(target: Target, ctx: _SimContext) -> np.ndarray: """Sum a variable at household level.""" variable = target.variable @@ -445,16 +428,14 @@ def _compute_simple_count(target: Target, ctx: _SimContext) -> np.ndarray: return ctx.pe_count(target.variable) -def _compute_regional_age( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_regional_age(target: Target, ctx: _SimContext) -> np.ndarray: """Compute person count in a region × age band.""" # Parse "ons/{region_name}_age_{lower}_{upper}" from the name name = target.name.removeprefix("ons/") # Find the _age_ part idx = name.index("_age_") region_name = name[:idx] - age_part = name[idx + 5:] # e.g. "0_9" + age_part = name[idx + 5 :] # e.g. 
"0_9" lower, upper = age_part.split("_") lower, upper = int(lower), int(upper) @@ -463,16 +444,12 @@ def _compute_regional_age( return None person_match = ( - (ctx.region.values == pe_region) - & (ctx.age >= lower) - & (ctx.age <= upper) + (ctx.region.values == pe_region) & (ctx.age >= lower) & (ctx.age <= upper) ) return ctx.household_from_person(person_match) -def _compute_gender_age( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_gender_age(target: Target, ctx: _SimContext) -> np.ndarray: """Compute person count in a gender × age band.""" name = target.name.removeprefix("ons/") # "female_0_14" or "male_75_90" @@ -487,9 +464,7 @@ def _compute_gender_age( return ctx.household_from_person(sex_match & age_match) -def _compute_household_type( - target: Target, ctx: _SimContext -) -> np.ndarray | None: +def _compute_household_type(target: Target, ctx: _SimContext) -> np.ndarray | None: """Compute household type count from ONS families & households categories. Maps ONS household categories to PE family_type enum values and @@ -507,26 +482,18 @@ def ft_hh(value): return ctx.household_from_family(ft == value) > 0 if name == "lone_households_under_65": - return ( - ft_hh("SINGLE") - & (children_per_hh == 0) - & (age_hh_head < 65) - ).astype(float) + return (ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head < 65)).astype( + float + ) if name == "lone_households_over_65": - return ( - ft_hh("SINGLE") - & (children_per_hh == 0) - & (age_hh_head >= 65) - ).astype(float) + return (ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head >= 65)).astype( + float + ) if name == "unrelated_adult_households": - people_per_hh = ctx.household_from_person( - np.ones_like(is_child) + people_per_hh = ctx.household_from_person(np.ones_like(is_child)) + return (ft_hh("SINGLE") & (children_per_hh == 0) & (people_per_hh > 1)).astype( + float ) - return ( - ft_hh("SINGLE") - & (children_per_hh == 0) - & (people_per_hh > 1) - ).astype(float) if name == 
"couple_no_children_households": return ft_hh("COUPLE_NO_CHILDREN").astype(float) if name == "couple_under_3_children_households": @@ -536,27 +503,14 @@ def ft_hh(value): & (children_per_hh <= 2) ).astype(float) if name == "couple_3_plus_children_households": - return ( - ft_hh("COUPLE_WITH_CHILDREN") - & (children_per_hh >= 3) - ).astype(float) + return (ft_hh("COUPLE_WITH_CHILDREN") & (children_per_hh >= 3)).astype(float) if name == "couple_non_dependent_children_only_households": - people_per_hh = ctx.household_from_person( - np.ones_like(is_child) - ) - return ( - ft_hh("COUPLE_NO_CHILDREN") - & (people_per_hh > 2) - ).astype(float) + people_per_hh = ctx.household_from_person(np.ones_like(is_child)) + return (ft_hh("COUPLE_NO_CHILDREN") & (people_per_hh > 2)).astype(float) if name == "lone_parent_dependent_children_households": - return ( - ft_hh("LONE_PARENT") - & (children_per_hh > 0) - ).astype(float) + return (ft_hh("LONE_PARENT") & (children_per_hh > 0)).astype(float) if name == "lone_parent_non_dependent_children_households": - people_per_hh = ctx.household_from_person( - np.ones_like(is_child) - ) + people_per_hh = ctx.household_from_person(np.ones_like(is_child)) return ( ft_hh("SINGLE") & (children_per_hh == 0) @@ -570,9 +524,7 @@ def ft_hh(value): return None -def _compute_tenure( - target: Target, ctx: _SimContext -) -> np.ndarray | None: +def _compute_tenure(target: Target, ctx: _SimContext) -> np.ndarray | None: """Compute dwelling count by tenure type.""" # Map ONS target name suffixes to PE tenure_type enum values _TENURE_MAP = { @@ -601,34 +553,22 @@ def _compute_tenure( return (match & in_england).astype(float) -def _compute_income_band( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_income_band(target: Target, ctx: _SimContext) -> np.ndarray: """Compute income variable within a total income band.""" variable = target.variable lower = target.lower_bound upper = target.upper_bound - income_df = ctx.sim.calculate_dataframe( - 
["total_income", variable] - ) - in_band = (income_df.total_income >= lower) & ( - income_df.total_income < upper - ) + income_df = ctx.sim.calculate_dataframe(["total_income", variable]) + in_band = (income_df.total_income >= lower) & (income_df.total_income < upper) if target.is_count: - return ctx.household_from_person( - (income_df[variable] > 0) * in_band - ) + return ctx.household_from_person((income_df[variable] > 0) * in_band) else: - return ctx.household_from_person( - income_df[variable] * in_band - ) + return ctx.household_from_person(income_df[variable] * in_band) -def _compute_council_tax_band( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_council_tax_band(target: Target, ctx: _SimContext) -> np.ndarray: """Compute council tax band count for a region.""" # "voa/council_tax/{REGION}/{band}" parts = target.name.split("/") @@ -644,9 +584,7 @@ def _compute_council_tax_band( return (in_band * in_region).astype(float) -def _compute_obr_council_tax( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_obr_council_tax(target: Target, ctx: _SimContext) -> np.ndarray: """Compute OBR council tax receipts, optionally by country.""" name = target.name ct = ctx.pe("council_tax") @@ -662,16 +600,12 @@ def _compute_obr_council_tax( return ct -def _compute_uc_jobseeker( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_uc_jobseeker(target: Target, ctx: _SimContext) -> np.ndarray: """Compute UC jobseeker / non-jobseeker splits.""" family = ctx.sim.populations["benunit"] uc = ctx.sim.calculate("universal_credit") on_uc = uc > 0 - unemployed = family.any( - ctx.sim.calculate("employment_status") == "UNEMPLOYED" - ) + unemployed = family.any(ctx.sim.calculate("employment_status") == "UNEMPLOYED") if "non_jobseekers" in target.name: mask = on_uc * ~unemployed @@ -684,37 +618,26 @@ def _compute_uc_jobseeker( return ctx.household_from_family(uc * mask) -def _compute_uc_payment_dist( - target: Target, ctx: _SimContext -) -> 
np.ndarray: +def _compute_uc_payment_dist(target: Target, ctx: _SimContext) -> np.ndarray: """Compute UC payment distribution band × family type.""" # Parse from name: "dwp/uc_payment_dist/{family_type}_annual_payment_{lower}_to_{upper}" name = target.name.removeprefix("dwp/uc_payment_dist/") # Find the _annual_payment_ separator idx = name.index("_annual_payment_") family_type = name[:idx] - payment_part = name[idx + 16:] # e.g. "0_to_1_000" lower = target.lower_bound upper = target.upper_bound - uc_payments = ctx.sim.calculate( - "universal_credit", map_to="benunit" - ).values - uc_family_type = ctx.sim.calculate( - "family_type", map_to="benunit" - ).values + uc_payments = ctx.sim.calculate("universal_credit", map_to="benunit").values + uc_family_type = ctx.sim.calculate("family_type", map_to="benunit").values in_band = ( - (uc_payments >= lower) - & (uc_payments < upper) - & (uc_family_type == family_type) + (uc_payments >= lower) & (uc_payments < upper) & (uc_family_type == family_type) ) return ctx.household_from_family(in_band) -def _compute_ss_it_relief( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_ss_it_relief(target: Target, ctx: _SimContext) -> np.ndarray: """Compute salary sacrifice IT relief by tax band.""" it_base = ctx.sim.calculate("income_tax") it_cf = ctx.counterfactual_sim.calculate("income_tax", ctx.time_period) @@ -731,9 +654,7 @@ def _compute_ss_it_relief( name = target.name if "basic" in name: - mask = (adj_net_income_cf > basic_thresh) & ( - adj_net_income_cf <= higher_thresh - ) + mask = (adj_net_income_cf > basic_thresh) & (adj_net_income_cf <= higher_thresh) elif "higher" in name: mask = (adj_net_income_cf > higher_thresh) & ( adj_net_income_cf <= additional_thresh @@ -747,9 +668,7 @@ def _compute_ss_it_relief( return ctx.household_from_person(it_relief * mask) -def _compute_two_child_limit( - target: Target, ctx: _SimContext -) -> np.ndarray | None: +def _compute_two_child_limit(target: Target, ctx: _SimContext) -> 
np.ndarray | None: """Compute two-child limit targets. These involve cross-tabulations of UC eligibility, child count, @@ -780,18 +699,14 @@ def _compute_two_child_limit( return children_in_capped if name == "dwp/uc/two_child_limit/children_in_affected_households": # Total children (not just affected ones) in capped households - total_children = sim.map_result( - is_child * child_in_uc, "person", "household" - ) + total_children = sim.map_result(is_child * child_in_uc, "person", "household") return total_children * capped_hh # By number of children: "dwp/uc/two_child_limit/{n}_children_households" if "_children_households_total_children" in name: n = int(name.split("/")[-1].split("_")[0]) children_count = sim.map_result(is_child, "person", "household") - return ( - capped_hh * (children_count == n) * children_count - ).astype(float) + return (capped_hh * (children_count == n) * children_count).astype(float) if "_children_households" in name and "total" not in name: n = int(name.split("/")[-1].split("_")[0]) children_count = sim.map_result(is_child, "person", "household") @@ -822,9 +737,7 @@ def _compute_two_child_limit( return None -def _compute_uc_by_children( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_uc_by_children(target: Target, ctx: _SimContext) -> np.ndarray: """Compute UC claimant households filtered by number of dependent children.""" # Parse "dwp/uc/claimants_with_{n}_children" name = target.name @@ -846,9 +759,7 @@ def _compute_uc_by_children( return (on_uc & match).astype(float) -def _compute_uc_by_family_type( - target: Target, ctx: _SimContext -) -> np.ndarray: +def _compute_uc_by_family_type(target: Target, ctx: _SimContext) -> np.ndarray: """Compute UC claimant households filtered by family type.""" name = target.name ft_str = name.split("dwp/uc/claimants_")[1] @@ -867,9 +778,7 @@ def ft_hh(value): if ft_str == "single_no_children": match = ft_hh("SINGLE") & (children_per_hh == 0) elif ft_str == "single_with_children": - 
match = (ft_hh("SINGLE") | ft_hh("LONE_PARENT")) & ( - children_per_hh > 0 - ) + match = (ft_hh("SINGLE") | ft_hh("LONE_PARENT")) & (children_per_hh > 0) elif ft_str == "couple_no_children": match = ft_hh("COUPLE_NO_CHILDREN") elif ft_str == "couple_with_children": diff --git a/policyengine_uk_data/targets/registry.py b/policyengine_uk_data/targets/registry.py index 909fd85d..b4c97108 100644 --- a/policyengine_uk_data/targets/registry.py +++ b/policyengine_uk_data/targets/registry.py @@ -24,12 +24,8 @@ def discover_source_modules() -> list: """Import all modules under targets.sources.""" modules = [] package_path = Path(sources_pkg.__file__).parent - for importer, modname, ispkg in pkgutil.iter_modules( - [str(package_path)] - ): - mod = importlib.import_module( - f"policyengine_uk_data.targets.sources.{modname}" - ) + for importer, modname, ispkg in pkgutil.iter_modules([str(package_path)]): + mod = importlib.import_module(f"policyengine_uk_data.targets.sources.{modname}") if hasattr(mod, "get_targets"): modules.append(mod) return modules diff --git a/policyengine_uk_data/targets/sources/dwp.py b/policyengine_uk_data/targets/sources/dwp.py index 67c23c0a..ff4441b2 100644 --- a/policyengine_uk_data/targets/sources/dwp.py +++ b/policyengine_uk_data/targets/sources/dwp.py @@ -10,7 +10,6 @@ - DWP two-child limit: https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024 """ -import pandas as pd from pathlib import Path from policyengine_uk_data.targets.schema import Target, Unit @@ -118,9 +117,7 @@ def get_targets() -> list[Target]: variable="universal_credit", source="dwp", unit=Unit.COUNT, - values={ - 2025: count_k * (1 + undercount_relative) * 1e3 - }, + values={2025: count_k * (1 + undercount_relative) * 1e3}, is_count=True, reference_url="https://stat-xplore.dwp.gov.uk/", ) @@ -193,44 +190,46 @@ def get_targets() -> list[Target]: ) # 
Two-child limit by disability - targets.extend([ - Target( - name="dwp/uc/two_child_limit/adult_pip_households", - variable="pip", - source="dwp", - unit=Unit.COUNT, - values={2026: 62_260}, - is_count=True, - reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", - ), - Target( - name="dwp/uc/two_child_limit/adult_pip_children", - variable="is_child", - source="dwp", - unit=Unit.COUNT, - values={2026: 225_320}, - is_count=True, - reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", - ), - Target( - name="dwp/uc/two_child_limit/disabled_child_element_households", - variable="uc_individual_disabled_child_element", - source="dwp", - unit=Unit.COUNT, - values={2026: 124_560}, - is_count=True, - reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", - ), - Target( - name="dwp/uc/two_child_limit/disabled_child_element_children", - variable="is_child", - source="dwp", - unit=Unit.COUNT, - values={2026: 462_660}, - is_count=True, - reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", - ), - ]) + targets.extend( + [ + Target( + name="dwp/uc/two_child_limit/adult_pip_households", + variable="pip", + source="dwp", + unit=Unit.COUNT, + values={2026: 62_260}, + is_count=True, + reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", + ), + Target( + 
name="dwp/uc/two_child_limit/adult_pip_children", + variable="is_child", + source="dwp", + unit=Unit.COUNT, + values={2026: 225_320}, + is_count=True, + reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", + ), + Target( + name="dwp/uc/two_child_limit/disabled_child_element_households", + variable="uc_individual_disabled_child_element", + source="dwp", + unit=Unit.COUNT, + values={2026: 124_560}, + is_count=True, + reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", + ), + Target( + name="dwp/uc/two_child_limit/disabled_child_element_children", + variable="is_child", + source="dwp", + unit=Unit.COUNT, + values={2026: 462_660}, + is_count=True, + reference_url="https://www.gov.uk/government/statistics/universal-credit-and-child-tax-credit-claimants-statistics-related-to-the-policy-to-provide-support-for-a-maximum-of-2-children-april-2024", + ), + ] + ) # UC national payment distribution from xlsx targets.extend(_uc_payment_distribution_targets()) diff --git a/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py b/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py index a5f40c0d..1ff2ac82 100644 --- a/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py +++ b/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py @@ -21,8 +21,7 @@ _SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ), } @@ -52,9 +51,7 @@ def get_targets() -> list[Target]: targets = [] try: - r = requests.get( - ref, headers=_HEADERS, allow_redirects=True, timeout=30 - ) + r = requests.get(ref, 
headers=_HEADERS, allow_redirects=True, timeout=30) r.raise_for_status() df = pd.read_csv(io.StringIO(r.content.decode("utf-8-sig"))) @@ -128,8 +125,6 @@ def get_targets() -> list[Target]: ) except Exception as e: - logger.error( - "Failed to download/parse HMRC salary sacrifice CSV: %s", e - ) + logger.error("Failed to download/parse HMRC salary sacrifice CSV: %s", e) return targets diff --git a/policyengine_uk_data/targets/sources/hmrc_spi.py b/policyengine_uk_data/targets/sources/hmrc_spi.py index b78540c3..296965dc 100644 --- a/policyengine_uk_data/targets/sources/hmrc_spi.py +++ b/policyengine_uk_data/targets/sources/hmrc_spi.py @@ -29,8 +29,7 @@ _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ), } @@ -171,9 +170,7 @@ def get_targets() -> list[Target]: for idx, row in merged.iterrows(): lower = int(row["lower_bound"]) - upper = ( - _BAND_UPPER[idx] if idx < len(_BAND_UPPER) else float("inf") - ) + upper = _BAND_UPPER[idx] if idx < len(_BAND_UPPER) else float("inf") band_label = f"{lower:_}_to_{upper:_}" for variable in INCOME_VARIABLES: @@ -188,9 +185,7 @@ def get_targets() -> list[Target]: variable=variable, source="hmrc_spi", unit=Unit.GBP, - values={ - _SPI_YEAR: float(row[amount_col]) * 1e6 - }, + values={_SPI_YEAR: float(row[amount_col]) * 1e6}, breakdown_variable="total_income", lower_bound=float(lower), upper_bound=float(upper), @@ -201,16 +196,11 @@ def get_targets() -> list[Target]: if count_col in row.index and row[count_col] > 0: targets.append( Target( - name=( - f"hmrc/{variable}_count_income_band" - f"_{band_label}" - ), + name=(f"hmrc/{variable}_count_income_band_{band_label}"), variable=variable, source="hmrc_spi", unit=Unit.COUNT, - values={ - _SPI_YEAR: float(row[count_col]) * 1e3 - }, + values={_SPI_YEAR: float(row[count_col]) * 1e3}, is_count=True, breakdown_variable="total_income", lower_bound=float(lower), @@ 
-230,9 +220,7 @@ def get_targets() -> list[Target]: return targets -def _read_projection_csv( - csv_path: Path, ref: str -) -> list[Target]: +def _read_projection_csv(csv_path: Path, ref: str) -> list[Target]: """Read projected future year targets from incomes_projection.csv.""" incomes = pd.read_csv(csv_path) targets = [] @@ -268,9 +256,7 @@ def _read_projection_csv( ) if count_col in row.index and pd.notna(row[count_col]): - name = ( - f"hmrc/{variable}_count_income_band_{band_label}" - ) + name = f"hmrc/{variable}_count_income_band_{band_label}" targets.append( Target( name=name, diff --git a/policyengine_uk_data/targets/sources/local_age.py b/policyengine_uk_data/targets/sources/local_age.py index 2276c173..5cd7f744 100644 --- a/policyengine_uk_data/targets/sources/local_age.py +++ b/policyengine_uk_data/targets/sources/local_age.py @@ -11,7 +11,6 @@ import logging from pathlib import Path -import numpy as np import pandas as pd logger = logging.getLogger(__name__) diff --git a/policyengine_uk_data/targets/sources/local_la_extras.py b/policyengine_uk_data/targets/sources/local_la_extras.py index c2a9e7d8..193daa53 100644 --- a/policyengine_uk_data/targets/sources/local_la_extras.py +++ b/policyengine_uk_data/targets/sources/local_la_extras.py @@ -14,7 +14,6 @@ import logging from pathlib import Path -import numpy as np import pandas as pd logger = logging.getLogger(__name__) @@ -53,9 +52,16 @@ def load_ons_la_income() -> pd.DataFrame: def load_sheet(sheet_name: str, value_col: str) -> pd.DataFrame: df = pd.read_excel(xlsx, sheet_name=sheet_name, header=3) df.columns = [ - "msoa_code", "msoa_name", "la_code", "la_name", - "region_code", "region_name", value_col, - "upper_ci", "lower_ci", "ci_width", + "msoa_code", + "msoa_name", + "la_code", + "la_name", + "region_code", + "region_name", + value_col, + "upper_ci", + "lower_ci", + "ci_width", ] df = df.iloc[1:].dropna(subset=["msoa_code"]) df[value_col] = pd.to_numeric(df[value_col]) @@ -98,12 +104,24 @@ def 
load_tenure_data() -> pd.DataFrame: return pd.DataFrame() df = pd.read_excel(path, sheet_name="data download") df.columns = [ - "region_code", "region_name", "la_code", "la_name", - "owned_outright_pct", "owned_mortgage_pct", - "private_rent_pct", "social_rent_pct", + "region_code", + "region_name", + "la_code", + "la_name", + "owned_outright_pct", + "owned_mortgage_pct", + "private_rent_pct", + "social_rent_pct", + ] + return df[ + [ + "la_code", + "owned_outright_pct", + "owned_mortgage_pct", + "private_rent_pct", + "social_rent_pct", + ] ] - return df[["la_code", "owned_outright_pct", "owned_mortgage_pct", - "private_rent_pct", "social_rent_pct"]] def load_private_rents() -> pd.DataFrame: @@ -117,8 +135,16 @@ def load_private_rents() -> pd.DataFrame: return pd.DataFrame() df = pd.read_excel(path, sheet_name="Figure 3", header=5) df.columns = [ - "col0", "la_code_old", "area_code", "area_name", "room", - "studio", "one_bed", "two_bed", "three_bed", "four_plus", + "col0", + "la_code_old", + "area_code", + "area_name", + "room", + "studio", + "one_bed", + "two_bed", + "three_bed", + "four_plus", "median_monthly_rent", ] df = df[df["area_code"].astype(str).str.match(r"^E0[6789]")] diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py index 6867eb1f..5a5bccc3 100644 --- a/policyengine_uk_data/targets/sources/obr.py +++ b/policyengine_uk_data/targets/sources/obr.py @@ -38,8 +38,7 @@ _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ), } @@ -57,9 +56,7 @@ def _download_workbook(url: str) -> openpyxl.Workbook: return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False) -def _read_row_values( - ws, row_num: int, col_letters: list[str] -) -> dict[int, float]: +def _read_row_values(ws, row_num: int, col_letters: list[str]) -> dict[int, float]: """Read numeric values from a row, mapped to 
calendar years.""" result = {} for col in col_letters: @@ -117,7 +114,9 @@ def read_39(ws, row_num: int) -> dict[int, float]: # Income tax from Table 3.4 (accrued basis) try: ws34 = wb["3.4"] - row_num = _find_row(ws34, "Income tax (gross of tax credits)", col="B", max_row=30) + row_num = _find_row( + ws34, "Income tax (gross of tax credits)", col="B", max_row=30 + ) values = _read_row_values(ws34, row_num, cols_34) if values: targets.append( @@ -359,15 +358,11 @@ def read_49(row_num: int) -> dict[int, float]: # Universal credit outside cap (row 43) is jobseekers UC try: # UC outside cap = predominantly JSA-conditionality UC - uc_outside_row = _find_row( - ws, "Universal credit", col="B", max_row=55 - ) + uc_outside_row = _find_row(ws, "Universal credit", col="B", max_row=55) # Find the second UC row (outside cap section) for row in range(uc_outside_row + 1, 55): cell_val = ws[f"B{row}"].value - if cell_val and str(cell_val).strip().startswith( - "Universal credit" - ): + if cell_val and str(cell_val).strip().startswith("Universal credit"): values = read_49(row) if values: targets.append( @@ -439,12 +434,8 @@ def _parse_tv_licence(wb: openpyxl.Workbook) -> list[Target]: _PRIVATE_SCHOOL = {y: 557_000 for y in range(2018, 2032)} # SPP Review: salary sacrifice NI relief (uprated 3% pa from 2024 base) -_SS_EMPLOYEE_NI = { - y: 1.2e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032) -} -_SS_EMPLOYER_NI = { - y: 2.9e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032) -} +_SS_EMPLOYEE_NI = {y: 1.2e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032)} +_SS_EMPLOYER_NI = {y: 2.9e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032)} def get_targets() -> list[Target]: @@ -459,9 +450,7 @@ def get_targets() -> list[Target]: logger.error("Failed to download/parse OBR receipts: %s", e) try: - expenditure_wb = _download_workbook( - config["obr"]["efo_expenditure"] - ) + expenditure_wb = _download_workbook(config["obr"]["efo_expenditure"]) 
targets.extend(_parse_council_tax(expenditure_wb)) targets.extend(_parse_welfare(expenditure_wb)) targets.extend(_parse_tv_licence(expenditure_wb)) diff --git a/policyengine_uk_data/targets/sources/ons_demographics.py b/policyengine_uk_data/targets/sources/ons_demographics.py index 1cf0a23b..0a88d54b 100644 --- a/policyengine_uk_data/targets/sources/ons_demographics.py +++ b/policyengine_uk_data/targets/sources/ons_demographics.py @@ -24,7 +24,6 @@ import pandas as pd import requests -import yaml from policyengine_uk_data.targets.schema import ( GeographicLevel, @@ -39,8 +38,7 @@ _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ), } @@ -89,9 +87,7 @@ @lru_cache(maxsize=1) def _download_uk_projection() -> pd.DataFrame: """Download and parse the UK principal population projection.""" - r = requests.get( - _UK_ZIP_URL, headers=_HEADERS, allow_redirects=True, timeout=120 - ) + r = requests.get(_UK_ZIP_URL, headers=_HEADERS, allow_redirects=True, timeout=120) r.raise_for_status() z = zipfile.ZipFile(io.BytesIO(r.content)) with z.open("uk/uk_ppp_machine_readable.xlsx") as f: @@ -193,9 +189,7 @@ def _parse_regional_from_csv() -> list[Target]: for _, row in demographics.iterrows(): name = row["name"] - if name in _SKIP_NAMES or any( - name.startswith(p) for p in _SKIP_PREFIXES - ): + if name in _SKIP_NAMES or any(name.startswith(p) for p in _SKIP_PREFIXES): continue values = {} for y in _YEARS: diff --git a/policyengine_uk_data/targets/sources/ons_households.py b/policyengine_uk_data/targets/sources/ons_households.py index 4c68b714..88d51cb8 100644 --- a/policyengine_uk_data/targets/sources/ons_households.py +++ b/policyengine_uk_data/targets/sources/ons_households.py @@ -30,8 +30,7 @@ ) _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) 
AppleWebKit/537.36" ), } @@ -57,13 +56,9 @@ @lru_cache(maxsize=1) def _download_workbook() -> openpyxl.Workbook: - r = requests.get( - _URL, headers=_HEADERS, allow_redirects=True, timeout=60 - ) + r = requests.get(_URL, headers=_HEADERS, allow_redirects=True, timeout=60) r.raise_for_status() - return openpyxl.load_workbook( - io.BytesIO(r.content), data_only=True - ) + return openpyxl.load_workbook(io.BytesIO(r.content), data_only=True) def _find_year_columns(ws) -> dict[int, int]: diff --git a/policyengine_uk_data/targets/sources/ons_savings.py b/policyengine_uk_data/targets/sources/ons_savings.py index 5f49d8c5..a2984713 100644 --- a/policyengine_uk_data/targets/sources/ons_savings.py +++ b/policyengine_uk_data/targets/sources/ons_savings.py @@ -18,26 +18,19 @@ logger = logging.getLogger(__name__) _API_URL = ( - "https://www.ons.gov.uk/economy/grossdomesticproductgdp/" - "timeseries/haxv/ukea/data" -) -_REF = ( - "https://www.ons.gov.uk/economy/grossdomesticproductgdp/" - "timeseries/haxv/ukea" + "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea/data" ) +_REF = "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea" _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ), } def get_targets() -> list[Target]: try: - r = requests.get( - _API_URL, headers=_HEADERS, allow_redirects=True, timeout=30 - ) + r = requests.get(_API_URL, headers=_HEADERS, allow_redirects=True, timeout=30) r.raise_for_status() data = r.json() diff --git a/policyengine_uk_data/targets/sources/ons_tenure.py b/policyengine_uk_data/targets/sources/ons_tenure.py index 841e3f4f..0ae4ccdd 100644 --- a/policyengine_uk_data/targets/sources/ons_tenure.py +++ b/policyengine_uk_data/targets/sources/ons_tenure.py @@ -33,8 +33,7 @@ ) _HEADERS = { "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36" 
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ), } @@ -50,13 +49,9 @@ @lru_cache(maxsize=1) def _download_workbook() -> openpyxl.Workbook: - r = requests.get( - _URL, headers=_HEADERS, allow_redirects=True, timeout=60 - ) + r = requests.get(_URL, headers=_HEADERS, allow_redirects=True, timeout=60) r.raise_for_status() - return openpyxl.load_workbook( - io.BytesIO(r.content), data_only=True - ) + return openpyxl.load_workbook(io.BytesIO(r.content), data_only=True) def _parse_header_columns(ws) -> dict[tuple[int, str], int]: diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py index 3c240ff6..27eb919f 100644 --- a/policyengine_uk_data/utils/loss.py +++ b/policyengine_uk_data/utils/loss.py @@ -4,7 +4,6 @@ for all target definitions and simulation column construction. """ -import numpy as np import pandas as pd from policyengine_uk_data.targets.build_loss_matrix import ( @@ -12,9 +11,7 @@ ) -def get_loss_results( - dataset, time_period, reform=None, household_weights=None -): +def get_loss_results(dataset, time_period, reform=None, household_weights=None): """Calculate loss metrics comparing model outputs to targets. 
Args: From 68b31d49c34624fd7271b32c423ad26afefc7b93 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Sun, 15 Feb 2026 13:44:08 +0000 Subject: [PATCH 3/6] Format with black -l 79 Co-Authored-By: Claude Opus 4 --- .../datasets/create_datasets.py | 4 +- .../local_areas/constituencies/calibrate.py | 16 ++- .../local_authorities/calibrate.py | 22 +++- .../local_areas/local_authorities/loss.py | 25 ++--- .../targets/build_loss_matrix.py | 106 ++++++++++++------ policyengine_uk_data/targets/registry.py | 4 +- .../targets/sources/hmrc_salary_sacrifice.py | 8 +- .../targets/sources/hmrc_spi.py | 8 +- .../targets/sources/local_la_extras.py | 4 +- policyengine_uk_data/targets/sources/obr.py | 16 ++- .../targets/sources/ons_demographics.py | 8 +- .../targets/sources/ons_savings.py | 8 +- .../tests/test_target_registry.py | 12 +- policyengine_uk_data/utils/loss.py | 4 +- 14 files changed, 161 insertions(+), 84 deletions(-) diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index ded6210a..24efd652 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -165,7 +165,9 @@ def main(): # Downrate and save update_dataset("Downrate to 2023", "processing") - frs_calibrated = uprate_dataset(frs_calibrated_constituencies, 2023) + frs_calibrated = uprate_dataset( + frs_calibrated_constituencies, 2023 + ) update_dataset("Downrate to 2023", "completed") update_dataset("Save final dataset", "processing") diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py b/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py index 24aa3c30..6ea99677 100644 --- a/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/constituencies/calibrate.py @@ -73,9 +73,9 @@ def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): constituency_target_validation["estimate"] 
- constituency_target_validation["target"] ) - constituency_target_validation["abs_error"] = constituency_target_validation[ - "error" - ].abs() + constituency_target_validation["abs_error"] = ( + constituency_target_validation["error"].abs() + ) constituency_target_validation["rel_abs_error"] = ( constituency_target_validation["abs_error"] / constituency_target_validation["target"] @@ -91,11 +91,15 @@ def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): national_target_validation["target"] = national_actuals.values national_target_validation["error"] = ( - national_target_validation["estimate"] - national_target_validation["target"] + national_target_validation["estimate"] + - national_target_validation["target"] ) - national_target_validation["abs_error"] = national_target_validation["error"].abs() + national_target_validation["abs_error"] = national_target_validation[ + "error" + ].abs() national_target_validation["rel_abs_error"] = ( - national_target_validation["abs_error"] / national_target_validation["target"] + national_target_validation["abs_error"] + / national_target_validation["target"] ) df = pd.concat( diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py index 746d94e7..588f2955 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py @@ -18,8 +18,12 @@ def calibrate( ): return calibrate_local_areas( dataset=dataset, - matrix_fn=lambda ds: create_local_authority_target_matrix(ds, ds.time_period), - national_matrix_fn=lambda ds: create_national_target_matrix(ds, ds.time_period), + matrix_fn=lambda ds: create_local_authority_target_matrix( + ds, ds.time_period + ), + national_matrix_fn=lambda ds: create_national_target_matrix( + ds, ds.time_period + ), area_count=360, weight_file="local_authority_weights.h5", 
excluded_training_targets=excluded_training_targets, @@ -33,7 +37,9 @@ def calibrate( def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): la_target_matrix, la_actuals = m_c, y_c national_target_matrix, national_actuals = m_n, y_n - local_authorities = pd.read_csv(STORAGE_FOLDER / "local_authorities_2021.csv") + local_authorities = pd.read_csv( + STORAGE_FOLDER / "local_authorities_2021.csv" + ) la_wide = weights @ la_target_matrix la_wide.index = local_authorities.code.values la_wide["name"] = local_authorities.name.values @@ -87,11 +93,15 @@ def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets): national_target_validation["target"] = national_actuals.values national_target_validation["error"] = ( - national_target_validation["estimate"] - national_target_validation["target"] + national_target_validation["estimate"] + - national_target_validation["target"] ) - national_target_validation["abs_error"] = national_target_validation["error"].abs() + national_target_validation["abs_error"] = national_target_validation[ + "error" + ].abs() national_target_validation["rel_abs_error"] = ( - national_target_validation["abs_error"] / national_target_validation["target"] + national_target_validation["abs_error"] + / national_target_validation["target"] ) df = pd.concat( diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index 177b2883..1b4e113e 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -151,8 +151,7 @@ def create_local_authority_target_matrix( ) has_ons_data = ( - ons_merged["net_income_bhc"].notna() - & ons_merged["households"].notna() + ons_merged["net_income_bhc"].notna() & ons_merged["households"].notna() ).values total_households = ons_merged["households"].sum() la_household_share = np.where( @@ -195,18 +194,17 @@ def 
create_local_authority_target_matrix( ) tenure_type = sim.calculate("tenure_type").values - matrix["tenure/owned_outright"] = ( - tenure_type == "OWNED_OUTRIGHT" - ).astype(float) + matrix["tenure/owned_outright"] = (tenure_type == "OWNED_OUTRIGHT").astype( + float + ) matrix["tenure/owned_mortgage"] = ( tenure_type == "OWNED_WITH_MORTGAGE" ).astype(float) - matrix["tenure/private_rent"] = ( - tenure_type == "RENT_PRIVATELY" - ).astype(float) + matrix["tenure/private_rent"] = (tenure_type == "RENT_PRIVATELY").astype( + float + ) matrix["tenure/social_rent"] = ( - (tenure_type == "RENT_FROM_COUNCIL") - | (tenure_type == "RENT_FROM_HA") + (tenure_type == "RENT_FROM_COUNCIL") | (tenure_type == "RENT_FROM_HA") ).astype(float) has_tenure = ( @@ -220,9 +218,7 @@ def create_local_authority_target_matrix( ("private_rent", "private_rent_pct"), ("social_rent", "social_rent_pct"), ]: - targets = ( - tenure_merged[pct_col] / 100 * tenure_merged["households"] - ) + targets = tenure_merged[pct_col] / 100 * tenure_merged["households"] national = ( original_weights * matrix[f"tenure/{tenure_key}"].values ).sum() @@ -246,7 +242,8 @@ def create_local_authority_target_matrix( tenure_merged["private_rent_target"] = ( tenure_merged["median_annual_rent"] - * tenure_merged["private_rent_pct"] / 100 + * tenure_merged["private_rent_pct"] + / 100 * tenure_merged["households"] ) diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py index e76ff50c..ddb71593 100644 --- a/policyengine_uk_data/targets/build_loss_matrix.py +++ b/policyengine_uk_data/targets/build_loss_matrix.py @@ -114,7 +114,9 @@ def pe(self, variable: str): """Calculate variable mapped to household level.""" key = ("pe", variable) if key not in self._cache: - self._cache[key] = self.sim.calculate(variable, map_to="household").values + self._cache[key] = self.sim.calculate( + variable, map_to="household" + ).values return self._cache[key] def pe_person(self, variable: 
str): @@ -145,7 +147,9 @@ def household_from_family(self, values): @property def region(self): if "region" not in self._cache: - self._cache["region"] = self.sim.calculate("region", map_to="person") + self._cache["region"] = self.sim.calculate( + "region", map_to="person" + ) return self._cache["region"] @property @@ -174,7 +178,9 @@ def counterfactual_sim(self): if "counterfactual_sim" not in self._cache: from policyengine_uk import Microsimulation - ss = self.sim.calculate("pension_contributions_via_salary_sacrifice") + ss = self.sim.calculate( + "pension_contributions_via_salary_sacrifice" + ) emp = self.sim.calculate("employment_income") cf_sim = Microsimulation(dataset=self.dataset, reform=self.reform) cf_sim.set_input( @@ -214,7 +220,9 @@ def counterfactual_sim(self): # ── Column computation dispatch ────────────────────────────────────── -def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray | None: +def _compute_column( + target: Target, ctx: _SimContext, year: int +) -> np.ndarray | None: """Compute the household-level column for a target. Returns None if the target can't be computed (e.g. 
missing @@ -255,9 +263,9 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray | if name == "ons/scotland_households_3plus_children": is_child = ctx.pe_person("is_child") children_per_hh = ctx.household_from_person(is_child) - return ((ctx.household_region == "SCOTLAND") & (children_per_hh >= 3)).astype( - float - ) + return ( + (ctx.household_region == "SCOTLAND") & (children_per_hh >= 3) + ).astype(float) # ── Household type targets ──────────────────────────────────── if target.variable == "family_type" and target.is_count: @@ -327,9 +335,9 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray | on_uc = ctx.household_from_family(uc > 0) > 0 child_u1 = ctx.pe_person("is_child") & (ctx.age < 1) has_child_u1 = ctx.household_from_person(child_u1) > 0 - return ((ctx.household_region == "SCOTLAND") & on_uc & has_child_u1).astype( - float - ) + return ( + (ctx.household_region == "SCOTLAND") & on_uc & has_child_u1 + ).astype(float) # ── UC claimants by number of children ───────────────────────── if name.startswith("dwp/uc/claimants_with_") and "_children" in name: @@ -355,14 +363,18 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray | "obr/salary_sacrifice_employee_ni_relief", ): ni_base = ctx.sim.calculate("ni_employee") - ni_cf = ctx.counterfactual_sim.calculate("ni_employee", ctx.time_period) + ni_cf = ctx.counterfactual_sim.calculate( + "ni_employee", ctx.time_period + ) return ctx.household_from_person(ni_cf - ni_base) if name in ( "hmrc/salary_sacrifice_employer_nics_relief", "obr/salary_sacrifice_employer_ni_relief", ): ni_base = ctx.sim.calculate("ni_employer") - ni_cf = ctx.counterfactual_sim.calculate("ni_employer", ctx.time_period) + ni_cf = ctx.counterfactual_sim.calculate( + "ni_employer", ctx.time_period + ) return ctx.household_from_person(ni_cf - ni_base) # ── UC jobseeker / non-jobseeker splits ─────────────────────── @@ -444,7 +456,9 @@ def 
_compute_regional_age(target: Target, ctx: _SimContext) -> np.ndarray: return None person_match = ( - (ctx.region.values == pe_region) & (ctx.age >= lower) & (ctx.age <= upper) + (ctx.region.values == pe_region) + & (ctx.age >= lower) + & (ctx.age <= upper) ) return ctx.household_from_person(person_match) @@ -464,7 +478,9 @@ def _compute_gender_age(target: Target, ctx: _SimContext) -> np.ndarray: return ctx.household_from_person(sex_match & age_match) -def _compute_household_type(target: Target, ctx: _SimContext) -> np.ndarray | None: +def _compute_household_type( + target: Target, ctx: _SimContext +) -> np.ndarray | None: """Compute household type count from ONS families & households categories. Maps ONS household categories to PE family_type enum values and @@ -482,18 +498,18 @@ def ft_hh(value): return ctx.household_from_family(ft == value) > 0 if name == "lone_households_under_65": - return (ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head < 65)).astype( - float - ) + return ( + ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head < 65) + ).astype(float) if name == "lone_households_over_65": - return (ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head >= 65)).astype( - float - ) + return ( + ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head >= 65) + ).astype(float) if name == "unrelated_adult_households": people_per_hh = ctx.household_from_person(np.ones_like(is_child)) - return (ft_hh("SINGLE") & (children_per_hh == 0) & (people_per_hh > 1)).astype( - float - ) + return ( + ft_hh("SINGLE") & (children_per_hh == 0) & (people_per_hh > 1) + ).astype(float) if name == "couple_no_children_households": return ft_hh("COUPLE_NO_CHILDREN").astype(float) if name == "couple_under_3_children_households": @@ -503,10 +519,14 @@ def ft_hh(value): & (children_per_hh <= 2) ).astype(float) if name == "couple_3_plus_children_households": - return (ft_hh("COUPLE_WITH_CHILDREN") & (children_per_hh >= 3)).astype(float) + return (ft_hh("COUPLE_WITH_CHILDREN") & 
(children_per_hh >= 3)).astype( + float + ) if name == "couple_non_dependent_children_only_households": people_per_hh = ctx.household_from_person(np.ones_like(is_child)) - return (ft_hh("COUPLE_NO_CHILDREN") & (people_per_hh > 2)).astype(float) + return (ft_hh("COUPLE_NO_CHILDREN") & (people_per_hh > 2)).astype( + float + ) if name == "lone_parent_dependent_children_households": return (ft_hh("LONE_PARENT") & (children_per_hh > 0)).astype(float) if name == "lone_parent_non_dependent_children_households": @@ -560,7 +580,9 @@ def _compute_income_band(target: Target, ctx: _SimContext) -> np.ndarray: upper = target.upper_bound income_df = ctx.sim.calculate_dataframe(["total_income", variable]) - in_band = (income_df.total_income >= lower) & (income_df.total_income < upper) + in_band = (income_df.total_income >= lower) & ( + income_df.total_income < upper + ) if target.is_count: return ctx.household_from_person((income_df[variable] > 0) * in_band) @@ -605,7 +627,9 @@ def _compute_uc_jobseeker(target: Target, ctx: _SimContext) -> np.ndarray: family = ctx.sim.populations["benunit"] uc = ctx.sim.calculate("universal_credit") on_uc = uc > 0 - unemployed = family.any(ctx.sim.calculate("employment_status") == "UNEMPLOYED") + unemployed = family.any( + ctx.sim.calculate("employment_status") == "UNEMPLOYED" + ) if "non_jobseekers" in target.name: mask = on_uc * ~unemployed @@ -628,11 +652,15 @@ def _compute_uc_payment_dist(target: Target, ctx: _SimContext) -> np.ndarray: lower = target.lower_bound upper = target.upper_bound - uc_payments = ctx.sim.calculate("universal_credit", map_to="benunit").values + uc_payments = ctx.sim.calculate( + "universal_credit", map_to="benunit" + ).values uc_family_type = ctx.sim.calculate("family_type", map_to="benunit").values in_band = ( - (uc_payments >= lower) & (uc_payments < upper) & (uc_family_type == family_type) + (uc_payments >= lower) + & (uc_payments < upper) + & (uc_family_type == family_type) ) return 
ctx.household_from_family(in_band) @@ -654,7 +682,9 @@ def _compute_ss_it_relief(target: Target, ctx: _SimContext) -> np.ndarray: name = target.name if "basic" in name: - mask = (adj_net_income_cf > basic_thresh) & (adj_net_income_cf <= higher_thresh) + mask = (adj_net_income_cf > basic_thresh) & ( + adj_net_income_cf <= higher_thresh + ) elif "higher" in name: mask = (adj_net_income_cf > higher_thresh) & ( adj_net_income_cf <= additional_thresh @@ -668,7 +698,9 @@ def _compute_ss_it_relief(target: Target, ctx: _SimContext) -> np.ndarray: return ctx.household_from_person(it_relief * mask) -def _compute_two_child_limit(target: Target, ctx: _SimContext) -> np.ndarray | None: +def _compute_two_child_limit( + target: Target, ctx: _SimContext +) -> np.ndarray | None: """Compute two-child limit targets. These involve cross-tabulations of UC eligibility, child count, @@ -699,14 +731,18 @@ def _compute_two_child_limit(target: Target, ctx: _SimContext) -> np.ndarray | N return children_in_capped if name == "dwp/uc/two_child_limit/children_in_affected_households": # Total children (not just affected ones) in capped households - total_children = sim.map_result(is_child * child_in_uc, "person", "household") + total_children = sim.map_result( + is_child * child_in_uc, "person", "household" + ) return total_children * capped_hh # By number of children: "dwp/uc/two_child_limit/{n}_children_households" if "_children_households_total_children" in name: n = int(name.split("/")[-1].split("_")[0]) children_count = sim.map_result(is_child, "person", "household") - return (capped_hh * (children_count == n) * children_count).astype(float) + return (capped_hh * (children_count == n) * children_count).astype( + float + ) if "_children_households" in name and "total" not in name: n = int(name.split("/")[-1].split("_")[0]) children_count = sim.map_result(is_child, "person", "household") @@ -778,7 +814,9 @@ def ft_hh(value): if ft_str == "single_no_children": match = ft_hh("SINGLE") & 
(children_per_hh == 0) elif ft_str == "single_with_children": - match = (ft_hh("SINGLE") | ft_hh("LONE_PARENT")) & (children_per_hh > 0) + match = (ft_hh("SINGLE") | ft_hh("LONE_PARENT")) & ( + children_per_hh > 0 + ) elif ft_str == "couple_no_children": match = ft_hh("COUPLE_NO_CHILDREN") elif ft_str == "couple_with_children": diff --git a/policyengine_uk_data/targets/registry.py b/policyengine_uk_data/targets/registry.py index b4c97108..ebd0af4f 100644 --- a/policyengine_uk_data/targets/registry.py +++ b/policyengine_uk_data/targets/registry.py @@ -25,7 +25,9 @@ def discover_source_modules() -> list: modules = [] package_path = Path(sources_pkg.__file__).parent for importer, modname, ispkg in pkgutil.iter_modules([str(package_path)]): - mod = importlib.import_module(f"policyengine_uk_data.targets.sources.{modname}") + mod = importlib.import_module( + f"policyengine_uk_data.targets.sources.{modname}" + ) if hasattr(mod, "get_targets"): modules.append(mod) return modules diff --git a/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py b/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py index 1ff2ac82..4df4e48d 100644 --- a/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py +++ b/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py @@ -51,7 +51,9 @@ def get_targets() -> list[Target]: targets = [] try: - r = requests.get(ref, headers=_HEADERS, allow_redirects=True, timeout=30) + r = requests.get( + ref, headers=_HEADERS, allow_redirects=True, timeout=30 + ) r.raise_for_status() df = pd.read_csv(io.StringIO(r.content.decode("utf-8-sig"))) @@ -125,6 +127,8 @@ def get_targets() -> list[Target]: ) except Exception as e: - logger.error("Failed to download/parse HMRC salary sacrifice CSV: %s", e) + logger.error( + "Failed to download/parse HMRC salary sacrifice CSV: %s", e + ) return targets diff --git a/policyengine_uk_data/targets/sources/hmrc_spi.py b/policyengine_uk_data/targets/sources/hmrc_spi.py index 296965dc..a976c668 
100644 --- a/policyengine_uk_data/targets/sources/hmrc_spi.py +++ b/policyengine_uk_data/targets/sources/hmrc_spi.py @@ -170,7 +170,9 @@ def get_targets() -> list[Target]: for idx, row in merged.iterrows(): lower = int(row["lower_bound"]) - upper = _BAND_UPPER[idx] if idx < len(_BAND_UPPER) else float("inf") + upper = ( + _BAND_UPPER[idx] if idx < len(_BAND_UPPER) else float("inf") + ) band_label = f"{lower:_}_to_{upper:_}" for variable in INCOME_VARIABLES: @@ -196,7 +198,9 @@ def get_targets() -> list[Target]: if count_col in row.index and row[count_col] > 0: targets.append( Target( - name=(f"hmrc/{variable}_count_income_band_{band_label}"), + name=( + f"hmrc/{variable}_count_income_band_{band_label}" + ), variable=variable, source="hmrc_spi", unit=Unit.COUNT, diff --git a/policyengine_uk_data/targets/sources/local_la_extras.py b/policyengine_uk_data/targets/sources/local_la_extras.py index 193daa53..fea211ca 100644 --- a/policyengine_uk_data/targets/sources/local_la_extras.py +++ b/policyengine_uk_data/targets/sources/local_la_extras.py @@ -29,7 +29,9 @@ "earningsandworkinghours/datasets/" "smallareaincomeestimatesformiddlelayersuperoutputareasenglandandwales" ) -_REF_TENURE = "https://www.gov.uk/government/statistics/english-housing-survey-2023" +_REF_TENURE = ( + "https://www.gov.uk/government/statistics/english-housing-survey-2023" +) _REF_RENT = ( "https://www.ons.gov.uk/peoplepopulationandcommunity/housing/datasets/" "privaterentalmarketsummarystatisticsinengland" diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py index 5a5bccc3..6a95e504 100644 --- a/policyengine_uk_data/targets/sources/obr.py +++ b/policyengine_uk_data/targets/sources/obr.py @@ -56,7 +56,9 @@ def _download_workbook(url: str) -> openpyxl.Workbook: return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False) -def _read_row_values(ws, row_num: int, col_letters: list[str]) -> dict[int, float]: +def _read_row_values( + ws, row_num: 
int, col_letters: list[str] +) -> dict[int, float]: """Read numeric values from a row, mapped to calendar years.""" result = {} for col in col_letters: @@ -362,7 +364,9 @@ def read_49(row_num: int) -> dict[int, float]: # Find the second UC row (outside cap section) for row in range(uc_outside_row + 1, 55): cell_val = ws[f"B{row}"].value - if cell_val and str(cell_val).strip().startswith("Universal credit"): + if cell_val and str(cell_val).strip().startswith( + "Universal credit" + ): values = read_49(row) if values: targets.append( @@ -434,8 +438,12 @@ def _parse_tv_licence(wb: openpyxl.Workbook) -> list[Target]: _PRIVATE_SCHOOL = {y: 557_000 for y in range(2018, 2032)} # SPP Review: salary sacrifice NI relief (uprated 3% pa from 2024 base) -_SS_EMPLOYEE_NI = {y: 1.2e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032)} -_SS_EMPLOYER_NI = {y: 2.9e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032)} +_SS_EMPLOYEE_NI = { + y: 1.2e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032) +} +_SS_EMPLOYER_NI = { + y: 2.9e9 * 1.03 ** max(0, y - 2024) for y in range(2024, 2032) +} def get_targets() -> list[Target]: diff --git a/policyengine_uk_data/targets/sources/ons_demographics.py b/policyengine_uk_data/targets/sources/ons_demographics.py index 0a88d54b..3c48c38a 100644 --- a/policyengine_uk_data/targets/sources/ons_demographics.py +++ b/policyengine_uk_data/targets/sources/ons_demographics.py @@ -87,7 +87,9 @@ @lru_cache(maxsize=1) def _download_uk_projection() -> pd.DataFrame: """Download and parse the UK principal population projection.""" - r = requests.get(_UK_ZIP_URL, headers=_HEADERS, allow_redirects=True, timeout=120) + r = requests.get( + _UK_ZIP_URL, headers=_HEADERS, allow_redirects=True, timeout=120 + ) r.raise_for_status() z = zipfile.ZipFile(io.BytesIO(r.content)) with z.open("uk/uk_ppp_machine_readable.xlsx") as f: @@ -189,7 +191,9 @@ def _parse_regional_from_csv() -> list[Target]: for _, row in demographics.iterrows(): name = row["name"] - if name 
in _SKIP_NAMES or any(name.startswith(p) for p in _SKIP_PREFIXES): + if name in _SKIP_NAMES or any( + name.startswith(p) for p in _SKIP_PREFIXES + ): continue values = {} for y in _YEARS: diff --git a/policyengine_uk_data/targets/sources/ons_savings.py b/policyengine_uk_data/targets/sources/ons_savings.py index a2984713..21edb0c0 100644 --- a/policyengine_uk_data/targets/sources/ons_savings.py +++ b/policyengine_uk_data/targets/sources/ons_savings.py @@ -17,9 +17,7 @@ logger = logging.getLogger(__name__) -_API_URL = ( - "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea/data" -) +_API_URL = "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea/data" _REF = "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea" _HEADERS = { "User-Agent": ( @@ -30,7 +28,9 @@ def get_targets() -> list[Target]: try: - r = requests.get(_API_URL, headers=_HEADERS, allow_redirects=True, timeout=30) + r = requests.get( + _API_URL, headers=_HEADERS, allow_redirects=True, timeout=30 + ) r.raise_for_status() data = r.json() diff --git a/policyengine_uk_data/tests/test_target_registry.py b/policyengine_uk_data/tests/test_target_registry.py index c6f78bdd..ccc49e00 100644 --- a/policyengine_uk_data/tests/test_target_registry.py +++ b/policyengine_uk_data/tests/test_target_registry.py @@ -52,9 +52,9 @@ def test_hmrc_spi_targets_exist(): targets = get_all_targets(year=2025) spi_targets = [t for t in targets if t.source == "hmrc_spi"] # 13 bands × 6 income types × 2 (count + amount) = 156 per year - assert len(spi_targets) >= 100, ( - f"Expected 100+ SPI targets, got {len(spi_targets)}" - ) + assert ( + len(spi_targets) >= 100 + ), f"Expected 100+ SPI targets, got {len(spi_targets)}" def test_dwp_pip_targets(): @@ -76,9 +76,9 @@ def test_voa_council_tax_targets(): def test_core_target_count(): """Total target count should be substantial.""" targets = get_all_targets(year=2025) - assert len(targets) >= 200, ( - f"Expected 200+ 
targets for 2025, got {len(targets)}" - ) + assert ( + len(targets) >= 200 + ), f"Expected 200+ targets for 2025, got {len(targets)}" def test_two_child_limit_targets(): diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py index 27eb919f..18d30bed 100644 --- a/policyengine_uk_data/utils/loss.py +++ b/policyengine_uk_data/utils/loss.py @@ -11,7 +11,9 @@ ) -def get_loss_results(dataset, time_period, reform=None, household_weights=None): +def get_loss_results( + dataset, time_period, reform=None, household_weights=None +): """Calculate loss metrics comparing model outputs to targets. Args: From 4bff224526eb42303a0997770d8db45049c6de98 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Sun, 15 Feb 2026 13:48:04 +0000 Subject: [PATCH 4/6] Remove unused pkg_resources import (broken on Python 3.13) Co-Authored-By: Claude Opus 4 --- policyengine_uk_data/utils/huggingface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/policyengine_uk_data/utils/huggingface.py b/policyengine_uk_data/utils/huggingface.py index d2fa27e6..1ed8de25 100644 --- a/policyengine_uk_data/utils/huggingface.py +++ b/policyengine_uk_data/utils/huggingface.py @@ -1,6 +1,5 @@ from huggingface_hub import hf_hub_download, login, HfApi import os -import pkg_resources def download( From 07e345272fd072140f4a7be2f8e1d7db2410dc22 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Mon, 16 Feb 2026 10:39:45 +0000 Subject: [PATCH 5/6] Fix PR review issues: restore dropped targets, deduplicate, decompose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Restore hmrc/salary_sacrifice_contributions target (24bn base, 3%/yr) - Fix obr/esa to combine esa_income + esa_contrib - Restore VOA council tax population uprating for non-base years - Extract shared HEADERS/STORAGE/load_config/to_float into _common.py - Decompose build_loss_matrix.py (828→402 lines) into targets/compute/ subpackage with domain modules: demographics, households, income, 
benefits, council_tax, other Co-Authored-By: Claude Opus 4.6 --- .../targets/build_loss_matrix.py | 681 ++++-------------- .../targets/compute/__init__.py | 69 ++ .../targets/compute/benefits.py | 205 ++++++ .../targets/compute/council_tax.py | 34 + .../targets/compute/demographics.py | 79 ++ .../targets/compute/households.py | 91 +++ .../targets/compute/income.py | 81 +++ policyengine_uk_data/targets/compute/other.py | 36 + .../targets/sources/_common.py | 29 + policyengine_uk_data/targets/sources/dwp.py | 4 - .../targets/sources/hmrc_salary_sacrifice.py | 56 +- .../targets/sources/hmrc_spi.py | 62 +- .../targets/sources/local_age.py | 13 +- .../targets/sources/local_income.py | 13 +- .../targets/sources/local_la_extras.py | 13 +- policyengine_uk_data/targets/sources/obr.py | 33 +- .../targets/sources/ons_demographics.py | 15 +- .../targets/sources/ons_households.py | 8 +- .../targets/sources/ons_savings.py | 8 +- .../targets/sources/ons_tenure.py | 8 +- .../targets/sources/voa_council_tax.py | 5 +- 21 files changed, 841 insertions(+), 702 deletions(-) create mode 100644 policyengine_uk_data/targets/compute/__init__.py create mode 100644 policyengine_uk_data/targets/compute/benefits.py create mode 100644 policyengine_uk_data/targets/compute/council_tax.py create mode 100644 policyengine_uk_data/targets/compute/demographics.py create mode 100644 policyengine_uk_data/targets/compute/households.py create mode 100644 policyengine_uk_data/targets/compute/income.py create mode 100644 policyengine_uk_data/targets/compute/other.py create mode 100644 policyengine_uk_data/targets/sources/_common.py diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py index ddb71593..8f95a15a 100644 --- a/policyengine_uk_data/targets/build_loss_matrix.py +++ b/policyengine_uk_data/targets/build_loss_matrix.py @@ -17,7 +17,39 @@ import pandas as pd from policyengine_uk_data.targets import get_all_targets -from 
policyengine_uk_data.targets.schema import GeographicLevel, Target, Unit +from policyengine_uk_data.targets.schema import ( + GeographicLevel, + Target, + Unit, +) +from policyengine_uk_data.targets.compute import ( + compute_benefit_cap, + compute_council_tax_band, + compute_esa, + compute_gender_age, + compute_household_type, + compute_housing, + compute_income_band, + compute_obr_council_tax, + compute_pip_claimants, + compute_regional_age, + compute_savings_interest, + compute_scotland_demographics, + compute_scotland_uc_child, + compute_scottish_child_payment, + compute_ss_contributions, + compute_ss_it_relief, + compute_ss_ni_relief, + compute_tenure, + compute_two_child_limit, + compute_uc_by_children, + compute_uc_by_family_type, + compute_uc_jobseeker, + compute_uc_outside_cap, + compute_uc_payment_dist, + compute_uk_population, + compute_vehicles, +) logger = logging.getLogger(__name__) @@ -48,10 +80,8 @@ def create_target_matrix( sim = Microsimulation(dataset=dataset, reform=reform) sim.default_calculation_period = time_period - # Helper closures for the simulation ctx = _SimContext(sim, time_period, dataset, reform) - # Fetch all targets (no year filter — we resolve values below) all_targets = [] seen = set() for level in ( @@ -86,18 +116,31 @@ def create_target_matrix( def _resolve_value(target: Target, year: int) -> float | None: - """Get the target value for a year, falling back to nearest year.""" + """Get the target value for a year, falling back to nearest year. + + VOA council tax targets are population-uprated when extrapolating + from their base year (2024). 
+ """ if year in target.values: return target.values[year] - # Use nearest available year available = sorted(target.values.keys()) if not available: return None closest = min(available, key=lambda y: abs(y - year)) - # Only allow ±3 years of extrapolation if abs(closest - year) > 3: return None - return target.values[closest] + base_value = target.values[closest] + # VOA council tax counts scale with population + if target.source == "voa" and year != closest: + from policyengine_uk_data.targets.sources.local_age import ( + get_uk_total_population, + ) + + pop_target = get_uk_total_population(year) + pop_base = get_uk_total_population(closest) + if pop_base > 0: + base_value *= pop_target / pop_base + return base_value class _SimContext: @@ -174,7 +217,7 @@ def country(self): @property def counterfactual_sim(self): - """Lazily create the salary sacrifice counterfactual simulation.""" + """Lazily create the salary sacrifice counterfactual.""" if "counterfactual_sim" not in self._cache: from policyengine_uk import Microsimulation @@ -197,26 +240,6 @@ def counterfactual_sim(self): return self._cache["counterfactual_sim"] -# ── Region name mapping ────────────────────────────────────────────── - -_REGION_MAP = { - "NORTH_EAST": "north_east", - "SOUTH_EAST": "south_east", - "EAST_MIDLANDS": "east_midlands", - "WEST_MIDLANDS": "west_midlands", - "YORKSHIRE": "yorkshire_and_the_humber", - "EAST_OF_ENGLAND": "east", - "LONDON": "london", - "SOUTH_WEST": "south_west", - "NORTH_WEST": "north_west", - "WALES": "wales", - "SCOTLAND": "scotland", - "NORTHERN_IRELAND": "northern_ireland", -} - -_REGION_INV = {v: k for k, v in _REGION_MAP.items()} - - # ── Column computation dispatch ────────────────────────────────────── @@ -225,190 +248,132 @@ def _compute_column( ) -> np.ndarray | None: """Compute the household-level column for a target. - Returns None if the target can't be computed (e.g. missing - custom_compute for a complex target). 
+ Dispatches to domain-specific compute modules. """ - # If the target has a custom compute function, use it if target.custom_compute is not None: return target.custom_compute(ctx, target, year) - # Dispatch by target name patterns and metadata name = target.name - # ── Regional age bands ──────────────────────────────────────── - # Names like "ons/north_east_age_0_9" + # Demographics if name.startswith("ons/") and "_age_" in name: - return _compute_regional_age(target, ctx) - - # ── Gender × age bands ──────────────────────────────────────── - # Names like "ons/female_0_14" - if name.startswith("ons/") and ( - name.startswith("ons/female_") or name.startswith("ons/male_") - ): - return _compute_gender_age(target, ctx) - - # ── UK total population ─────────────────────────────────────── + return compute_regional_age(target, ctx) + if name.startswith("ons/female_") or name.startswith("ons/male_"): + return compute_gender_age(target, ctx) if name == "ons/uk_population": - return ctx.household_from_person(ctx.age >= 0) + return compute_uk_population(target, ctx) + if name in ( + "ons/scotland_children_under_16", + "ons/scotland_babies_under_1", + "ons/scotland_households_3plus_children", + ): + return compute_scotland_demographics(target, ctx) - # ── Scotland-specific demographics ──────────────────────────── - if name == "ons/scotland_children_under_16": - return ctx.household_from_person( - (ctx.region.values == "SCOTLAND") & (ctx.age < 16) - ) - if name == "ons/scotland_babies_under_1": - return ctx.household_from_person( - (ctx.region.values == "SCOTLAND") & (ctx.age < 1) - ) - if name == "ons/scotland_households_3plus_children": - is_child = ctx.pe_person("is_child") - children_per_hh = ctx.household_from_person(is_child) - return ( - (ctx.household_region == "SCOTLAND") & (children_per_hh >= 3) - ).astype(float) - - # ── Household type targets ──────────────────────────────────── + # Households and tenure if target.variable == "family_type" and target.is_count: 
- return _compute_household_type(target, ctx) - - # ── Tenure targets ──────────────────────────────────────────── + return compute_household_type(target, ctx) if target.variable == "tenure_type" and target.is_count: - return _compute_tenure(target, ctx) + return compute_tenure(target, ctx) - # ── Income band breakdowns (HMRC SPI) ───────────────────────── + # Income bands (HMRC SPI) if target.breakdown_variable == "total_income": - return _compute_income_band(target, ctx) + return compute_income_band(target, ctx) - # ── Council tax bands by region (VOA) ───────────────────────── + # Council tax if name.startswith("voa/council_tax/"): - return _compute_council_tax_band(target, ctx) - - # ── Vehicle ownership (NTS) ─────────────────────────────────── - if name == "nts/households_no_vehicle": - return (ctx.pe("num_vehicles") == 0).astype(float) - if name == "nts/households_one_vehicle": - return (ctx.pe("num_vehicles") == 1).astype(float) - if name == "nts/households_two_plus_vehicles": - return (ctx.pe("num_vehicles") >= 2).astype(float) - - # ── Housing targets ─────────────────────────────────────────── - if name == "housing/total_mortgage": - return ctx.pe("mortgage_capital_repayment") + ctx.pe( - "mortgage_interest_repayment" - ) - if name == "housing/rent_private": - tenure = ctx.sim.calculate("tenure_type", map_to="household").values - return ctx.pe("rent") * (tenure == "RENT_PRIVATELY") + return compute_council_tax_band(target, ctx) + if name.startswith("obr/council_tax"): + return compute_obr_council_tax(target, ctx) - # ── Savings interest (ONS) ──────────────────────────────────── + # Vehicles + if name.startswith("nts/households_"): + return compute_vehicles(target, ctx) + + # Housing + if name in ("housing/total_mortgage", "housing/rent_private"): + return compute_housing(target, ctx) + + # Savings if name == "ons/savings_interest_income": - savings = ctx.sim.calculate("savings_interest_income") - return ctx.household_from_person(savings) + return 
compute_savings_interest(target, ctx) - # ── Scottish child payment ──────────────────────────────────── + # Scottish child payment if name == "sss/scottish_child_payment": - scp = ctx.sim.calculate("scottish_child_payment") - return ctx.household_from_person(scp) - - # ── DWP PIP claimant splits ─────────────────────────────────── - if name == "dwp/pip_dl_standard_claimants": - pip_dl = ctx.sim.calculate("pip_dl_category") - return ctx.sim.map_result(pip_dl == "STANDARD", "person", "household") - if name == "dwp/pip_dl_enhanced_claimants": - pip_dl = ctx.sim.calculate("pip_dl_category") - return ctx.sim.map_result(pip_dl == "ENHANCED", "person", "household") - - # ── DWP benefit cap ─────────────────────────────────────────── - if name == "dwp/benefit_capped_households": - reduction = ctx.sim.calculate( - "benefit_cap_reduction", map_to="household" - ).values - return (reduction > 0).astype(float) - if name == "dwp/benefit_cap_total_reduction": - return ctx.sim.calculate( - "benefit_cap_reduction", map_to="household" - ).values.astype(float) - - # ── DWP Scotland UC + child under 1 ────────────────────────── + return compute_scottish_child_payment(target, ctx) + + # PIP claimants + if name in ( + "dwp/pip_dl_standard_claimants", + "dwp/pip_dl_enhanced_claimants", + ): + return compute_pip_claimants(target, ctx) + + # Benefit cap + if name in ( + "dwp/benefit_capped_households", + "dwp/benefit_cap_total_reduction", + ): + return compute_benefit_cap(target, ctx) + + # Scotland UC + child under 1 if name == "dwp/scotland_uc_households_child_under_1": - uc = ctx.sim.calculate("universal_credit") - on_uc = ctx.household_from_family(uc > 0) > 0 - child_u1 = ctx.pe_person("is_child") & (ctx.age < 1) - has_child_u1 = ctx.household_from_person(child_u1) > 0 - return ( - (ctx.household_region == "SCOTLAND") & on_uc & has_child_u1 - ).astype(float) - - # ── UC claimants by number of children ───────────────────────── + return compute_scotland_uc_child(target, ctx) + + # UC 
claimants by children if name.startswith("dwp/uc/claimants_with_") and "_children" in name: - return _compute_uc_by_children(target, ctx) + return compute_uc_by_children(target, ctx) - # ── UC claimants by family type ────────────────────────────── + # UC claimants by family type if name.startswith("dwp/uc/claimants_") and not name.startswith( "dwp/uc/claimants_with_" ): - return _compute_uc_by_family_type(target, ctx) + return compute_uc_by_family_type(target, ctx) - # ── UC payment distribution ─────────────────────────────────── + # UC payment distribution if name.startswith("dwp/uc_payment_dist/"): - return _compute_uc_payment_dist(target, ctx) + return compute_uc_payment_dist(target, ctx) - # ── Salary sacrifice IT relief by tax band ──────────────────── + # Salary sacrifice IT relief if name.startswith("hmrc/salary_sacrifice_it_relief_"): - return _compute_ss_it_relief(target, ctx) + return compute_ss_it_relief(target, ctx) + + # Salary sacrifice contributions + if name == "hmrc/salary_sacrifice_contributions": + return compute_ss_contributions(target, ctx) - # ── Salary sacrifice NI relief ──────────────────────────────── + # Salary sacrifice NI relief if name in ( "hmrc/salary_sacrifice_employee_nics_relief", "obr/salary_sacrifice_employee_ni_relief", - ): - ni_base = ctx.sim.calculate("ni_employee") - ni_cf = ctx.counterfactual_sim.calculate( - "ni_employee", ctx.time_period - ) - return ctx.household_from_person(ni_cf - ni_base) - if name in ( "hmrc/salary_sacrifice_employer_nics_relief", "obr/salary_sacrifice_employer_ni_relief", ): - ni_base = ctx.sim.calculate("ni_employer") - ni_cf = ctx.counterfactual_sim.calculate( - "ni_employer", ctx.time_period - ) - return ctx.household_from_person(ni_cf - ni_base) + return compute_ss_ni_relief(target, ctx) - # ── UC jobseeker / non-jobseeker splits ─────────────────────── + # UC jobseeker splits if name in ( "obr/universal_credit_jobseekers", "obr/universal_credit_non_jobseekers", 
"obr/universal_credit_jobseekers_count", "obr/universal_credit_non_jobseekers_count", ): - return _compute_uc_jobseeker(target, ctx) + return compute_uc_jobseeker(target, ctx) - # ── OBR UC outside benefit cap ──────────────────────────────── + # UC outside benefit cap if name == "obr/universal_credit_outside_cap": - uc = ctx.sim.calculate("universal_credit") - uc_hh = ctx.household_from_family(uc) - cap_reduction = ctx.sim.calculate( - "benefit_cap_reduction", map_to="household" - ).values - not_capped = cap_reduction == 0 - return uc_hh * not_capped - - # ── Two-child limit targets ─────────────────────────────────── + return compute_uc_outside_cap(target, ctx) + + # Two-child limit if "two_child_limit" in name: - return _compute_two_child_limit(target, ctx) + return compute_two_child_limit(target, ctx) - # ── OBR council tax by country ──────────────────────────────── - if name.startswith("obr/council_tax"): - return _compute_obr_council_tax(target, ctx) + # ESA (combined income + contributory) + if name == "obr/esa": + return compute_esa(target, ctx) - # ── Simple GBP sum targets ──────────────────────────────────── + # Fallbacks: simple GBP sum / simple count if target.unit == Unit.GBP and not target.is_count: return _compute_simple_gbp(target, ctx) - - # ── Simple count targets ────────────────────────────────────── if target.is_count and target.unit == Unit.COUNT: return _compute_simple_count(target, ctx) @@ -416,10 +381,7 @@ def _compute_column( return None -# ── Compute implementations ────────────────────────────────────────── - - -def _compute_simple_gbp(target: Target, ctx: _SimContext) -> np.ndarray: +def _compute_simple_gbp(target: Target, ctx: _SimContext) -> np.ndarray | None: """Sum a variable at household level.""" variable = target.variable try: @@ -438,390 +400,3 @@ def _compute_simple_gbp(target: Target, ctx: _SimContext) -> np.ndarray: def _compute_simple_count(target: Target, ctx: _SimContext) -> np.ndarray: """Count recipients of a variable, 
mapped to household.""" return ctx.pe_count(target.variable) - - -def _compute_regional_age(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute person count in a region × age band.""" - # Parse "ons/{region_name}_age_{lower}_{upper}" from the name - name = target.name.removeprefix("ons/") - # Find the _age_ part - idx = name.index("_age_") - region_name = name[:idx] - age_part = name[idx + 5 :] # e.g. "0_9" - lower, upper = age_part.split("_") - lower, upper = int(lower), int(upper) - - pe_region = _REGION_INV.get(region_name) - if pe_region is None: - return None - - person_match = ( - (ctx.region.values == pe_region) - & (ctx.age >= lower) - & (ctx.age <= upper) - ) - return ctx.household_from_person(person_match) - - -def _compute_gender_age(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute person count in a gender × age band.""" - name = target.name.removeprefix("ons/") - # "female_0_14" or "male_75_90" - parts = name.split("_") - sex = parts[0] - lower = int(parts[1]) - upper = int(parts[2]) - - gender = ctx.sim.calculate("gender").values - sex_match = gender == ("FEMALE" if sex == "female" else "MALE") - age_match = (ctx.age >= lower) & (ctx.age <= upper) - return ctx.household_from_person(sex_match & age_match) - - -def _compute_household_type( - target: Target, ctx: _SimContext -) -> np.ndarray | None: - """Compute household type count from ONS families & households categories. - - Maps ONS household categories to PE family_type enum values and - household composition conditions. family_type is a benunit variable - so we map boolean comparisons to household level. 
- """ - name = target.name.removeprefix("ons/") - ft = ctx.sim.calculate("family_type").values # benunit level - is_child = ctx.pe_person("is_child") - children_per_hh = ctx.household_from_person(is_child) - age_hh_head = ctx.pe("age") # head of household age - - def ft_hh(value): - """Map family_type == value from benunit to household (any).""" - return ctx.household_from_family(ft == value) > 0 - - if name == "lone_households_under_65": - return ( - ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head < 65) - ).astype(float) - if name == "lone_households_over_65": - return ( - ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head >= 65) - ).astype(float) - if name == "unrelated_adult_households": - people_per_hh = ctx.household_from_person(np.ones_like(is_child)) - return ( - ft_hh("SINGLE") & (children_per_hh == 0) & (people_per_hh > 1) - ).astype(float) - if name == "couple_no_children_households": - return ft_hh("COUPLE_NO_CHILDREN").astype(float) - if name == "couple_under_3_children_households": - return ( - ft_hh("COUPLE_WITH_CHILDREN") - & (children_per_hh >= 1) - & (children_per_hh <= 2) - ).astype(float) - if name == "couple_3_plus_children_households": - return (ft_hh("COUPLE_WITH_CHILDREN") & (children_per_hh >= 3)).astype( - float - ) - if name == "couple_non_dependent_children_only_households": - people_per_hh = ctx.household_from_person(np.ones_like(is_child)) - return (ft_hh("COUPLE_NO_CHILDREN") & (people_per_hh > 2)).astype( - float - ) - if name == "lone_parent_dependent_children_households": - return (ft_hh("LONE_PARENT") & (children_per_hh > 0)).astype(float) - if name == "lone_parent_non_dependent_children_households": - people_per_hh = ctx.household_from_person(np.ones_like(is_child)) - return ( - ft_hh("SINGLE") - & (children_per_hh == 0) - & (people_per_hh > 1) - & (age_hh_head >= 40) - ).astype(float) - if name == "multi_family_households": - n_benunits = ctx.pe("household_num_benunits") - return (n_benunits > 1).astype(float) - - 
return None - - -def _compute_tenure(target: Target, ctx: _SimContext) -> np.ndarray | None: - """Compute dwelling count by tenure type.""" - # Map ONS target name suffixes to PE tenure_type enum values - _TENURE_MAP = { - "tenure_england_owned_outright": "OWNED_OUTRIGHT", - "tenure_england_owned_with_mortgage": "OWNED_WITH_MORTGAGE", - "tenure_england_rented_privately": "RENT_PRIVATELY", - "tenure_england_social_rent": ["RENT_FROM_COUNCIL", "RENT_FROM_HA"], - "tenure_england_total": None, # all tenures - } - suffix = target.name.removeprefix("ons/") - pe_values = _TENURE_MAP.get(suffix) - if pe_values is None and suffix == "tenure_england_total": - # Total dwellings in England - return (ctx.country == "ENGLAND").astype(float) - if pe_values is None: - return None - - tenure = ctx.sim.calculate("tenure_type", map_to="household").values - in_england = ctx.country == "ENGLAND" - if isinstance(pe_values, list): - match = np.zeros_like(tenure, dtype=bool) - for v in pe_values: - match = match | (tenure == v) - else: - match = tenure == pe_values - return (match & in_england).astype(float) - - -def _compute_income_band(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute income variable within a total income band.""" - variable = target.variable - lower = target.lower_bound - upper = target.upper_bound - - income_df = ctx.sim.calculate_dataframe(["total_income", variable]) - in_band = (income_df.total_income >= lower) & ( - income_df.total_income < upper - ) - - if target.is_count: - return ctx.household_from_person((income_df[variable] > 0) * in_band) - else: - return ctx.household_from_person(income_df[variable] * in_band) - - -def _compute_council_tax_band(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute council tax band count for a region.""" - # "voa/council_tax/{REGION}/{band}" - parts = target.name.split("/") - region = parts[2] - band = parts[3] - - in_region = ctx.sim.calculate("region").values == region - - if band == "total": - return 
in_region.astype(float) - - in_band = ctx.sim.calculate("council_tax_band") == band - return (in_band * in_region).astype(float) - - -def _compute_obr_council_tax(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute OBR council tax receipts, optionally by country.""" - name = target.name - ct = ctx.pe("council_tax") - - if name == "obr/council_tax": - return ct - if name == "obr/council_tax_england": - return ct * (ctx.country == "ENGLAND") - if name == "obr/council_tax_scotland": - return ct * (ctx.country == "SCOTLAND") - if name == "obr/council_tax_wales": - return ct * (ctx.country == "WALES") - return ct - - -def _compute_uc_jobseeker(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute UC jobseeker / non-jobseeker splits.""" - family = ctx.sim.populations["benunit"] - uc = ctx.sim.calculate("universal_credit") - on_uc = uc > 0 - unemployed = family.any( - ctx.sim.calculate("employment_status") == "UNEMPLOYED" - ) - - if "non_jobseekers" in target.name: - mask = on_uc * ~unemployed - else: - mask = on_uc * unemployed - - if "_count" in target.name: - return ctx.household_from_family(mask) - else: - return ctx.household_from_family(uc * mask) - - -def _compute_uc_payment_dist(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute UC payment distribution band × family type.""" - # Parse from name: "dwp/uc_payment_dist/{family_type}_annual_payment_{lower}_to_{upper}" - name = target.name.removeprefix("dwp/uc_payment_dist/") - # Find the _annual_payment_ separator - idx = name.index("_annual_payment_") - family_type = name[:idx] - lower = target.lower_bound - upper = target.upper_bound - - uc_payments = ctx.sim.calculate( - "universal_credit", map_to="benunit" - ).values - uc_family_type = ctx.sim.calculate("family_type", map_to="benunit").values - - in_band = ( - (uc_payments >= lower) - & (uc_payments < upper) - & (uc_family_type == family_type) - ) - return ctx.household_from_family(in_band) - - -def _compute_ss_it_relief(target: 
Target, ctx: _SimContext) -> np.ndarray: - """Compute salary sacrifice IT relief by tax band.""" - it_base = ctx.sim.calculate("income_tax") - it_cf = ctx.counterfactual_sim.calculate("income_tax", ctx.time_period) - it_relief = it_cf - it_base - - adj_net_income_cf = ctx.counterfactual_sim.calculate( - "adjusted_net_income", ctx.time_period - ) - - params = ctx.sim.tax_benefit_system.parameters.gov.hmrc.income_tax.rates.uk - basic_thresh = params[0].threshold(ctx.time_period) - higher_thresh = params[1].threshold(ctx.time_period) - additional_thresh = params[2].threshold(ctx.time_period) - - name = target.name - if "basic" in name: - mask = (adj_net_income_cf > basic_thresh) & ( - adj_net_income_cf <= higher_thresh - ) - elif "higher" in name: - mask = (adj_net_income_cf > higher_thresh) & ( - adj_net_income_cf <= additional_thresh - ) - elif "additional" in name: - mask = adj_net_income_cf > additional_thresh - else: - # Total — no mask - mask = np.ones_like(it_relief, dtype=bool) - - return ctx.household_from_person(it_relief * mask) - - -def _compute_two_child_limit( - target: Target, ctx: _SimContext -) -> np.ndarray | None: - """Compute two-child limit targets. - - These involve cross-tabulations of UC eligibility, child count, - disability status, etc. Complex enough to need specific logic - per target name. 
- """ - name = target.name - sim = ctx.sim - - is_child = sim.calculate("is_child").values - child_is_affected = ( - sim.map_result( - sim.calculate("uc_is_child_limit_affected", map_to="household"), - "household", - "person", - ) - > 0 - ) * is_child - child_in_uc = sim.calculate("universal_credit", map_to="person").values > 0 - children_in_capped = sim.map_result( - child_is_affected * child_in_uc, "person", "household" - ) - capped_hh = (children_in_capped > 0) * 1.0 - - if name == "dwp/uc/two_child_limit/households_affected": - return capped_hh - if name == "dwp/uc/two_child_limit/children_affected": - return children_in_capped - if name == "dwp/uc/two_child_limit/children_in_affected_households": - # Total children (not just affected ones) in capped households - total_children = sim.map_result( - is_child * child_in_uc, "person", "household" - ) - return total_children * capped_hh - - # By number of children: "dwp/uc/two_child_limit/{n}_children_households" - if "_children_households_total_children" in name: - n = int(name.split("/")[-1].split("_")[0]) - children_count = sim.map_result(is_child, "person", "household") - return (capped_hh * (children_count == n) * children_count).astype( - float - ) - if "_children_households" in name and "total" not in name: - n = int(name.split("/")[-1].split("_")[0]) - children_count = sim.map_result(is_child, "person", "household") - match = n if n < 6 else slice(6, None) - if isinstance(match, int): - return (capped_hh * (children_count == n)).astype(float) - else: - return (capped_hh * (children_count >= 6)).astype(float) - - # Disability cross-tabs - if "adult_pip_households" in name: - pip = sim.calculate("pip", map_to="household").values - return (capped_hh * (pip > 0)).astype(float) - if "adult_pip_children" in name: - pip = sim.calculate("pip", map_to="household").values - return (children_in_capped * (pip > 0)).astype(float) - if "disabled_child_element_households" in name: - dce = sim.calculate( - 
"uc_individual_disabled_child_element", map_to="household" - ).values - return (capped_hh * (dce > 0)).astype(float) - if "disabled_child_element_children" in name: - dce = sim.calculate( - "uc_individual_disabled_child_element", map_to="household" - ).values - return (children_in_capped * (dce > 0)).astype(float) - - return None - - -def _compute_uc_by_children(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute UC claimant households filtered by number of dependent children.""" - # Parse "dwp/uc/claimants_with_{n}_children" - name = target.name - n_str = name.split("claimants_with_")[1].split("_children")[0] - - uc = ctx.sim.calculate("universal_credit") - on_uc = ctx.household_from_family(uc > 0) > 0 - - is_child = ctx.pe_person("is_child") - children_per_hh = ctx.household_from_person(is_child) - - if n_str.endswith("+"): - n = int(n_str[:-1]) - match = children_per_hh >= n - else: - n = int(n_str) - match = children_per_hh == n - - return (on_uc & match).astype(float) - - -def _compute_uc_by_family_type(target: Target, ctx: _SimContext) -> np.ndarray: - """Compute UC claimant households filtered by family type.""" - name = target.name - ft_str = name.split("dwp/uc/claimants_")[1] - - uc = ctx.sim.calculate("universal_credit") - on_uc = ctx.household_from_family(uc > 0) > 0 - - ft = ctx.sim.calculate("family_type").values # benunit level - - def ft_hh(value): - return ctx.household_from_family(ft == value) > 0 - - is_child = ctx.pe_person("is_child") - children_per_hh = ctx.household_from_person(is_child) - - if ft_str == "single_no_children": - match = ft_hh("SINGLE") & (children_per_hh == 0) - elif ft_str == "single_with_children": - match = (ft_hh("SINGLE") | ft_hh("LONE_PARENT")) & ( - children_per_hh > 0 - ) - elif ft_str == "couple_no_children": - match = ft_hh("COUPLE_NO_CHILDREN") - elif ft_str == "couple_with_children": - match = ft_hh("COUPLE_WITH_CHILDREN") - else: - return None - - return (on_uc & match).astype(float) diff --git 
"""Compute subpackage: domain-specific column computation for targets."""

from policyengine_uk_data.targets.compute.benefits import (
    compute_benefit_cap,
    compute_pip_claimants,
    compute_scotland_uc_child,
    compute_two_child_limit,
    compute_uc_by_children,
    compute_uc_by_family_type,
    compute_uc_jobseeker,
    compute_uc_outside_cap,
    compute_uc_payment_dist,
)
from policyengine_uk_data.targets.compute.council_tax import (
    compute_council_tax_band,
    compute_obr_council_tax,
)
from policyengine_uk_data.targets.compute.demographics import (
    compute_gender_age,
    compute_regional_age,
    compute_scotland_demographics,
    compute_uk_population,
)
from policyengine_uk_data.targets.compute.households import (
    compute_household_type,
    compute_tenure,
)
from policyengine_uk_data.targets.compute.income import (
    compute_esa,
    compute_income_band,
    compute_ss_contributions,
    compute_ss_it_relief,
    compute_ss_ni_relief,
)
from policyengine_uk_data.targets.compute.other import (
    compute_housing,
    compute_savings_interest,
    compute_scottish_child_payment,
    compute_vehicles,
)

__all__ = [
    "compute_benefit_cap",
    "compute_council_tax_band",
    "compute_esa",
    "compute_gender_age",
    "compute_household_type",
    "compute_housing",
    "compute_income_band",
    "compute_obr_council_tax",
    "compute_pip_claimants",
    "compute_regional_age",
    "compute_savings_interest",
    "compute_scotland_demographics",
    "compute_scotland_uc_child",
    "compute_scottish_child_payment",
    "compute_ss_contributions",
    "compute_ss_it_relief",
    "compute_ss_ni_relief",
    "compute_tenure",
    "compute_two_child_limit",
    "compute_uc_by_children",
    "compute_uc_by_family_type",
    "compute_uc_jobseeker",
    "compute_uc_outside_cap",
    "compute_uc_payment_dist",
    "compute_uk_population",
    "compute_vehicles",
]


# --- policyengine_uk_data/targets/compute/benefits.py ---
"""Benefit-related compute functions (UC, PIP, benefit cap, etc)."""

import numpy as np


def compute_pip_claimants(target, ctx) -> np.ndarray:
    """Count PIP daily-living claimants at the standard or enhanced rate."""
    wanted = "STANDARD" if "standard" in target.name else "ENHANCED"
    category = ctx.sim.calculate("pip_dl_category")
    return ctx.sim.map_result(category == wanted, "person", "household")


def compute_benefit_cap(target, ctx) -> np.ndarray:
    """Benefit-cap targets: total reduction in GBP, or capped-household count."""
    reduction = ctx.sim.calculate(
        "benefit_cap_reduction", map_to="household"
    ).values
    if "total_reduction" in target.name:
        return reduction.astype(float)
    return (reduction > 0).astype(float)


def compute_scotland_uc_child(target, ctx) -> np.ndarray:
    """Scottish UC households containing a child under one year old."""
    on_uc = (
        ctx.household_from_family(
            ctx.sim.calculate("universal_credit") > 0
        )
        > 0
    )
    babies = ctx.pe_person("is_child") & (ctx.age < 1)
    has_baby = ctx.household_from_person(babies) > 0
    in_scotland = ctx.household_region == "SCOTLAND"
    return (in_scotland & on_uc & has_baby).astype(float)


def compute_uc_by_children(target, ctx) -> np.ndarray:
    """UC claimant households filtered by number of dependent children.

    Target names look like "dwp/uc/claimants_with_{n}_children" where
    {n} may carry a trailing "+" for an open upper bound.
    """
    count_spec = target.name.split("claimants_with_")[1].split("_children")[0]

    on_uc = (
        ctx.household_from_family(
            ctx.sim.calculate("universal_credit") > 0
        )
        > 0
    )
    children_per_hh = ctx.household_from_person(ctx.pe_person("is_child"))

    if count_spec.endswith("+"):
        wanted = children_per_hh >= int(count_spec[:-1])
    else:
        wanted = children_per_hh == int(count_spec)

    return (on_uc & wanted).astype(float)


def compute_uc_by_family_type(target, ctx) -> np.ndarray | None:
    """UC claimant households filtered by benefit-unit family type.

    Returns None for target names not covered by the four published
    family-type categories.
    """
    ft_str = target.name.split("dwp/uc/claimants_")[1]

    on_uc = (
        ctx.household_from_family(
            ctx.sim.calculate("universal_credit") > 0
        )
        > 0
    )
    family_type = ctx.sim.calculate("family_type").values  # benunit level
    children_per_hh = ctx.household_from_person(ctx.pe_person("is_child"))

    def hh_has(ft):
        return ctx.household_from_family(family_type == ft) > 0

    if ft_str == "single_no_children":
        selected = hh_has("SINGLE") & (children_per_hh == 0)
    elif ft_str == "single_with_children":
        selected = (hh_has("SINGLE") | hh_has("LONE_PARENT")) & (
            children_per_hh > 0
        )
    elif ft_str == "couple_no_children":
        selected = hh_has("COUPLE_NO_CHILDREN")
    elif ft_str == "couple_with_children":
        selected = hh_has("COUPLE_WITH_CHILDREN")
    else:
        return None

    return (on_uc & selected).astype(float)


def compute_uc_payment_dist(target, ctx) -> np.ndarray:
    """UC payment-distribution cell: annual payment band x family type.

    Names look like
    "dwp/uc_payment_dist/{family_type}_annual_payment_{lower}_to_{upper}";
    the numeric band comes from the target's lower/upper bounds.
    """
    stripped = target.name.removeprefix("dwp/uc_payment_dist/")
    family_type = stripped[: stripped.index("_annual_payment_")]

    payments = ctx.sim.calculate("universal_credit", map_to="benunit").values
    fam_types = ctx.sim.calculate("family_type", map_to="benunit").values

    in_cell = (
        (payments >= target.lower_bound)
        & (payments < target.upper_bound)
        & (fam_types == family_type)
    )
    return ctx.household_from_family(in_cell)


def compute_uc_jobseeker(target, ctx) -> np.ndarray:
    """UC jobseeker / non-jobseeker splits (household counts or GBP)."""
    family = ctx.sim.populations["benunit"]
    uc = ctx.sim.calculate("universal_credit")
    on_uc = uc > 0
    any_unemployed = family.any(
        ctx.sim.calculate("employment_status") == "UNEMPLOYED"
    )

    if "non_jobseekers" in target.name:
        selected = on_uc * ~any_unemployed
    else:
        selected = on_uc * any_unemployed

    if "_count" in target.name:
        return ctx.household_from_family(selected)
    return ctx.household_from_family(uc * selected)


def compute_uc_outside_cap(target, ctx) -> np.ndarray:
    """OBR UC spend restricted to households not hit by the benefit cap."""
    uc_per_hh = ctx.household_from_family(
        ctx.sim.calculate("universal_credit")
    )
    cap_reduction = ctx.sim.calculate(
        "benefit_cap_reduction", map_to="household"
    ).values
    return uc_per_hh * (cap_reduction == 0)


def compute_two_child_limit(target, ctx) -> np.ndarray | None:
    """Two-child-limit cross-tabulations.

    Covers affected households/children, splits by household child count,
    and disability (adult PIP / disabled child element) cross-tabs.
    Returns None for unrecognised target names.
    """
    name = target.name
    sim = ctx.sim

    is_child = sim.calculate("is_child").values
    affected_child = (
        sim.map_result(
            sim.calculate("uc_is_child_limit_affected", map_to="household"),
            "household",
            "person",
        )
        > 0
    ) * is_child
    child_on_uc = (
        sim.calculate("universal_credit", map_to="person").values > 0
    )
    capped_children_per_hh = sim.map_result(
        affected_child * child_on_uc, "person", "household"
    )
    capped_hh = (capped_children_per_hh > 0) * 1.0

    if name == "dwp/uc/two_child_limit/households_affected":
        return capped_hh
    if name == "dwp/uc/two_child_limit/children_affected":
        return capped_children_per_hh
    if name == "dwp/uc/two_child_limit/children_in_affected_households":
        # All children on UC (not only the limit-affected ones) in
        # capped households.
        all_uc_children = sim.map_result(
            is_child * child_on_uc, "person", "household"
        )
        return all_uc_children * capped_hh

    if "_children_households_total_children" in name:
        n = int(name.split("/")[-1].split("_")[0])
        children_count = sim.map_result(is_child, "person", "household")
        return (capped_hh * (children_count == n) * children_count).astype(
            float
        )
    if "_children_households" in name and "total" not in name:
        n = int(name.split("/")[-1].split("_")[0])
        children_count = sim.map_result(is_child, "person", "household")
        # Published tables top-code household size at "6 or more" children.
        size_match = (
            children_count >= 6 if n >= 6 else children_count == n
        )
        return (capped_hh * size_match).astype(float)

    if "adult_pip_households" in name:
        pip = sim.calculate("pip", map_to="household").values
        return (capped_hh * (pip > 0)).astype(float)
    if "adult_pip_children" in name:
        pip = sim.calculate("pip", map_to="household").values
        return (capped_children_per_hh * (pip > 0)).astype(float)
    if "disabled_child_element_households" in name:
        dce = sim.calculate(
            "uc_individual_disabled_child_element",
            map_to="household",
        ).values
        return (capped_hh * (dce > 0)).astype(float)
    if "disabled_child_element_children" in name:
        dce = sim.calculate(
            "uc_individual_disabled_child_element",
            map_to="household",
        ).values
        return (capped_children_per_hh * (dce > 0)).astype(float)

    return None


# --- policyengine_uk_data/targets/compute/council_tax.py ---
"""Council tax compute functions."""

import numpy as np


def compute_council_tax_band(target, ctx) -> np.ndarray:
    """Dwelling count in a region, optionally restricted to one CT band.

    Target names carry the region at position 2 and the band (or "total")
    at position 3 of the slash-separated name.
    """
    parts = target.name.split("/")
    region, band = parts[2], parts[3]

    in_region = ctx.sim.calculate("region").values == region

    if band == "total":
        return in_region.astype(float)

    in_band = ctx.sim.calculate("council_tax_band") == band
    return (in_band * in_region).astype(float)


def compute_obr_council_tax(target, ctx) -> np.ndarray:
    """OBR council tax receipts, optionally restricted to one country."""
    receipts = ctx.pe("council_tax")
    country = {
        "obr/council_tax_england": "ENGLAND",
        "obr/council_tax_scotland": "SCOTLAND",
        "obr/council_tax_wales": "WALES",
    }.get(target.name)
    if country is None:
        # "obr/council_tax" (and any unrecognised name): GB-wide receipts.
        return receipts
    return receipts * (ctx.country == country)
a/policyengine_uk_data/targets/compute/demographics.py b/policyengine_uk_data/targets/compute/demographics.py new file mode 100644 index 00000000..670b8072 --- /dev/null +++ b/policyengine_uk_data/targets/compute/demographics.py @@ -0,0 +1,79 @@ +"""Demographic target compute functions.""" + +import numpy as np + +_REGION_MAP = { + "NORTH_EAST": "north_east", + "SOUTH_EAST": "south_east", + "EAST_MIDLANDS": "east_midlands", + "WEST_MIDLANDS": "west_midlands", + "YORKSHIRE": "yorkshire_and_the_humber", + "EAST_OF_ENGLAND": "east", + "LONDON": "london", + "SOUTH_WEST": "south_west", + "NORTH_WEST": "north_west", + "WALES": "wales", + "SCOTLAND": "scotland", + "NORTHERN_IRELAND": "northern_ireland", +} +_REGION_INV = {v: k for k, v in _REGION_MAP.items()} + + +def compute_regional_age(target, ctx) -> np.ndarray | None: + """Compute person count in a region x age band.""" + name = target.name.removeprefix("ons/") + idx = name.index("_age_") + region_name = name[:idx] + age_part = name[idx + 5 :] + lower, upper = age_part.split("_") + lower, upper = int(lower), int(upper) + + pe_region = _REGION_INV.get(region_name) + if pe_region is None: + return None + + person_match = ( + (ctx.region.values == pe_region) + & (ctx.age >= lower) + & (ctx.age <= upper) + ) + return ctx.household_from_person(person_match) + + +def compute_gender_age(target, ctx) -> np.ndarray: + """Compute person count in a gender x age band.""" + name = target.name.removeprefix("ons/") + parts = name.split("_") + sex = parts[0] + lower = int(parts[1]) + upper = int(parts[2]) + + gender = ctx.sim.calculate("gender").values + sex_match = gender == ("FEMALE" if sex == "female" else "MALE") + age_match = (ctx.age >= lower) & (ctx.age <= upper) + return ctx.household_from_person(sex_match & age_match) + + +def compute_uk_population(target, ctx) -> np.ndarray: + """Compute UK total population column.""" + return ctx.household_from_person(ctx.age >= 0) + + +def compute_scotland_demographics(target, ctx) -> 
np.ndarray | None: + """Compute Scotland-specific demographic targets.""" + name = target.name + if name == "ons/scotland_children_under_16": + return ctx.household_from_person( + (ctx.region.values == "SCOTLAND") & (ctx.age < 16) + ) + if name == "ons/scotland_babies_under_1": + return ctx.household_from_person( + (ctx.region.values == "SCOTLAND") & (ctx.age < 1) + ) + if name == "ons/scotland_households_3plus_children": + is_child = ctx.pe_person("is_child") + children_per_hh = ctx.household_from_person(is_child) + return ( + (ctx.household_region == "SCOTLAND") & (children_per_hh >= 3) + ).astype(float) + return None diff --git a/policyengine_uk_data/targets/compute/households.py b/policyengine_uk_data/targets/compute/households.py new file mode 100644 index 00000000..be7686ac --- /dev/null +++ b/policyengine_uk_data/targets/compute/households.py @@ -0,0 +1,91 @@ +"""Household type and tenure compute functions.""" + +import numpy as np + + +def compute_household_type(target, ctx) -> np.ndarray | None: + """Compute household type count from ONS categories.""" + name = target.name.removeprefix("ons/") + ft = ctx.sim.calculate("family_type").values + is_child = ctx.pe_person("is_child") + children_per_hh = ctx.household_from_person(is_child) + age_hh_head = ctx.pe("age") + + def ft_hh(value): + return ctx.household_from_family(ft == value) > 0 + + if name == "lone_households_under_65": + return ( + ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head < 65) + ).astype(float) + if name == "lone_households_over_65": + return ( + ft_hh("SINGLE") & (children_per_hh == 0) & (age_hh_head >= 65) + ).astype(float) + if name == "unrelated_adult_households": + people_per_hh = ctx.household_from_person(np.ones_like(is_child)) + return ( + ft_hh("SINGLE") & (children_per_hh == 0) & (people_per_hh > 1) + ).astype(float) + if name == "couple_no_children_households": + return ft_hh("COUPLE_NO_CHILDREN").astype(float) + if name == "couple_under_3_children_households": + return ( 
+ ft_hh("COUPLE_WITH_CHILDREN") + & (children_per_hh >= 1) + & (children_per_hh <= 2) + ).astype(float) + if name == "couple_3_plus_children_households": + return (ft_hh("COUPLE_WITH_CHILDREN") & (children_per_hh >= 3)).astype( + float + ) + if name == "couple_non_dependent_children_only_households": + people_per_hh = ctx.household_from_person(np.ones_like(is_child)) + return (ft_hh("COUPLE_NO_CHILDREN") & (people_per_hh > 2)).astype( + float + ) + if name == "lone_parent_dependent_children_households": + return (ft_hh("LONE_PARENT") & (children_per_hh > 0)).astype(float) + if name == "lone_parent_non_dependent_children_households": + people_per_hh = ctx.household_from_person(np.ones_like(is_child)) + return ( + ft_hh("SINGLE") + & (children_per_hh == 0) + & (people_per_hh > 1) + & (age_hh_head >= 40) + ).astype(float) + if name == "multi_family_households": + n_benunits = ctx.pe("household_num_benunits") + return (n_benunits > 1).astype(float) + + return None + + +def compute_tenure(target, ctx) -> np.ndarray | None: + """Compute dwelling count by tenure type.""" + _TENURE_MAP = { + "tenure_england_owned_outright": "OWNED_OUTRIGHT", + "tenure_england_owned_with_mortgage": "OWNED_WITH_MORTGAGE", + "tenure_england_rented_privately": "RENT_PRIVATELY", + "tenure_england_social_rent": [ + "RENT_FROM_COUNCIL", + "RENT_FROM_HA", + ], + "tenure_england_total": None, + } + suffix = target.name.removeprefix("ons/") + pe_values = _TENURE_MAP.get(suffix) + if pe_values is None and suffix == "tenure_england_total": + return (ctx.country == "ENGLAND").astype(float) + if pe_values is None: + return None + + tenure = ctx.sim.calculate("tenure_type", map_to="household").values + in_england = ctx.country == "ENGLAND" + if isinstance(pe_values, list): + match = np.zeros_like(tenure, dtype=bool) + for v in pe_values: + match = match | (tenure == v) + else: + match = tenure == pe_values + return (match & in_england).astype(float) diff --git 
# --- policyengine_uk_data/targets/compute/income.py ---
"""Income and salary sacrifice compute functions."""

import numpy as np


def compute_income_band(target, ctx) -> np.ndarray:
    """Income variable (taxpayer count or GBP amount) within a total-income band.

    The band [lower_bound, upper_bound) is applied on total_income;
    target.is_count selects counts of people with the income vs amounts.
    """
    variable = target.variable
    income_df = ctx.sim.calculate_dataframe(["total_income", variable])
    in_band = (income_df.total_income >= target.lower_bound) & (
        income_df.total_income < target.upper_bound
    )

    if target.is_count:
        return ctx.household_from_person((income_df[variable] > 0) * in_band)
    return ctx.household_from_person(income_df[variable] * in_band)


def compute_ss_it_relief(target, ctx) -> np.ndarray:
    """Salary-sacrifice income tax relief, optionally by tax band.

    Relief is counterfactual (no salary sacrifice) income tax minus the
    baseline; the band is assigned on counterfactual adjusted net income.
    A name without a band keyword means the total (no mask).
    """
    it_base = ctx.sim.calculate("income_tax")
    it_cf = ctx.counterfactual_sim.calculate("income_tax", ctx.time_period)
    it_relief = it_cf - it_base

    ani_cf = ctx.counterfactual_sim.calculate(
        "adjusted_net_income", ctx.time_period
    )

    rates = ctx.sim.tax_benefit_system.parameters.gov.hmrc.income_tax.rates.uk
    basic_thresh = rates[0].threshold(ctx.time_period)
    higher_thresh = rates[1].threshold(ctx.time_period)
    additional_thresh = rates[2].threshold(ctx.time_period)

    name = target.name
    if "basic" in name:
        mask = (ani_cf > basic_thresh) & (ani_cf <= higher_thresh)
    elif "higher" in name:
        mask = (ani_cf > higher_thresh) & (ani_cf <= additional_thresh)
    elif "additional" in name:
        mask = ani_cf > additional_thresh
    else:
        mask = np.ones_like(it_relief, dtype=bool)

    return ctx.household_from_person(it_relief * mask)


def compute_ss_contributions(target, ctx) -> np.ndarray:
    """Total pension contributions made via salary sacrifice."""
    ss = ctx.sim.calculate("pension_contributions_via_salary_sacrifice")
    return ctx.household_from_person(ss)


def compute_ss_ni_relief(target, ctx) -> np.ndarray:
    """Salary-sacrifice NI relief (employee or employer side).

    The two sides differ only in the NI variable queried, so the branch
    selects the variable name rather than duplicating the calculation.
    """
    variable = "ni_employee" if "employee" in target.name else "ni_employer"
    ni_base = ctx.sim.calculate(variable)
    ni_cf = ctx.counterfactual_sim.calculate(variable, ctx.time_period)
    return ctx.household_from_person(ni_cf - ni_base)


def compute_esa(target, ctx) -> np.ndarray:
    """ESA spend: income-related plus contributory components."""
    return ctx.household_from_person(
        ctx.sim.calculate("esa_income")
    ) + ctx.household_from_person(ctx.sim.calculate("esa_contrib"))


# --- policyengine_uk_data/targets/compute/other.py ---
"""Miscellaneous compute functions (vehicles, housing, savings, SCP)."""

import numpy as np


def compute_vehicles(target, ctx) -> np.ndarray:
    """Household counts by number of vehicles owned (0 / 1 / 2+)."""
    vehicles = ctx.pe("num_vehicles")
    if target.name == "nts/households_no_vehicle":
        return (vehicles == 0).astype(float)
    if target.name == "nts/households_one_vehicle":
        return (vehicles == 1).astype(float)
    # Any other name (e.g. "nts/households_two_plus_vehicles"): 2+ vehicles.
    return (vehicles >= 2).astype(float)


def compute_housing(target, ctx) -> np.ndarray:
    """Housing targets: total mortgage repayments, or private rent paid."""
    if target.name == "housing/total_mortgage":
        return ctx.pe("mortgage_capital_repayment") + ctx.pe(
            "mortgage_interest_repayment"
        )
    tenure = ctx.sim.calculate("tenure_type", map_to="household").values
    return ctx.pe("rent") * (tenure == "RENT_PRIVATELY")


def compute_savings_interest(target, ctx) -> np.ndarray:
    """ONS savings interest income."""
    savings = ctx.sim.calculate("savings_interest_income")
    return ctx.household_from_person(savings)


def compute_scottish_child_payment(target, ctx) -> np.ndarray:
    """Scottish Child Payment spend."""
    scp = ctx.sim.calculate("scottish_child_payment")
    return ctx.household_from_person(scp)


# --- policyengine_uk_data/targets/sources/_common.py ---
"""Shared utilities for target source modules."""

from pathlib import Path

SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml"
STORAGE = Path(__file__).parents[2] / "storage"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)" " AppleWebKit/537.36"
    ),
}


def load_config() -> dict:
    """Parse sources.yaml, the registry's index of source URLs."""
    # Imported lazily so modules that only need STORAGE/HEADERS/to_float
    # do not require PyYAML at import time.
    import yaml

    with open(SOURCES_YAML) as f:
        return yaml.safe_load(f)


def to_float(val) -> float:
    """Convert a cell value to float, handling suppressed markers.

    Non-numeric values (e.g. "[z]", "[Not available]", None) map to 0.0.
    """
    if isinstance(val, (int, float)):
        return float(val)
    try:
        return float(val)
    except (ValueError, TypeError):
        return 0.0
b/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py index 4df4e48d..97ec499a 100644 --- a/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py +++ b/policyengine_uk_data/targets/sources/hmrc_salary_sacrifice.py @@ -8,51 +8,32 @@ import io import logging -from pathlib import Path import pandas as pd import requests -import yaml from policyengine_uk_data.targets.schema import Target, Unit +from policyengine_uk_data.targets.sources._common import ( + HEADERS, + load_config, + to_float, +) logger = logging.getLogger(__name__) -_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} - # Uprate 3% pa for wage growth from the base year _GROWTH = 1.03 _BASE_YEAR = 2024 # 2023-24 tax year → calendar 2024 -def _load_config(): - with open(_SOURCES_YAML) as f: - return yaml.safe_load(f) - - -def _to_float(val) -> float: - """Convert CSV value to float, handling suppressed '[z]' etc.""" - if isinstance(val, (int, float)): - return float(val) - try: - return float(val) - except (ValueError, TypeError): - return 0.0 - - def get_targets() -> list[Target]: - config = _load_config() + config = load_config() ref = config["hmrc"]["salary_sacrifice_table_6"] targets = [] try: r = requests.get( - ref, headers=_HEADERS, allow_redirects=True, timeout=30 + ref, headers=HEADERS, allow_redirects=True, timeout=30 ) r.raise_for_status() df = pd.read_csv(io.StringIO(r.content.decode("utf-8-sig"))) @@ -67,7 +48,7 @@ def get_targets() -> list[Target]: ] for _, row in ss_it.iterrows(): rate = row["tax_rate"] - val = _to_float(row["value_of_relief"]) + val = to_float(row["value_of_relief"]) if val <= 0: continue rate_key = rate.lower().replace(" ", "_") @@ -94,7 +75,7 @@ def get_targets() -> list[Target]: ] for _, row in ss_nics.iterrows(): nics_class = row["nics_relief_class"] - val = _to_float(row["value_of_relief"]) + val = to_float(row["value_of_relief"]) if 
val <= 0: continue if "employee" in str(nics_class).lower(): @@ -131,4 +112,23 @@ def get_targets() -> list[Target]: "Failed to download/parse HMRC salary sacrifice CSV: %s", e ) + # Total salary sacrifice contributions (SPP Review 2025: £24bn base) + _SS_CONTRIBUTIONS = { + y: 24e9 * _GROWTH ** max(0, y - _BASE_YEAR) + for y in range(_BASE_YEAR, 2030) + } + targets.append( + Target( + name="hmrc/salary_sacrifice_contributions", + variable="pension_contributions_via_salary_sacrifice", + source="hmrc", + unit=Unit.GBP, + values=_SS_CONTRIBUTIONS, + reference_url=( + "https://assets.publishing.service.gov.uk/media/" + "67ce0e7c08e764d17a5d3c21/2025_SPP_Review.pdf" + ), + ) + ) + return targets diff --git a/policyengine_uk_data/targets/sources/hmrc_spi.py b/policyengine_uk_data/targets/sources/hmrc_spi.py index a976c668..de993d49 100644 --- a/policyengine_uk_data/targets/sources/hmrc_spi.py +++ b/policyengine_uk_data/targets/sources/hmrc_spi.py @@ -18,21 +18,17 @@ import pandas as pd import requests -import yaml from policyengine_uk_data.targets.schema import Target, Unit +from policyengine_uk_data.targets.sources._common import ( + HEADERS, + STORAGE, + load_config, + to_float, +) logger = logging.getLogger(__name__) -_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" -_STORAGE = Path(__file__).parents[2] / "storage" - -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} - # Income bands in the SPI tables (lower bounds) _BAND_LOWER = [ 12_570, @@ -55,15 +51,10 @@ _SPI_YEAR = 2023 -def _load_config(): - with open(_SOURCES_YAML) as f: - return yaml.safe_load(f) - - @lru_cache(maxsize=1) def _download_ods(url: str) -> bytes: """Download an ODS file.""" - r = requests.get(url, headers=_HEADERS, allow_redirects=True, timeout=60) + r = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=60) r.raise_for_status() return r.content @@ -90,14 +81,14 @@ def _parse_table_36(ods_bytes: bytes) -> 
pd.DataFrame: data_rows.append( { "lower_bound": int(lower), - "self_employment_income_count": _to_float(df.iloc[i, 1]), - "self_employment_income_amount": _to_float(df.iloc[i, 2]), - "employment_income_count": _to_float(df.iloc[i, 4]), - "employment_income_amount": _to_float(df.iloc[i, 5]), - "state_pension_count": _to_float(df.iloc[i, 7]), - "state_pension_amount": _to_float(df.iloc[i, 8]), - "private_pension_income_count": _to_float(df.iloc[i, 10]), - "private_pension_income_amount": _to_float(df.iloc[i, 11]), + "self_employment_income_count": to_float(df.iloc[i, 1]), + "self_employment_income_amount": to_float(df.iloc[i, 2]), + "employment_income_count": to_float(df.iloc[i, 4]), + "employment_income_amount": to_float(df.iloc[i, 5]), + "state_pension_count": to_float(df.iloc[i, 7]), + "state_pension_amount": to_float(df.iloc[i, 8]), + "private_pension_income_count": to_float(df.iloc[i, 10]), + "private_pension_income_amount": to_float(df.iloc[i, 11]), } ) return pd.DataFrame(data_rows) @@ -123,24 +114,17 @@ def _parse_table_37(ods_bytes: bytes) -> pd.DataFrame: data_rows.append( { "lower_bound": int(lower), - "property_income_count": _to_float(df.iloc[i, 1]), - "property_income_amount": _to_float(df.iloc[i, 2]), - "savings_interest_income_count": _to_float(df.iloc[i, 4]), - "savings_interest_income_amount": _to_float(df.iloc[i, 5]), - "dividend_income_count": _to_float(df.iloc[i, 7]), - "dividend_income_amount": _to_float(df.iloc[i, 8]), + "property_income_count": to_float(df.iloc[i, 1]), + "property_income_amount": to_float(df.iloc[i, 2]), + "savings_interest_income_count": to_float(df.iloc[i, 4]), + "savings_interest_income_amount": to_float(df.iloc[i, 5]), + "dividend_income_count": to_float(df.iloc[i, 7]), + "dividend_income_amount": to_float(df.iloc[i, 8]), } ) return pd.DataFrame(data_rows) -def _to_float(val) -> float: - """Convert cell value to float, handling '[Not available]' etc.""" - if isinstance(val, (int, float)): - return float(val) - return 0.0 
- - INCOME_VARIABLES = [ "employment_income", "self_employment_income", @@ -157,7 +141,7 @@ def get_targets() -> list[Target]: Also reads incomes_projection.csv if available, which contains projected future year data generated by the microsimulation. """ - config = _load_config() + config = load_config() ref = config["hmrc"]["spi_collated"] targets = [] @@ -217,7 +201,7 @@ def get_targets() -> list[Target]: # Also read projected future years from incomes_projection.csv # if it exists (generated by utils/incomes_projection.py) - proj_path = _STORAGE / "incomes_projection.csv" + proj_path = STORAGE / "incomes_projection.csv" if proj_path.exists(): targets.extend(_read_projection_csv(proj_path, ref)) diff --git a/policyengine_uk_data/targets/sources/local_age.py b/policyengine_uk_data/targets/sources/local_age.py index 5cd7f744..0f74bc44 100644 --- a/policyengine_uk_data/targets/sources/local_age.py +++ b/policyengine_uk_data/targets/sources/local_age.py @@ -13,23 +13,20 @@ import pandas as pd +from policyengine_uk_data.targets.sources._common import STORAGE + logger = logging.getLogger(__name__) _CONST_DIR = ( - Path(__file__).parents[2] - / "datasets" - / "local_areas" - / "constituencies" - / "targets" + STORAGE.parent / "datasets" / "local_areas" / "constituencies" / "targets" ) _LA_DIR = ( - Path(__file__).parents[2] + STORAGE.parent / "datasets" / "local_areas" / "local_authorities" / "targets" ) -_STORAGE = Path(__file__).parents[2] / "storage" _REF = ( "https://www.ons.gov.uk/peoplepopulationandcommunity/" @@ -85,7 +82,7 @@ def get_la_age_targets() -> pd.DataFrame: def get_uk_total_population(year: int) -> float: """UK total population from demographics.csv (in persons, not thousands).""" - csv_path = _STORAGE / "demographics.csv" + csv_path = STORAGE / "demographics.csv" if not csv_path.exists(): return 69.9e6 # fallback demographics = pd.read_csv(csv_path) diff --git a/policyengine_uk_data/targets/sources/local_income.py 
b/policyengine_uk_data/targets/sources/local_income.py index 1e418313..695f790c 100644 --- a/policyengine_uk_data/targets/sources/local_income.py +++ b/policyengine_uk_data/targets/sources/local_income.py @@ -15,23 +15,20 @@ import pandas as pd +from policyengine_uk_data.targets.sources._common import STORAGE + logger = logging.getLogger(__name__) _CONST_DIR = ( - Path(__file__).parents[2] - / "datasets" - / "local_areas" - / "constituencies" - / "targets" + STORAGE.parent / "datasets" / "local_areas" / "constituencies" / "targets" ) _LA_DIR = ( - Path(__file__).parents[2] + STORAGE.parent / "datasets" / "local_areas" / "local_authorities" / "targets" ) -_STORAGE = Path(__file__).parents[2] / "storage" _REF = ( "https://www.gov.uk/government/statistics/" @@ -84,7 +81,7 @@ def get_national_income_projections(year: int) -> pd.DataFrame: Returns the incomes_projection.csv rows for the requested year, filtered to the above-personal-allowance band (12570+). """ - path = _STORAGE / "incomes_projection.csv" + path = STORAGE / "incomes_projection.csv" if not path.exists(): return pd.DataFrame() df = pd.read_csv(path) diff --git a/policyengine_uk_data/targets/sources/local_la_extras.py b/policyengine_uk_data/targets/sources/local_la_extras.py index fea211ca..76bcf06d 100644 --- a/policyengine_uk_data/targets/sources/local_la_extras.py +++ b/policyengine_uk_data/targets/sources/local_la_extras.py @@ -12,13 +12,12 @@ """ import logging -from pathlib import Path import pandas as pd -logger = logging.getLogger(__name__) +from policyengine_uk_data.targets.sources._common import STORAGE -_STORAGE = Path(__file__).parents[2] / "storage" +logger = logging.getLogger(__name__) # Uprating factors from FYE 2020 to 2025 (OBR Nov 2025 EFO) UPRATING_NET_INCOME_BHC_2020_TO_2025 = 1985.1 / 1467.6 @@ -44,7 +43,7 @@ def load_ons_la_income() -> pd.DataFrame: Returns DataFrame with columns: la_code, total_income, net_income_bhc, net_income_ahc (mean income per household, FYE 2020). 
""" - xlsx_path = _STORAGE / "local_authority_ons_income.xlsx" + xlsx_path = STORAGE / "local_authority_ons_income.xlsx" if not xlsx_path.exists(): logger.warning("ONS LA income file not found: %s", xlsx_path) return pd.DataFrame() @@ -85,7 +84,7 @@ def load_household_counts() -> pd.DataFrame: Returns DataFrame with columns: la_code, households. """ - path = _STORAGE / "la_count_households.xlsx" + path = STORAGE / "la_count_households.xlsx" if not path.exists(): logger.warning("LA household count file not found: %s", path) return pd.DataFrame() @@ -100,7 +99,7 @@ def load_tenure_data() -> pd.DataFrame: Returns DataFrame with columns: la_code, owned_outright_pct, owned_mortgage_pct, private_rent_pct, social_rent_pct. """ - path = _STORAGE / "la_tenure.xlsx" + path = STORAGE / "la_tenure.xlsx" if not path.exists(): logger.warning("LA tenure file not found: %s", path) return pd.DataFrame() @@ -131,7 +130,7 @@ def load_private_rents() -> pd.DataFrame: Returns DataFrame with columns: area_code, median_annual_rent. 
""" - path = _STORAGE / "la_private_rents_median.xlsx" + path = STORAGE / "la_private_rents_median.xlsx" if not path.exists(): logger.warning("LA private rent file not found: %s", path) return pd.DataFrame() diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py index 6a95e504..45513cff 100644 --- a/policyengine_uk_data/targets/sources/obr.py +++ b/policyengine_uk_data/targets/sources/obr.py @@ -12,18 +12,18 @@ import io import logging from functools import lru_cache -from pathlib import Path import openpyxl import requests -import yaml from policyengine_uk_data.targets.schema import Target, Unit +from policyengine_uk_data.targets.sources._common import ( + HEADERS, + load_config, +) logger = logging.getLogger(__name__) -_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" - # Financial year columns in OBR tables: C=2024-25, D=2025-26, ..., I=2030-31 # PolicyEngine convention: FY 2025-26 → calendar year 2025 (first year) _FY_COL_TO_YEAR = { @@ -36,22 +36,11 @@ "I": 2030, } -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} - - -def _load_config(): - with open(_SOURCES_YAML) as f: - return yaml.safe_load(f) - @lru_cache(maxsize=1) def _download_workbook(url: str) -> openpyxl.Workbook: """Download an xlsx from OBR and return an openpyxl workbook.""" - r = requests.get(url, headers=_HEADERS, allow_redirects=True, timeout=60) + r = requests.get(url, headers=HEADERS, allow_redirects=True, timeout=60) r.raise_for_status() return openpyxl.load_workbook(io.BytesIO(r.content), data_only=False) @@ -85,7 +74,7 @@ def _parse_receipts(wb: openpyxl.Workbook) -> list[Target]: the standard fiscal forecasting convention. Other receipts use Table 3.9 (cash basis) since they only appear there. 
""" - config = _load_config() + config = load_config() vintage = config["obr"]["vintage"] ref = config["obr"]["efo_receipts"] cols_34 = list(_FY_COL_TO_YEAR.keys()) @@ -169,7 +158,7 @@ def read_39(ws, row_num: int) -> dict[int, float]: def _parse_council_tax(wb: openpyxl.Workbook) -> list[Target]: """Parse Table 4.1 (council tax receipts) from expenditure xlsx.""" - config = _load_config() + config = load_config() vintage = config["obr"]["vintage"] ref = config["obr"]["efo_expenditure"] ws = wb["4.1"] @@ -233,7 +222,7 @@ def read_41(row_num: int) -> dict[int, float]: def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: """Parse Table 3.4 (income tax and NICs detail) for employee/employer.""" - config = _load_config() + config = load_config() vintage = config["obr"]["vintage"] ref = config["obr"]["efo_receipts"] ws = wb["3.4"] @@ -275,7 +264,7 @@ def _parse_nics(wb: openpyxl.Workbook) -> list[Target]: def _parse_welfare(wb: openpyxl.Workbook) -> list[Target]: """Parse Table 4.9 (welfare spending) from expenditure xlsx.""" - config = _load_config() + config = load_config() vintage = config["obr"]["vintage"] ref = config["obr"]["efo_expenditure"] ws = wb["4.9"] @@ -389,7 +378,7 @@ def read_49(row_num: int) -> dict[int, float]: def _parse_tv_licence(wb: openpyxl.Workbook) -> list[Target]: """Parse Table 4.19 (BBC) from expenditure xlsx.""" - config = _load_config() + config = load_config() vintage = config["obr"]["vintage"] ref = config["obr"]["efo_expenditure"] @@ -447,7 +436,7 @@ def _parse_tv_licence(wb: openpyxl.Workbook) -> list[Target]: def get_targets() -> list[Target]: - config = _load_config() + config = load_config() targets = [] try: diff --git a/policyengine_uk_data/targets/sources/ons_demographics.py b/policyengine_uk_data/targets/sources/ons_demographics.py index 3c48c38a..a51adf39 100644 --- a/policyengine_uk_data/targets/sources/ons_demographics.py +++ b/policyengine_uk_data/targets/sources/ons_demographics.py @@ -20,7 +20,6 @@ import logging import 
zipfile from functools import lru_cache -from pathlib import Path import pandas as pd import requests @@ -30,18 +29,10 @@ Target, Unit, ) +from policyengine_uk_data.targets.sources._common import HEADERS, STORAGE logger = logging.getLogger(__name__) -_SOURCES_YAML = Path(__file__).parent.parent / "sources.yaml" -_STORAGE = Path(__file__).parents[2] / "storage" - -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} - _UK_ZIP_URL = ( "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/" "populationandmigration/populationprojections/datasets/" @@ -88,7 +79,7 @@ def _download_uk_projection() -> pd.DataFrame: """Download and parse the UK principal population projection.""" r = requests.get( - _UK_ZIP_URL, headers=_HEADERS, allow_redirects=True, timeout=120 + _UK_ZIP_URL, headers=HEADERS, allow_redirects=True, timeout=120 ) r.raise_for_status() z = zipfile.ZipFile(io.BytesIO(r.content)) @@ -165,7 +156,7 @@ def _parse_regional_from_csv() -> list[Target]: This CSV was extracted from ONS subnational projections which lack a stable machine-readable download URL. 
""" - csv_path = _STORAGE / "demographics.csv" + csv_path = STORAGE / "demographics.csv" if not csv_path.exists(): logger.warning("demographics.csv not found, skipping regional") return [] diff --git a/policyengine_uk_data/targets/sources/ons_households.py b/policyengine_uk_data/targets/sources/ons_households.py index 88d51cb8..9fd4b49f 100644 --- a/policyengine_uk_data/targets/sources/ons_households.py +++ b/policyengine_uk_data/targets/sources/ons_households.py @@ -14,6 +14,7 @@ import requests from policyengine_uk_data.targets.schema import Target, Unit +from policyengine_uk_data.targets.sources._common import HEADERS logger = logging.getLogger(__name__) @@ -28,11 +29,6 @@ "birthsdeathsandmarriages/families/datasets/" "familiesandhouseholdsfamiliesandhouseholds" ) -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} # Table 7 rows: (row_number, target_name) # Row numbers are 1-indexed in the xlsx @@ -56,7 +52,7 @@ @lru_cache(maxsize=1) def _download_workbook() -> openpyxl.Workbook: - r = requests.get(_URL, headers=_HEADERS, allow_redirects=True, timeout=60) + r = requests.get(_URL, headers=HEADERS, allow_redirects=True, timeout=60) r.raise_for_status() return openpyxl.load_workbook(io.BytesIO(r.content), data_only=True) diff --git a/policyengine_uk_data/targets/sources/ons_savings.py b/policyengine_uk_data/targets/sources/ons_savings.py index 21edb0c0..3764f22e 100644 --- a/policyengine_uk_data/targets/sources/ons_savings.py +++ b/policyengine_uk_data/targets/sources/ons_savings.py @@ -14,22 +14,18 @@ import requests from policyengine_uk_data.targets.schema import Target, Unit +from policyengine_uk_data.targets.sources._common import HEADERS logger = logging.getLogger(__name__) _API_URL = "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea/data" _REF = "https://www.ons.gov.uk/economy/grossdomesticproductgdp/timeseries/haxv/ukea" -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 
(Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} def get_targets() -> list[Target]: try: r = requests.get( - _API_URL, headers=_HEADERS, allow_redirects=True, timeout=30 + _API_URL, headers=HEADERS, allow_redirects=True, timeout=30 ) r.raise_for_status() data = r.json() diff --git a/policyengine_uk_data/targets/sources/ons_tenure.py b/policyengine_uk_data/targets/sources/ons_tenure.py index 0ae4ccdd..1da49b95 100644 --- a/policyengine_uk_data/targets/sources/ons_tenure.py +++ b/policyengine_uk_data/targets/sources/ons_tenure.py @@ -19,6 +19,7 @@ Target, Unit, ) +from policyengine_uk_data.targets.sources._common import HEADERS logger = logging.getLogger(__name__) @@ -31,11 +32,6 @@ "https://www.ons.gov.uk/peoplepopulationandcommunity/" "housing/datasets/subnationaldwellingstockbytenureestimates" ) -_HEADERS = { - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" - ), -} # Tenure categories in the xlsx header → target name suffix _TENURE_COLS = { @@ -49,7 +45,7 @@ @lru_cache(maxsize=1) def _download_workbook() -> openpyxl.Workbook: - r = requests.get(_URL, headers=_HEADERS, allow_redirects=True, timeout=60) + r = requests.get(_URL, headers=HEADERS, allow_redirects=True, timeout=60) r.raise_for_status() return openpyxl.load_workbook(io.BytesIO(r.content), data_only=True) diff --git a/policyengine_uk_data/targets/sources/voa_council_tax.py b/policyengine_uk_data/targets/sources/voa_council_tax.py index 19c3d92a..0a0af5d1 100644 --- a/policyengine_uk_data/targets/sources/voa_council_tax.py +++ b/policyengine_uk_data/targets/sources/voa_council_tax.py @@ -8,21 +8,20 @@ """ import pandas as pd -from pathlib import Path from policyengine_uk_data.targets.schema import ( GeographicLevel, Target, Unit, ) +from policyengine_uk_data.targets.sources._common import STORAGE -_STORAGE = Path(__file__).parents[2] / "storage" _REF = "https://www.gov.uk/government/statistics/council-tax-stock-of-properties-2024" def get_targets() -> 
list[Target]: """Build council tax band targets from the CSV.""" - csv_path = _STORAGE / "council_tax_bands_2024.csv" + csv_path = STORAGE / "council_tax_bands_2024.csv" if not csv_path.exists(): return [] From 7d5ada9a2aaa13dcb62e9098d72ee599519caee1 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 18 Feb 2026 10:39:29 +0000 Subject: [PATCH 6/6] Format income.py with black Co-Authored-By: Claude Opus 4.6 --- policyengine_uk_data/targets/compute/income.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/policyengine_uk_data/targets/compute/income.py b/policyengine_uk_data/targets/compute/income.py index ea4c3613..c2f286c4 100644 --- a/policyengine_uk_data/targets/compute/income.py +++ b/policyengine_uk_data/targets/compute/income.py @@ -86,13 +86,12 @@ def compute_ss_headcount(target, ctx) -> np.ndarray: prices before applying the threshold. """ ss = ctx.sim.calculate("pension_contributions_via_salary_sacrifice") - uprating = pd.read_csv( - STORAGE_FOLDER / "uprating_factors.csv" - ).set_index("Variable") + uprating = pd.read_csv(STORAGE_FOLDER / "uprating_factors.csv").set_index( + "Variable" + ) row = "pension_contributions_via_salary_sacrifice" price_adj = ( - uprating.loc[row, "2023"] - / uprating.loc[row, str(ctx.time_period)] + uprating.loc[row, "2023"] / uprating.loc[row, str(ctx.time_period)] ) ss_base = ss * price_adj