Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
changed:
- Replaced ad-hoc calibration targets with structured registry and source modules.
11 changes: 4 additions & 7 deletions policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from policyengine_uk_data.storage import STORAGE_FOLDER
import logging
import os
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.utils.uprating import uprate_dataset
from policyengine_uk_data.utils.progress import (
ProcessingProgress,
Expand Down Expand Up @@ -44,7 +43,6 @@ def main():
update_dataset,
nested_progress,
):

# Create base FRS dataset
update_dataset("Create base FRS dataset", "processing")
frs = create_frs(
Expand Down Expand Up @@ -107,9 +105,6 @@ def main():
update_dataset("Uprate to 2025", "completed")

# Calibrate constituency weights with nested progress
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
calibrate,
)

update_dataset("Calibrate constituency weights", "processing")

Expand All @@ -119,7 +114,9 @@ def main():
)
from policyengine_uk_data.datasets.local_areas.constituencies.loss import (
create_constituency_target_matrix,
create_national_target_matrix,
)
from policyengine_uk_data.targets.build_loss_matrix import (
create_target_matrix as create_national_target_matrix,
)
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
get_performance,
Expand Down Expand Up @@ -149,7 +146,7 @@ def main():
)

# Run calibration with verbose progress
frs_calibrated_las = calibrate_local_areas(
calibrate_local_areas(
dataset=frs,
epochs=epochs,
matrix_fn=create_local_authority_target_matrix,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from policyengine_uk_data.utils.calibrate import calibrate_local_areas
from policyengine_uk_data.datasets.local_areas.constituencies.loss import (
create_constituency_target_matrix,
create_national_target_matrix,
)
from policyengine_uk_data.targets.build_loss_matrix import (
create_target_matrix as create_national_target_matrix,
)
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset
Expand Down
144 changes: 57 additions & 87 deletions policyengine_uk_data/datasets/local_areas/constituencies/loss.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,36 @@
"""Constituency-level calibration target matrix.

Constructs the (matrix, y, country_mask) triple for calibrating
household weights across 650 parliamentary constituencies. Target
data is loaded from source modules in the targets system.

Sources:
- Age: ONS mid-year population estimates
- Income: HMRC SPI table 3.15
- UC: DWP Stat-Xplore
"""

from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np
from pathlib import Path

from policyengine_uk_data.utils.loss import (
create_target_matrix as create_national_target_matrix,
)
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk_data.datasets.local_areas.constituencies.boundary_changes.mapping_matrix import (
mapping_matrix,
)
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.utils.uc_data import uc_pc_households

FOLDER = Path(__file__).parent
from policyengine_uk_data.targets.sources.local_age import (
get_constituency_age_targets,
get_uk_total_population,
)
from policyengine_uk_data.targets.sources.local_income import (
get_constituency_income_targets,
get_national_income_projections,
INCOME_VARIABLES,
)
from policyengine_uk_data.targets.sources.local_uc import (
get_constituency_uc_targets,
)


def create_constituency_target_matrix(
Expand All @@ -23,26 +40,18 @@ def create_constituency_target_matrix(
):
if time_period is None:
time_period = dataset.time_period
ages = pd.read_csv(FOLDER / "targets" / "age.csv")
national_demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv")
incomes = pd.read_csv(FOLDER / "targets" / "spi_by_constituency.csv")

sim = Microsimulation(dataset=dataset, reform=reform)
sim.default_calculation_period = dataset.time_period

national_incomes = pd.read_csv(STORAGE_FOLDER / "incomes_projection.csv")
national_incomes = national_incomes[
national_incomes.year
== max(national_incomes.year.min(), int(dataset.time_period))
]

matrix = pd.DataFrame()
y = pd.DataFrame()

INCOME_VARIABLES = [
"self_employment_income",
"employment_income",
]
# ── Income targets ─────────────────────────────────────────────
incomes = get_constituency_income_targets()
national_incomes = get_national_income_projections(
int(dataset.time_period)
)

for income_variable in INCOME_VARIABLES:
income_values = sim.calculate(income_variable).values
Expand All @@ -56,84 +65,50 @@ def create_constituency_target_matrix(
(national_incomes.total_income_lower_bound == 12_570)
& (national_incomes.total_income_upper_bound == np.inf)
][income_variable + "_amount"].iloc[0]
national_consistency_adjustment_factor = (
national_target / local_target_sum
)
y[f"hmrc/{income_variable}/amount"] = (
local_targets * national_consistency_adjustment_factor
)
adjustment = national_target / local_target_sum
y[f"hmrc/{income_variable}/amount"] = local_targets * adjustment

matrix[f"hmrc/{income_variable}/count"] = sim.map_result(
(income_values != 0) * in_spi_frame, "person", "household"
)
local_targets = incomes[f"{income_variable}_count"].values
local_target_sum = local_targets.sum()
national_target = national_incomes[
(national_incomes.total_income_lower_bound == 12_570)
& (national_incomes.total_income_upper_bound == np.inf)
][income_variable + "_count"].iloc[0]
y[f"hmrc/{income_variable}/count"] = (
incomes[f"{income_variable}_count"].values
* national_consistency_adjustment_factor
incomes[f"{income_variable}_count"].values * adjustment
)

uk_total_population = (
national_demographics[national_demographics.name == "uk_population"][
str(time_period)
].values[0]
* 1e6
)
# ── Age targets ────────────────────────────────────────────────
age_targets = get_constituency_age_targets()
uk_total_population = get_uk_total_population(int(time_period))

age = sim.calculate("age").values
targets_total_pop = 0
for lower_age in range(0, 80, 10):
upper_age = lower_age + 10

in_age_band = (age >= lower_age) & (age < upper_age)

age_str = f"{lower_age}_{upper_age}"
matrix[f"age/{age_str}"] = sim.map_result(
in_age_band, "person", "household"
)

age_count = ages[
[str(age) for age in range(lower_age, upper_age)]
].sum(axis=1)

age_str = f"{lower_age}_{upper_age}"
y[f"age/{age_str}"] = age_count.values
targets_total_pop += age_count.values.sum()

# Adjust for consistency
for lower_age in range(0, 80, 10):
upper_age = lower_age + 10

in_age_band = (age >= lower_age) & (age < upper_age)

age_str = f"{lower_age}_{upper_age}"
y[f"age/{age_str}"] *= uk_total_population / targets_total_pop * 0.9

# UC household count by constituency
y["uc_households"] = uc_pc_households.household_count.values
age_cols = [c for c in age_targets.columns if c.startswith("age/")]
for col in age_cols:
lower, upper = col.removeprefix("age/").split("_")
lower, upper = int(lower), int(upper)
in_band = (age >= lower) & (age < upper)
matrix[col] = sim.map_result(in_band, "person", "household")
y[col] = age_targets[col].values
targets_total_pop += age_targets[col].values.sum()

# National consistency adjustment
for col in age_cols:
y[col] *= uk_total_population / targets_total_pop * 0.9

# ── UC targets ─────────────────────────────────────────────────
y["uc_households"] = get_constituency_uc_targets().values
matrix["uc_households"] = sim.map_result(
(sim.calculate("universal_credit").values > 0).astype(int),
"benunit",
"household",
)

# ── Boundary mapping (2010 → 2024) ────────────────────────────
const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")
const_2010 = pd.read_csv(STORAGE_FOLDER / "constituencies_2010.csv")

y_2010 = y.copy()
y_2010["name"] = const_2010["name"].values

y_columns = list(y.columns)
y_values = mapping_matrix @ y.values # Transform to 2024 constituencies

y_values = mapping_matrix @ y.values
y = pd.DataFrame(y_values, columns=y_columns)

y_2024 = y.copy()
y_2024["name"] = const_2024["name"].values

country_mask = create_country_mask(
household_countries=sim.calculate("country").values,
codes=const_2024.code,
Expand All @@ -144,21 +119,16 @@ def create_constituency_target_matrix(
def create_country_mask(
    household_countries: np.ndarray, codes: pd.Series
) -> np.ndarray:
    """Build the area-by-household country mask.

    ``R[i, j]`` is 1.0 when household ``j`` is in the same country as
    area ``i`` and 0.0 otherwise.

    Args:
        household_countries: One country name per household, spelled as in
            the mapping below (e.g. ``"ENGLAND"``).
        codes: Area codes whose first character identifies the country
            (``E``, ``W``, ``S`` or ``N``). Any index is accepted; rows of
            the result follow the positional order of ``codes``.

    Returns:
        Float array of shape ``(len(codes), len(household_countries))``.
        Areas whose code prefix is not in the mapping get an all-zero row.
    """
    country_by_prefix = {
        "E": "ENGLAND",
        "W": "WALES",
        "S": "SCOTLAND",
        "N": "NORTHERN_IRELAND",
    }
    area_countries = codes.apply(lambda code: code[0]).map(country_by_prefix)

    r = np.zeros((len(codes), len(household_countries)))
    # Walk the areas positionally: label-based lookups such as
    # ``area_countries[i]`` break when ``codes`` has a non-default index
    # (for example after filtering a larger frame).
    for i, area_country in enumerate(area_countries):
        r[i] = household_countries == area_country
    return r
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from policyengine_uk_data.utils.calibrate import calibrate_local_areas
from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
create_local_authority_target_matrix,
create_national_target_matrix,
)
from policyengine_uk_data.targets.build_loss_matrix import (
create_target_matrix as create_national_target_matrix,
)
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset
Expand Down
Loading