diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..b9ca1ba3 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,11 @@ +- bump: patch + changes: + fixed: + - Fix salary sacrifice headcount regression caused by uprating mismatch. The + calibrator classified records above/below the 2k cap at uprated 2025 prices, + but the saved dataset stores values at 2023 prices where the classification + differs. Now evaluates SS amounts at base-year prices before applying the + threshold, so calibration matches what end-users see. Stage 2 imputation now + moves the full employee pension amount to SS (instead of capping at 2k) so + both above-cap and below-cap records are created for the calibrator. + - Remove xfail markers from salary sacrifice headcount tests. diff --git a/policyengine_uk_data/datasets/imputations/salary_sacrifice.py b/policyengine_uk_data/datasets/imputations/salary_sacrifice.py index 5d5ae657..7769f7a5 100644 --- a/policyengine_uk_data/datasets/imputations/salary_sacrifice.py +++ b/policyengine_uk_data/datasets/imputations/salary_sacrifice.py @@ -179,43 +179,47 @@ def impute_salary_sacrifice( imputed_ss, # Use imputed for non-respondents ) - # Stage 2: Headcount-targeted imputation for below-cap SS users. + # Stage 2: Headcount-targeted imputation for SS users. # ASHE data shows many more SS users than the FRS captures due to # self-reporting bias in auto-enrolment. Impute additional SS users # from pension contributors to create enough records for calibration - # to hit OBR headcount targets (7.7mn total, 4.3mn below £2,000). + # to hit OBR headcount targets (7.7mn total, 3.3mn above 2k, + # 4.3mn below 2k). Donors keep their full employee pension amount + # so those above 2k become above-cap records and the rest below-cap. person_weight = sim.calculate("person_weight").values employee_pension = dataset.person[ "employee_pension_contributions" ].values.copy() has_ss = final_ss > 0 - below_cap_ss = has_ss & (final_ss <= 2000) # Donor pool: employed pension contributors not already SS users is_donor = (employee_pension > 0) & ~has_ss & (employment_income > 0) - # Target ~4.3mn below-cap SS users (HMRC/ASHE estimate) - TARGET_BELOW_CAP = 4_300_000 - current_below_cap = (person_weight * below_cap_ss).sum() - shortfall = max(0, TARGET_BELOW_CAP - current_below_cap) + # Create enough SS records for the calibrator to work with. + # Target ~70% of the 7.7mn total so the calibrator can gently + # upweight rather than fight a large overshoot. + TARGET_TOTAL = 5_400_000 + current_total = (person_weight * has_ss).sum() + shortfall = max(0, TARGET_TOTAL - current_total) if shortfall > 0: donor_weighted = (person_weight * is_donor).sum() if donor_weighted > 0: - imputation_rate = min(0.8, shortfall / donor_weighted) + imputation_rate = min(0.5, shortfall / donor_weighted) rng = np.random.default_rng(seed=2024) newly_imputed = is_donor & ( rng.random(len(final_ss)) < imputation_rate ) - # Move up to £2,000 of employee pension to SS - ss_new = np.minimum(employee_pension, 2000.0) + # Move full employee pension to SS so the above/below + # 2k split reflects the natural pension distribution + ss_new = employee_pension.copy() final_ss = np.where(newly_imputed, ss_new, final_ss) # Reduce employee pension correspondingly dataset.person["employee_pension_contributions"] = np.where( newly_imputed, - employee_pension - ss_new, + 0.0, employee_pension, ) diff --git a/policyengine_uk_data/tests/test_salary_sacrifice_headcount.py b/policyengine_uk_data/tests/test_salary_sacrifice_headcount.py index 10ee93da..af0fdfd4 100644 --- a/policyengine_uk_data/tests/test_salary_sacrifice_headcount.py +++ b/policyengine_uk_data/tests/test_salary_sacrifice_headcount.py @@ -5,14 +5,9 @@ 7.7mn total SS users (3.3mn above 2k cap, 4.3mn below 2k cap) """ -import pytest - TOLERANCE = 0.15 # 15% relative tolerance -@pytest.mark.xfail( - reason="Will pass after recalibration with new headcount targets" -) def test_salary_sacrifice_total_users(baseline): """Test that total SS user count is close to 7.7mn.""" ss = baseline.calculate( @@ -33,9 +28,6 @@ def test_salary_sacrifice_total_users(baseline): ) -@pytest.mark.xfail( - reason="Will pass after recalibration with new headcount targets" -) def test_salary_sacrifice_below_cap_users(baseline): """Test that below-cap (<=2k) SS users are close to 4.3mn.""" ss = baseline.calculate( @@ -57,9 +49,6 @@ def test_salary_sacrifice_below_cap_users(baseline): ) -@pytest.mark.xfail( - reason="Will pass after recalibration with new headcount targets" -) def test_salary_sacrifice_above_cap_users(baseline): """Test that above-cap (>2k) SS users are close to 3.3mn.""" ss = baseline.calculate( diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py index 562897fd..1ef9f438 100644 --- a/policyengine_uk_data/utils/loss.py +++ b/policyengine_uk_data/utils/loss.py @@ -477,9 +477,25 @@ def pe_count(*variables): # Source: HMRC, "Salary sacrifice reform for pension contributions" # https://www.gov.uk/government/publications/salary-sacrifice-reform-for-pension-contributions-effective-from-6-april-2029 # 7.7mn total SS users (3.3mn above £2k cap, 4.3mn below £2k cap) - ss_has_contributions = ss_contributions > 0 - ss_below_cap = ss_has_contributions & (ss_contributions <= 2000) - ss_above_cap = ss_has_contributions & (ss_contributions > 2000) + # The £2,000 cap is defined at 2023-24 FRS prices. The dataset is + # uprated to 2025 for calibration then downrated back to 2023 for + # saving. To keep the above/below classification consistent across + # price years, evaluate SS amounts at 2023-24 base-year prices. + ss_uprating_factors = pd.read_csv( + STORAGE_FOLDER / "uprating_factors.csv" + ).set_index("Variable") + ss_price_adjustment = ( + ss_uprating_factors.loc[ + "pension_contributions_via_salary_sacrifice", "2023" + ] + / ss_uprating_factors.loc[ + "pension_contributions_via_salary_sacrifice", str(time_period) + ] + ) + ss_at_base_prices = ss_contributions * ss_price_adjustment + ss_has_contributions = ss_at_base_prices > 0 + ss_below_cap = ss_has_contributions & (ss_at_base_prices <= 2000) + ss_above_cap = ss_has_contributions & (ss_at_base_prices > 2000) df["obr/salary_sacrifice_users_total"] = household_from_person( ss_has_contributions