Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 95 additions & 49 deletions policyengine_uk_data/datasets/imputations/student_loans.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,124 @@
"""
Student loan plan imputation.

This module imputes the student_loan_plan variable based on:
- Whether the person has reported student loan repayments
- Their estimated university attendance year (inferred from age)

The imputation assigns plan types according to when the loan system changed:
- NONE: No reported repayments
- PLAN_1: Started university before September 2012
- PLAN_2: Started September 2012 - August 2023
- PLAN_5: Started September 2023 onwards

This enables policyengine-uk's student_loan_repayment variable to calculate
repayments using official threshold parameters.
Assigns Plan 1, 2 and 5 based on age-cohort eligibility and HE participation
rates, regardless of whether the person currently has repayments > 0. This
correctly captures below-threshold borrowers who will start repaying as incomes
rise under uprating.

Plan boundaries:
Plan 1: started uni before Sept 2012 (ages ~34+ in 2023)
Plan 2: started uni Sept 2012-Aug 2023 (ages ~19-33 in 2023)
Plan 5: started uni Sept 2023+ (near-zero in 2023)

HE participation rates by age are derived from HESA data and calibrated so
that total imputed Plan 2 holders (~5.9M GB) is consistent with the DfE
forecast of ~7.4M England graduates with outstanding Plan 2 loans (2024-25),
scaled to GB and adjusted for FRS coverage (~80% of graduates).

Within each age group, we assign the loan plan to the highest-income people
first, reflecting that graduate earnings are above-average.
"""

import numpy as np
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation

# Share of each single-year age group holding an outstanding Plan 2 loan
# (GB, 2023), keyed by age.
#
# Calibration: the DfE forecast has 7.44M England graduates with Plan 2
# outstanding in 2024-25. Scaling to GB (÷0.84) and adjusting for FRS coverage
# (~80%) gives roughly 7.44 / 0.84 * 0.80 = ~7.1M. The FRS population aged
# 19-34 is 13.9M, so the rates below are set to a level that is realistic
# given actual HE participation for the 2012-2022 cohort.
#
# Rates peak at ages 24-28 (graduates 2-6 years post-study, most still
# repaying).
_PLAN_2_PARTICIPATION = {
    19: 0.09,
    20: 0.16,
    21: 0.32,
    22: 0.44,
    23: 0.48,
    24: 0.58,
    25: 0.60,
    26: 0.58,
    27: 0.55,
    28: 0.53,
    29: 0.50,
    30: 0.46,
    31: 0.44,
    32: 0.40,
    33: 0.35,
    34: 0.29,
}

# Share of each single-year age group holding an outstanding Plan 1 loan
# (pre-2012 starters), keyed by age.
#
# Calibrated to ~3.5M GB total outstanding loans. HESA pre-2012 entry was
# ~200-280k/yr for England, across 14 active cohorts (1998-2011).
# Rates taper at older ages as loans are paid off or written off at age 65.
_PLAN_1_PARTICIPATION = {
    34: 0.37,
    35: 0.37,
    36: 0.35,
    37: 0.34,
    38: 0.32,
    39: 0.30,
    40: 0.29,
    41: 0.27,
    42: 0.25,
    43: 0.24,
    44: 0.22,
    45: 0.20,
    46: 0.17,
    47: 0.13,
    48: 0.12,
    49: 0.10,
    50: 0.08,
    51: 0.07,
    52: 0.05,
    53: 0.03,
    54: 0.03,
    55: 0.02,
}


def impute_student_loan_plan(
    dataset: UKSingleYearDataset,
    year: int = 2023,
) -> UKSingleYearDataset:
    """Impute student loan plan type from age-cohort eligibility and income rank.

    Within each age listed in the participation tables, the plan is given to
    the highest-``employment_income`` people, up to that age's participation
    rate. Assignment does not depend on reported repayments, so both above-
    and below-threshold borrowers are labelled and uprating can activate
    repayments as incomes grow.

    Plan 5 is not assigned here (near-zero in 2023: the first cohort only
    started in Sept 2023), and Plan 4 (Scottish loans) is not imputed.

    Args:
        dataset: PolicyEngine UK dataset.
        year: FRS survey year. The body does not currently read it (the
            participation tables are keyed by age in the 2023 survey); it is
            kept so existing callers that pass it keep working.

    Returns:
        A copy of ``dataset`` whose person table has a ``student_loan_plan``
        column holding "NONE", "PLAN_1" or "PLAN_2".
    """
    dataset = dataset.copy()
    sim = Microsimulation(dataset=dataset)

    age = sim.calculate("age").values.astype(int)
    income = sim.calculate("employment_income").values

    # StudentLoanPlan is a string enum; everyone starts with no plan.
    plan = np.full(len(age), "NONE", dtype=object)

    def assign_plan(participation_rates, plan_label):
        """Label the top earners of each listed age with ``plan_label``."""
        for a, rate in participation_rates.items():
            idx = np.where(age == a)[0]
            if len(idx) == 0:
                continue
            # Unweighted record count; any non-empty age group gets at least
            # one assignment.
            n_assign = max(1, round(len(idx) * rate))
            # Highest income first.
            ranked = idx[np.argsort(income[idx])[::-1]]
            plan[ranked[:n_assign]] = plan_label

    # Plan 2 is assigned second, so at any age present in both tables it
    # overwrites the Plan 1 label on the top earners of that age.
    # NOTE(review): age 34 is in both tables -- confirm the overwrite is intended.
    assign_plan(_PLAN_1_PARTICIPATION, "PLAN_1")
    assign_plan(_PLAN_2_PARTICIPATION, "PLAN_2")

    dataset.person["student_loan_plan"] = plan

    return dataset
136 changes: 135 additions & 1 deletion policyengine_uk_data/utils/uprating.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,134 @@ def uprate_values(values, variable_name, start_year=2020, end_year=2034):
return values * relative_change


# FRS-weighted GB targets, in millions, for total outstanding Plan 2 and
# Plan 5 borrowers (including those below the repayment threshold).
#
# Derivation: DfE student loan forecasts (England), scaled to GB (÷0.84) and
# adjusted for FRS coverage (55.9% of total outstanding borrowers captured,
# calibrated from the 2023 base).
#
# Plan 2 closed to new entrants after Sept 2023; growth reflects new cohorts
# becoming graduates and entering outstanding-loan status. The original note
# gives the Plan 2 write-off term as 40 years, so no significant decline is
# expected within this window; post-2030 values are extrapolated as very
# gently declining.
# NOTE(review): confirm the 40-year term against the official Plan 2 rules.
#
# Plan 5 is the new post-2023 cohort. Post-2030 values are extrapolated at
# +1.1M England/yr (growth decelerating).
#
# All figures from 2030 onwards are extrapolated from the DfE trend.
_PLAN_TARGETS = {
    # year: (plan_2_millions, plan_5_millions)
    2024: (5.950, 0.007),
    2025: (5.949, 0.153),
    2026: (6.462, 0.419),
    2027: (6.894, 0.918),
    2028: (7.064, 1.571),
    2029: (7.054, 2.263),
    2030: (7.004, 2.995),
    2031: (6.954, 3.727),
    2032: (6.904, 4.459),
    2033: (6.854, 5.191),
    2034: (6.804, 5.923),
}

# Plan 1 write-off cutoff, expressed as an age in the base 2023 dataset.
#
# Rule being approximated: the loan is written off 25 years after the April
# following graduation. Assuming entry at age 18 and a 3-year course, someone
# who started in year S is written off in S + 28 and is aged (2041 - S) in
# the 2023 data, so by simulation year Y the loan is gone when
# base age >= 2069 - Y.
# Example: started 1998 (age 18), graduated 2001, written off April 2026;
# that person is 43 in the 2023 data and 2069 - 2026 = 43.
# This is a simplification intended for the 1998-2011 cohort.
def _PLAN_1_WRITEOFF_AGE(year):
    """Return the lowest base-dataset age whose Plan 1 loan is written off by *year*.

    Written as a ``def`` rather than a lambda bound to a name (PEP 8). The
    upper-case name is kept because existing code calls it by that name.
    """
    return 2069 - year


def _promote_to_plan(
    plan, income, weights, eligible_mask, target_weighted_millions, plan_label
):
    """Promote the highest-income eligible "NONE" people to ``plan_label``.

    People are promoted in descending income order until the weight promoted
    in this call reaches the shortfall between the target and the current
    weighted total for ``plan_label``. The person who crosses the shortfall is
    included, so the result can overshoot the target by up to one weight.

    Args:
        plan: object array of plan labels, one per person. Modified in place.
        income: array of incomes used for ranking, aligned with ``plan``.
        weights: array of person weights, aligned with ``plan``.
        eligible_mask: boolean array marking who may be promoted.
        target_weighted_millions: target weighted total, in millions.
        plan_label: label to assign, e.g. "PLAN_2".

    Returns:
        The same ``plan`` array, after any promotions.
    """
    import numpy as np

    target = target_weighted_millions * 1e6
    shortfall = target - weights[plan == plan_label].sum()
    if shortfall <= 0:
        return plan

    candidates = np.flatnonzero(eligible_mask & (plan == "NONE"))
    if candidates.size == 0:
        return plan

    # Rank candidates by income descending -- highest earners promoted first.
    ranked = candidates[np.argsort(income[candidates])[::-1]]

    # Weight already promoted before each candidate's turn. The cut-off is
    # the first candidate for whom that running total has reached the
    # shortfall, which is the stopping rule of a sequential accumulate-and-
    # break loop without iterating record by record in Python.
    running = np.cumsum(weights[ranked], dtype=float)
    promoted_before = np.concatenate(([0.0], running[:-1]))
    reached = np.flatnonzero(promoted_before >= shortfall)
    n_promote = reached[0] if reached.size else ranked.size

    plan[ranked[:n_promote]] = plan_label
    return plan


def _demote_from_plan(plan, income, weights, plan_label, target_weighted_millions):
    """Demote the lowest-income ``plan_label`` holders to "NONE".

    Used when the target falls (e.g. Plan 2 declining as loans are paid off
    post-2030). Holders are demoted in ascending income order until the weight
    demoted in this call reaches the excess of the current weighted total over
    the target. The person who crosses the excess is included, so the result
    can undershoot the target by up to one weight.

    Args:
        plan: object array of plan labels, one per person. Modified in place.
        income: array of incomes used for ranking, aligned with ``plan``.
        weights: array of person weights, aligned with ``plan``.
        plan_label: label whose holders may be demoted, e.g. "PLAN_2".
        target_weighted_millions: target weighted total, in millions.

    Returns:
        The same ``plan`` array, after any demotions.
    """
    import numpy as np

    target = target_weighted_millions * 1e6
    excess = weights[plan == plan_label].sum() - target
    if excess <= 0:
        return plan

    holders = np.flatnonzero(plan == plan_label)
    # Lowest income first.
    ranked = holders[np.argsort(income[holders])]

    # Weight already demoted before each holder's turn. Stop at the first
    # holder for whom that running total has reached the excess.
    running = np.cumsum(weights[ranked], dtype=float)
    demoted_before = np.concatenate(([0.0], running[:-1]))
    reached = np.flatnonzero(demoted_before >= excess)
    n_demote = reached[0] if reached.size else ranked.size

    plan[ranked[:n_demote]] = "NONE"
    return plan


def _roll_student_loan_plans(dataset, year, weights):
    """Advance student loan plan assignments to match forecast targets.

    - Plan 1: write off loans where age >= (2069 - year) in the base dataset,
      reflecting the 25-year loan term for the pre-2012 cohort.
    - Plan 2: if the weighted total is below the year's target, promote
      "NONE" people in the 2012-2022 age band by income rank; otherwise
      demote the lowest-income holders down to the target.
    - Plan 5: promote "NONE" people in the post-2023 age band by income rank.
    - Plan 4 and Postgraduate: unchanged.

    Years absent from ``_PLAN_TARGETS`` only get the Plan 1 write-off.

    Args:
        dataset: dataset whose person table has ``age``, ``employment_income``
            and ``student_loan_plan`` columns.
        year: simulation year being rolled to.
        weights: array of person weights aligned with the person table.

    Returns:
        The same ``dataset`` object, with ``student_loan_plan`` replaced.
    """
    import numpy as np

    age = np.array(dataset.person["age"][:]).astype(int)
    income = np.array(dataset.person["employment_income"][:])
    plan = np.array(dataset.person["student_loan_plan"][:], dtype=object)

    # Plan 1: write off loans for the cohort beyond its 25-year term.
    writeoff_age = _PLAN_1_WRITEOFF_AGE(year)
    plan[(plan == "PLAN_1") & (age >= writeoff_age)] = "NONE"

    # NOTE(review): the write-off cutoff is defined on base-dataset age, while
    # the bands below are expressed as ages in `year`. Confirm which of the
    # two the stored `age` column represents when this runs.
    if year in _PLAN_TARGETS:
        target_p2, target_p5 = _PLAN_TARGETS[year]

        # Plan 2: started uni 2012-2022, in year Y ages (Y-2004) to (Y-1994).
        p2_eligible = (age >= year - 2004) & (age <= year - 1994)
        # Run exactly one direction. Promotion stops just past the target, so
        # an unconditional demotion afterwards would strip the lowest-income
        # holders every year even while the target is rising.
        p2_current = weights[plan == "PLAN_2"].sum()
        if p2_current < target_p2 * 1e6:
            plan = _promote_to_plan(
                plan, income, weights, p2_eligible, target_p2, "PLAN_2"
            )
        else:
            plan = _demote_from_plan(plan, income, weights, "PLAN_2", target_p2)

        # Plan 5: started uni 2023+, in year Y ages 18 to (Y-2005).
        p5_eligible = (age >= 18) & (age <= year - 2005)
        plan = _promote_to_plan(
            plan, income, weights, p5_eligible, target_p5, "PLAN_5"
        )

    dataset.person["student_loan_plan"] = plan
    return dataset


def uprate_dataset(dataset: UKSingleYearDataset, target_year=2034):
import numpy as np

dataset = dataset.copy()
uprating_factors = pd.read_csv(STORAGE_FOLDER / "uprating_factors.csv")
uprating_factors = uprating_factors.set_index("Variable")
start_year = dataset.time_period
start_year = int(dataset.time_period)

for table in dataset.tables:
for variable in table.columns:
Expand All @@ -74,6 +197,17 @@ def uprate_dataset(dataset: UKSingleYearDataset, target_year=2034):

dataset.time_period = target_year

if "student_loan_plan" in dataset.person.columns:
# Pre-compute person weights (household weight mapped to persons)
person_hh_id = dataset.person["person_household_id"][:]
hh_id = dataset.household["household_id"][:]
hh_weight = dataset.household["household_weight"][:]
weight_by_hh = dict(zip(hh_id, hh_weight))
weights = np.array([weight_by_hh[i] for i in person_hh_id])

for year in range(start_year + 1, target_year + 1):
dataset = _roll_student_loan_plans(dataset, year, weights)

return dataset


Expand Down
Loading