PolicyEngine · nwoodruff-co · Feb 18, 2026 · Feb 18, 2026 · Feb 18, 2026
diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -1,78 +1,124 @@
 """
 Student loan plan imputation.
 
-This module imputes the student_loan_plan variable based on:
-- Whether the person has reported student loan repayments
-- Their estimated university attendance year (inferred from age)
-
-The imputation assigns plan types according to when the loan system changed:
-- NONE: No reported repayments
-- PLAN_1: Started university before September 2012
-- PLAN_2: Started September 2012 - August 2023
-- PLAN_5: Started September 2023 onwards
-
-This enables policyengine-uk's student_loan_repayment variable to calculate
-repayments using official threshold parameters.
+Assigns Plan 1, 2 and 5 based on age-cohort eligibility and HE participation
+rates, regardless of whether the person currently has repayments > 0. This
+correctly captures below-threshold borrowers who will start repaying as incomes
+rise under uprating.
+
+Plan boundaries:
+  Plan 1: started uni before Sept 2012  (ages ~34+ in 2023)
+  Plan 2: started uni Sept 2012-Aug 2023 (ages ~19-33 in 2023)
+  Plan 5: started uni Sept 2023+         (near-zero in 2023)
+
+HE participation rates by age are derived from HESA data and calibrated so
+that total imputed Plan 2 holders (~5.9M GB) is consistent with the DfE
+forecast of ~7.4M England graduates with outstanding Plan 2 loans (2024-25),
+scaled to GB and adjusted for FRS coverage (~80% of graduates).
+
+Within each age group, we assign the loan plan to the highest-income people
+first, reflecting that graduate earnings are above-average.
 """
 
 import numpy as np
 from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk import Microsimulation
 
+# Fraction of each age group with an outstanding Plan 2 loan (GB, 2023).
+# Calibrated against DfE forecast: 7.44M England graduates with Plan 2 outstanding
+# in 2024-25, scaled to GB (÷0.84) and adjusted for FRS coverage (~80%).
+# Naive GB target: 7.44 / 0.84 * 0.80 = ~7.1M, but the FRS population aged 19-34
+# is only 13.9M, so rates instead follow actual 2012-2022 cohort HE participation.
+# Rates peak at ages 24-28 (graduates 2-6 years post-study, most still repaying).
+_PLAN_2_PARTICIPATION = {
+    19: 0.09,
+    20: 0.16,
+    21: 0.32,
+    22: 0.44,
+    23: 0.48,
+    24: 0.58,
+    25: 0.60,
+    26: 0.58,
+    27: 0.55,
+    28: 0.53,
+    29: 0.50,
+    30: 0.46,
+    31: 0.44,
+    32: 0.40,
+    33: 0.35,
+    34: 0.29,
+}
+
+# Plan 1: pre-2012 starters. Calibrated to ~3.5M GB total outstanding loans.
+# HESA pre-2012 entry ~200-280k/yr England; 14 active cohorts (1998-2011).
+# Rates taper at older ages as loans are paid off or written off at age 65.
+_PLAN_1_PARTICIPATION = {
+    34: 0.37,
+    35: 0.37,
+    36: 0.35,
+    37: 0.34,
+    38: 0.32,
+    39: 0.30,
+    40: 0.29,
+    41: 0.27,
+    42: 0.25,
+    43: 0.24,
+    44: 0.22,
+    45: 0.20,
+    46: 0.17,
+    47: 0.13,
+    48: 0.12,
+    49: 0.10,
+    50: 0.08,
+    51: 0.07,
+    52: 0.05,
+    53: 0.03,
+    54: 0.03,
+    55: 0.02,
+}
+
 
 def impute_student_loan_plan(
     dataset: UKSingleYearDataset,
-    year: int = 2025,
+    year: int = 2023,
 ) -> UKSingleYearDataset:
-    """
-    Impute student loan plan type based on age and reported repayments.
+    """Impute student loan plan type from age-cohort eligibility and income rank.
 
-    The plan type determines which repayment threshold applies:
-    - PLAN_1: £26,065 (2025), pre-Sept 2012 England/Wales
-    - PLAN_2: £29,385 (2026-2029 frozen), Sept 2012 - Aug 2023
-    - PLAN_4: Scottish loans (not imputed here - requires explicit flag)
-    - PLAN_5: £25,000 (2025), Sept 2023 onwards
+    Assigns plans to the highest-income people within each eligible age group,
+    up to the participation rate target. This captures both above- and
+    below-threshold borrowers, so that uprating correctly activates repayments
+    as incomes grow.
 
     Args:
-        dataset: PolicyEngine UK dataset with student_loan_repayments.
-        year: The simulation year, used to estimate university attendance.
+        dataset: PolicyEngine UK dataset.
+        year: FRS survey year (used to compute cohort start years).
 
     Returns:
         Dataset with imputed student_loan_plan values.
     """
     dataset = dataset.copy()
     sim = Microsimulation(dataset=dataset)
 
-    # Get required variables
-    age = sim.calculate("age").values
-    student_loan_repayments = sim.calculate("student_loan_repayments").values
-
-    # Determine if person has a student loan based on reported repayments
-    has_student_loan = student_loan_repayments > 0
+    age = sim.calculate("age").values.astype(int)
+    income = sim.calculate("employment_income").values
 
-    # Estimate when they started university (assume age 18)
-    # For simulation year Y and age A, university start year = Y - A + 18
-    estimated_uni_start_year = year - age + 18
+    n = len(age)
+    plan = np.full(n, "NONE", dtype=object)
 
-    # Assign plan types based on when loan system changed
-    # StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"
-    plan = np.full(len(age), "NONE", dtype=object)
+    def assign_plan(participation_rates, plan_label):
+        for a, rate in participation_rates.items():
+            age_mask = age == a
+            if age_mask.sum() == 0:
+                continue
+            idx = np.where(age_mask)[0]
+            n_assign = max(1, round(len(idx) * rate))
+            # Assign to highest-income people in this age group
+            ranked = idx[np.argsort(income[idx])[::-1]]
+            plan[ranked[:n_assign]] = plan_label
 
-    # Plan 1: Started before September 2012
-    plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012)
-    plan[plan_1_mask] = "PLAN_1"
+    assign_plan(_PLAN_1_PARTICIPATION, "PLAN_1")
+    assign_plan(_PLAN_2_PARTICIPATION, "PLAN_2")
+    # Plan 5: near-zero in 2023 (first cohort only just starting in Sept 2023)
 
-    # Plan 2: Started September 2012 - August 2023
-    plan_2_mask = has_student_loan & (
-        (estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023)
-    )
-    plan[plan_2_mask] = "PLAN_2"
-
-    # Plan 5: Started September 2023 onwards
-    plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023)
-    plan[plan_5_mask] = "PLAN_5"
-
-    # Store as the plan type
     dataset.person["student_loan_plan"] = plan
-
     return dataset
diff --git a/policyengine_uk_data/utils/uprating.py b/policyengine_uk_data/utils/uprating.py
@@ -57,11 +57,134 @@ def uprate_values(values, variable_name, start_year=2020, end_year=2034):
     return values * relative_change
 
 
+# FRS-weighted GB targets for total Plan 2 and Plan 5 outstanding borrowers
+# (including below-threshold). Derived from DfE student loan forecasts (England),
+# scaled to GB (÷0.84) and adjusted for FRS coverage (55.9% of total outstanding
+# borrowers captured, calibrated from 2023 base).
+# Plan 2 closed to new entrants after Sept 2023; growth reflects new cohorts
+# becoming graduates and entering outstanding-loan status. Plan 5 is the new
+# post-2023 cohort. Figures from 2030 are extrapolated from DfE trend.
+_PLAN_TARGETS = {
+    # year: (plan_2_millions, plan_5_millions)
+    # Plan 2 derived from DfE forecast (England) scaled to GB (÷0.84) and FRS
+    # coverage (55.9%). Plan 2 write-off is 30 years so no significant decline
+    # within this window; post-2030 extrapolated as very gently declining.
+    # Plan 5 post-2030 extrapolated at +1.1M England/yr (growth decelerating).
+    2024: (5.950, 0.007),
+    2025: (5.949, 0.153),
+    2026: (6.462, 0.419),
+    2027: (6.894, 0.918),
+    2028: (7.064, 1.571),
+    2029: (7.054, 2.263),
+    2030: (7.004, 2.995),
+    2031: (6.954, 3.727),
+    2032: (6.904, 4.459),
+    2033: (6.854, 5.191),
+    2034: (6.804, 5.923),
+}
+
+# Plan 1 write-off cutoff by year: the loan term is 25 years, counted from the
+# April after graduation (when repayments first fall due).
+# Example: a person who started in 1998 (age 18) and graduated in 2001 is
+# written off in April 2026; that person is aged 43 in the base 2023 dataset.
+# Simplification: write off if age >= (2069 - year) in the base 2023 dataset,
+# since 2069 - 2026 = 43. This applies the 25-year rule to the 1998-2011 cohort.
+_PLAN_1_WRITEOFF_AGE = lambda year: 2069 - year
+
+
+def _promote_to_plan(plan, income, weights, eligible_mask, target_weighted_millions, plan_label):
+    """Promote the highest-income eligible NONE people to plan_label until the
+    weighted total reaches target_weighted_millions. Returns updated plan array."""
+    import numpy as np
+
+    target = target_weighted_millions * 1e6
+    current = weights[plan == plan_label].sum()
+    delta = target - current
+    if delta <= 0:
+        return plan
+
+    candidates = np.where(eligible_mask & (plan == "NONE"))[0]
+    if len(candidates) == 0:
+        return plan
+
+    # Rank candidates by income descending — highest earners promoted first
+    order = candidates[np.argsort(income[candidates])[::-1]]
+    promoted = 0.0
+    for i in order:
+        if promoted >= delta:
+            break
+        plan[i] = plan_label
+        promoted += weights[i]
+
+    return plan
+
+
+def _demote_from_plan(plan, income, weights, plan_label, target_weighted_millions):
+    """Demote the lowest-income plan holders to NONE when the target falls
+    (e.g. Plan 2 declining as loans are paid off post-2030)."""
+    import numpy as np
+
+    target = target_weighted_millions * 1e6
+    current = weights[plan == plan_label].sum()
+    delta = current - target
+    if delta <= 0:
+        return plan
+
+    holders = np.where(plan == plan_label)[0]
+    order = holders[np.argsort(income[holders])]  # lowest income first
+    demoted = 0.0
+    for i in order:
+        if demoted >= delta:
+            break
+        plan[i] = "NONE"
+        demoted += weights[i]
+
+    return plan
+
+
+def _roll_student_loan_plans(dataset, year, weights):
+    """Advance student loan plan assignments to match forecast targets.
+
+    - Plan 1: write off loans where age >= (2069 - year) in the base dataset,
+      reflecting the 25-year loan term for the pre-2012 cohort.
+    - Plan 2: promote/demote NONE people in the 2012-2022 age band by income
+      rank to hit DfE-forecast total outstanding borrower targets.
+    - Plan 5: promote NONE people in the post-2023 age band by income rank.
+    - Plan 4 and Postgraduate: unchanged.
+    """
+    import numpy as np
+
+    age = np.array(dataset.person["age"][:]).astype(int)
+    income = np.array(dataset.person["employment_income"][:])
+    plan = np.array(dataset.person["student_loan_plan"][:], dtype=object)
+
+    # Plan 1: write off loans for cohort beyond 25-year term
+    writeoff_age = _PLAN_1_WRITEOFF_AGE(year)
+    plan[(plan == "PLAN_1") & (age >= writeoff_age)] = "NONE"
+
+    if year in _PLAN_TARGETS:
+        target_p2, target_p5 = _PLAN_TARGETS[year]
+
+        # Plan 2: started uni 2012-2022, in year Y ages (Y-2004) to (Y-1994)
+        p2_eligible = (age >= year - 2004) & (age <= year - 1994)
+        plan = _promote_to_plan(plan, income, weights, p2_eligible, target_p2, "PLAN_2")
+        plan = _demote_from_plan(plan, income, weights, "PLAN_2", target_p2)
+
+        # Plan 5: started uni 2023+, in year Y ages 18 to (Y-2005)
+        p5_eligible = (age >= 18) & (age <= year - 2005)
+        plan = _promote_to_plan(plan, income, weights, p5_eligible, target_p5, "PLAN_5")
+
+    dataset.person["student_loan_plan"] = plan
+    return dataset
+
+
 def uprate_dataset(dataset: UKSingleYearDataset, target_year=2034):
+    import numpy as np
+
     dataset = dataset.copy()
     uprating_factors = pd.read_csv(STORAGE_FOLDER / "uprating_factors.csv")
     uprating_factors = uprating_factors.set_index("Variable")
-    start_year = dataset.time_period
+    start_year = int(dataset.time_period)
 
     for table in dataset.tables:
         for variable in table.columns:
@@ -74,6 +197,17 @@ def uprate_dataset(dataset: UKSingleYearDataset, target_year=2034):
 
     dataset.time_period = target_year
 
+    if "student_loan_plan" in dataset.person.columns:
+        # Pre-compute person weights (household weight mapped to persons)
+        person_hh_id = dataset.person["person_household_id"][:]
+        hh_id = dataset.household["household_id"][:]
+        hh_weight = dataset.household["household_weight"][:]
+        weight_by_hh = dict(zip(hh_id, hh_weight))
+        weights = np.array([weight_by_hh[i] for i in person_hh_id])
+
+        for year in range(start_year + 1, target_year + 1):
+            dataset = _roll_student_loan_plans(dataset, year, weights)
+
     return dataset