Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .beads/issues.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"id":"policyengine-us-data-apq","title":"Add age and demographics to pre-tax contribution QRF imputation","description":"The QRF in puf.py that imputes pre_tax_contributions from CPS to PUF uses only employment_income as a predictor. Age, filing status, and number of dependents are strong predictors of 401(k) participation and contribution rates. Adding these should improve the distributional accuracy.","status":"closed","priority":2,"issue_type":"feature","created_at":"2026-01-31T08:01:22.72749-05:00","updated_at":"2026-01-31T08:08:02.675063-05:00","closed_at":"2026-01-31T08:08:02.675063-05:00"}
{"id":"policyengine-us-data-jhh","title":"Parameterize retirement contribution limits by year","description":"The contribution waterfall in cps.py hardcodes 2022 limits ($20,500 401k, $6,500 catch-up, $6,000 IRA, $1,000 IRA catch-up). These should be pulled from PolicyEngine parameters or a year-indexed lookup so the dataset builds correctly for any year.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:18.941246-05:00","updated_at":"2026-01-31T08:08:02.614396-05:00","closed_at":"2026-01-31T08:08:02.614396-05:00"}
{"id":"policyengine-us-data-mnw","title":"Use SS_SC source code for Social Security retirement/disability split","description":"Currently cps.py uses a hard age-62 cutoff to split SS into retirement vs disability. The CPS ASEC has SS_SC (Social Security source codes) that distinguish retirement, disability, and survivor benefits. Use these codes instead of the age heuristic.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:21.01419-05:00","updated_at":"2026-01-31T08:08:02.644611-05:00","closed_at":"2026-01-31T08:08:02.644611-05:00"}
{"id":"policyengine-us-data-x4q","title":"Calibrate taxable pension fraction from SOI data","description":"imputation_parameters.yaml sets taxable_pension_fraction to 1.0 with the comment 'no SOI data, so arbitrary assumption.' But the SOI targets CSV includes both total_pension_income and taxable_pension_income by AGI bracket. Use the ratio of these to set a data-driven fraction instead of assuming 100% taxable.","status":"closed","priority":2,"issue_type":"bug","created_at":"2026-01-31T08:01:24.590331-05:00","updated_at":"2026-01-31T08:08:02.70425-05:00","closed_at":"2026-01-31T08:08:02.70425-05:00"}
10 changes: 10 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
- bump: minor
changes:
changed:
- Use CPS ASEC RESNSS1/RESNSS2 reason codes to classify Social Security income into retirement, disability, survivors, and dependents (replacing age-62 heuristic)
- Parameterize retirement contribution limits by year (2020-2025) instead of hardcoded 2022 values
- Update taxable pension fraction from 1.0 to 0.590 based on SOI 2015 Table 1.4
- Add age and is_male as QRF predictors for pension contribution imputation
added:
- SSA benefit-type calibration targets for social_security_retirement, social_security_disability, social_security_survivors, and social_security_dependents
- IRA contribution calibration targets for traditional_ira_contributions and roth_ira_contributions from IRS SOI data
127 changes: 107 additions & 20 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,22 +405,64 @@ def add_personal_income_variables(
1 - p["qualified_dividend_fraction"]
)
cps["rental_income"] = person.RNT_VAL
# Assign Social Security retirement benefits if at least 62.
MINIMUM_RETIREMENT_AGE = 62

# Classify Social Security income using CPS ASEC reason codes
# (RESNSS1 and RESNSS2). Reason code values:
# 1 = Retired
# 2 = Disabled (adult or child)
# 3 = Widowed
# 4 = Spouse
# 5 = Surviving child
# 6 = Dependent child
# 7 = On behalf of surviving/dependent/disabled child(ren)
# 8 = Other
is_retirement = (person.RESNSS1 == 1) | (person.RESNSS2 == 1)
is_disability = (person.RESNSS1 == 2) | (person.RESNSS2 == 2)
is_survivor = np.isin(person.RESNSS1, [3, 5]) | np.isin(
person.RESNSS2, [3, 5]
)
is_dependent = np.isin(person.RESNSS1, [4, 6, 7]) | np.isin(
person.RESNSS2, [4, 6, 7]
)

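# Primary classification: assign the full SS_VAL to exactly one
# category. When a person's two reason codes map to different
# categories, the highest-priority one wins:
# retirement > disability > survivors > dependents.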
cps["social_security_retirement"] = np.where(
person.A_AGE >= MINIMUM_RETIREMENT_AGE, person.SS_VAL, 0
is_retirement, person.SS_VAL, 0
)
# Otherwise assign them to Social Security disability benefits.
cps["social_security_disability"] = (
person.SS_VAL - cps["social_security_retirement"]
cps["social_security_disability"] = np.where(
is_disability & ~is_retirement, person.SS_VAL, 0
)
# Provide placeholders for other Social Security inputs to avoid creating
# NaNs as they're uprated.
cps["social_security_dependents"] = np.zeros_like(
cps["social_security_retirement"]
cps["social_security_survivors"] = np.where(
is_survivor & ~is_retirement & ~is_disability,
person.SS_VAL,
0,
)
cps["social_security_survivors"] = np.zeros_like(
cps["social_security_retirement"]
cps["social_security_dependents"] = np.where(
is_dependent & ~is_retirement & ~is_disability & ~is_survivor,
person.SS_VAL,
0,
)

# Fallback for records with SS income whose reason codes match none
# of the categories above (e.g. 8 = Other, or any value outside 1-7):
# use the age-62 heuristic (retirement vs. disability).
MINIMUM_RETIREMENT_AGE = 62
unclassified = (
(person.SS_VAL > 0)
& ~is_retirement
& ~is_disability
& ~is_survivor
& ~is_dependent
)
cps["social_security_retirement"] += np.where(
unclassified & (person.A_AGE >= MINIMUM_RETIREMENT_AGE),
person.SS_VAL,
0,
)
cps["social_security_disability"] += np.where(
unclassified & (person.A_AGE < MINIMUM_RETIREMENT_AGE),
person.SS_VAL,
0,
)
cps["unemployment_compensation"] = person.UC_VAL
# Weeks looking for work during the year (Census variable LKWEEKS)
Expand Down Expand Up @@ -496,11 +538,56 @@ def add_personal_income_variables(
# Disregard reported pension contributions from people who report neither wage and salary
# nor self-employment income.
# Assume no 403(b) or 457 contributions for now.
LIMIT_401K_2022 = 20_500
LIMIT_401K_CATCH_UP_2022 = 6_500
LIMIT_IRA_2022 = 6_000
LIMIT_IRA_CATCH_UP_2022 = 1_000
CATCH_UP_AGE_2022 = 50
# IRS retirement contribution limits by year.
RETIREMENT_LIMITS = {
2020: {
"401k": 19_500,
"401k_catch_up": 6_500,
"ira": 6_000,
"ira_catch_up": 1_000,
},
2021: {
"401k": 19_500,
"401k_catch_up": 6_500,
"ira": 6_000,
"ira_catch_up": 1_000,
},
2022: {
"401k": 20_500,
"401k_catch_up": 6_500,
"ira": 6_000,
"ira_catch_up": 1_000,
},
2023: {
"401k": 22_500,
"401k_catch_up": 7_500,
"ira": 6_500,
"ira_catch_up": 1_000,
},
2024: {
"401k": 23_000,
"401k_catch_up": 7_500,
"ira": 7_000,
"ira_catch_up": 1_000,
},
2025: {
"401k": 23_500,
"401k_catch_up": 7_500,
"ira": 7_000,
"ira_catch_up": 1_000,
},
}
# Clamp to the nearest available year for out-of-range values.
clamped_year = max(
min(year, max(RETIREMENT_LIMITS)),
min(RETIREMENT_LIMITS),
)
limits = RETIREMENT_LIMITS[clamped_year]
LIMIT_401K = limits["401k"]
LIMIT_401K_CATCH_UP = limits["401k_catch_up"]
LIMIT_IRA = limits["ira"]
LIMIT_IRA_CATCH_UP = limits["ira_catch_up"]
CATCH_UP_AGE = 50
retirement_contributions = person.RETCB_VAL
cps["self_employed_pension_contributions"] = np.where(
person.SEMP_VAL > 0, retirement_contributions, 0
Expand All @@ -510,9 +597,9 @@ def add_personal_income_variables(
0,
)
# Compute the 401(k) and IRA limits for the person's age (higher if catch-up eligible).
catch_up_eligible = person.A_AGE >= CATCH_UP_AGE_2022
limit_401k = LIMIT_401K_2022 + catch_up_eligible * LIMIT_401K_CATCH_UP_2022
limit_ira = LIMIT_IRA_2022 + catch_up_eligible * LIMIT_IRA_CATCH_UP_2022
catch_up_eligible = person.A_AGE >= CATCH_UP_AGE
limit_401k = LIMIT_401K + catch_up_eligible * LIMIT_401K_CATCH_UP
limit_ira = LIMIT_IRA + catch_up_eligible * LIMIT_IRA_CATCH_UP
cps["traditional_401k_contributions"] = np.where(
person.WSAL_VAL > 0,
np.minimum(remaining_retirement_contributions, limit_401k),
Expand Down
4 changes: 2 additions & 2 deletions policyengine_us_data/datasets/cps/imputation_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ taxable_interest_fraction: 0.680
# SOI 2018 data
qualified_dividend_fraction: 0.448

# no SOI data, so arbitrary assumption
taxable_pension_fraction: 1.0
# SOI 2015 data (Table 1.4: taxable / total pension income)
taxable_pension_fraction: 0.590
taxable_401k_distribution_fraction: 1.0
taxable_403b_distribution_fraction: 1.0
taxable_ira_distribution_fraction: 1.0
Expand Down
20 changes: 15 additions & 5 deletions policyengine_us_data/datasets/puf/puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,25 +168,32 @@ def impute_pension_contributions_to_puf(puf_df):

cps = Microsimulation(dataset=CPS_2021)
cps.subsample(10_000)

predictors = [
"employment_income",
"age",
"is_male",
]

cps_df = cps.calculate_dataframe(
["employment_income", "household_weight", "pre_tax_contributions"]
predictors + ["household_weight", "pre_tax_contributions"]
)

from microimpute.models.qrf import QRF

qrf = QRF()

# Combine predictors and target into a single DataFrame for models.QRF
cps_train = cps_df[["employment_income", "pre_tax_contributions"]]
cps_train = cps_df[predictors + ["pre_tax_contributions"]]

fitted_model = qrf.fit(
X_train=cps_train,
predictors=["employment_income"],
predictors=predictors,
imputed_variables=["pre_tax_contributions"],
)

# Predict using the fitted model
predictions = fitted_model.predict(X_test=puf_df[["employment_income"]])
predictions = fitted_model.predict(X_test=puf_df[predictors])

return predictions["pre_tax_contributions"]

Expand Down Expand Up @@ -559,8 +566,11 @@ def generate(self):
original_recid = puf.RECID.values.copy()
puf = preprocess_puf(puf)
puf = impute_missing_demographics(puf, demographics)
# Derive age and is_male for pension imputation predictors
puf["age"] = puf["AGERANGE"].apply(decode_age_filer)
puf["is_male"] = (puf["GENDER"] == 1).astype(float)
puf["pre_tax_contributions"] = impute_pension_contributions_to_puf(
puf[["employment_income"]]
puf[["employment_income", "age", "is_male"]]
)

# Sort in original PUF order
Expand Down
45 changes: 45 additions & 0 deletions policyengine_us_data/db/etl_national_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,51 @@ def extract_national_targets():
"notes": "Social security tips uprated 40% to account for underreporting",
"year": HARDCODED_YEAR,
},
# SSA benefit-type totals derived from trust fund data and
# SSA fact sheet type shares
{
"variable": "social_security_retirement",
"value": 1_060e9,
"source": "https://www.ssa.gov/OACT/STATS/table4a3.html",
"notes": "~73% of total OASDI ($1,452B CBO projection)",
"year": HARDCODED_YEAR,
},
{
"variable": "social_security_disability",
"value": 148e9,
"source": "https://www.ssa.gov/OACT/STATS/table4a3.html",
"notes": "~10.2% of total OASDI (disabled workers)",
"year": HARDCODED_YEAR,
},
{
"variable": "social_security_survivors",
"value": 160e9,
"source": "https://www.ssa.gov/OACT/FACTS/",
"notes": "~11.0% of total OASDI (widows, children of deceased)",
"year": HARDCODED_YEAR,
},
{
"variable": "social_security_dependents",
"value": 84e9,
"source": "https://www.ssa.gov/OACT/FACTS/",
"notes": "~5.8% of total OASDI (spouses/children of retired+disabled)",
"year": HARDCODED_YEAR,
},
# IRA contribution totals from IRS SOI accumulation tables
{
"variable": "traditional_ira_contributions",
"value": 25e9,
"source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements",
"notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024",
"year": HARDCODED_YEAR,
},
{
"variable": "roth_ira_contributions",
"value": 39e9,
"source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements",
"notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024",
"year": HARDCODED_YEAR,
},
]

# Conditional count targets - these need strata with constraints
Expand Down
30 changes: 27 additions & 3 deletions policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
from policyengine_core.reforms import Reform
from policyengine_us_data.utils.soi import pe_to_soi, get_soi

# CPS-derived statistics
# Medical expenses, sum of spm thresholds
# Child support expenses
# National calibration targets consumed by build_loss_matrix().
# These are duplicated in db/etl_national_targets.py which loads them
# into policy_data.db. A future PR should wire build_loss_matrix()
# to read from the database so this dict can be deleted. See PR #488.

HARD_CODED_TOTALS = {
"health_insurance_premiums_without_medicare_part_b": 385e9,
Expand All @@ -35,6 +36,29 @@
# Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
# Assume 40% through 2024
"tip_income": 38e9 * 1.4,
# SSA benefit-type totals for 2024, derived from:
# - Total OASDI: $1,452B (CBO projection)
# - OASI trust fund: $1,227.4B in 2023
# https://www.ssa.gov/OACT/STATS/table4a3.html
# - DI trust fund: $151.9B in 2023
# https://www.ssa.gov/OACT/STATS/table4a3.html
# - SSA 2024 fact sheet type shares: retired+deps=78.5%,
# survivors=11.0%, disabled+deps=10.5%
# https://www.ssa.gov/OACT/FACTS/
# - SSA Annual Statistical Supplement Table 5.A1
# https://www.ssa.gov/policy/docs/statcomps/supplement/2024/5a.html
"social_security_retirement": 1_060e9, # ~73% of total
"social_security_disability": 148e9, # ~10.2% (disabled workers)
"social_security_survivors": 160e9, # ~11.0% (widows, children of deceased)
"social_security_dependents": 84e9, # ~5.8% (spouses/children of retired+disabled)
# IRA contribution totals from IRS SOI IRA accumulation tables.
# Tax year 2022: ~5M taxpayers x $4,510 avg = ~$22.5B traditional;
# ~10M taxpayers x $3,482 avg = ~$34.8B Roth.
# Uprated ~12% to 2024 for limit increases ($6k->$7k) and
# wage growth.
# https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements
"traditional_ira_contributions": 25e9,
"roth_ira_contributions": 39e9,
}


Expand Down