diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
index 51684758f..275d71e2b 100644
--- a/policyengine_us_data/datasets/cps/enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -15,6 +15,11 @@
ExtendedCPS_2024_Half,
CPS_2024,
)
+from policyengine_us_data.utils.randomness import seeded_rng
+from policyengine_us_data.utils.takeup import (
+ ACA_POST_CALIBRATION_PERSON_TARGETS,
+ extend_aca_takeup_to_match_target,
+)
import logging
try:
@@ -23,6 +28,48 @@
torch = None
+def _get_period_array(period_values: dict, period: int) -> np.ndarray:
+ """Get a period array from a TIME_PERIOD_ARRAYS variable dict."""
+ value = period_values.get(period)
+ if value is None:
+ value = period_values.get(str(period))
+ if value is None:
+ raise KeyError(f"Missing period {period}")
+ return np.asarray(value)
+
+
+def create_aca_2025_takeup_override(
+ base_takeup: np.ndarray,
+ person_enrolled_if_takeup: np.ndarray,
+ person_weights: np.ndarray,
+ person_tax_unit_ids: np.ndarray,
+ tax_unit_ids: np.ndarray,
+ target_people: float = ACA_POST_CALIBRATION_PERSON_TARGETS[2025],
+) -> np.ndarray:
+ """Add 2025 ACA takers until weighted APTC enrollment hits target."""
+ tax_unit_id_to_idx = {
+ int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)
+ }
+ person_tax_unit_idx = np.array(
+ [tax_unit_id_to_idx[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids],
+ dtype=np.int64,
+ )
+ enrolled_person_weights = np.zeros(len(tax_unit_ids), dtype=np.float64)
+ np.add.at(
+ enrolled_person_weights,
+ person_tax_unit_idx,
+ person_enrolled_if_takeup.astype(np.float64) * person_weights,
+ )
+ draws = seeded_rng("takes_up_aca_if_eligible").random(len(tax_unit_ids))
+
+ return extend_aca_takeup_to_match_target(
+ base_takeup=np.asarray(base_takeup, dtype=bool),
+ entity_draws=draws,
+ enrolled_person_weights=enrolled_person_weights,
+ target_people=target_people,
+ )
+
+
def reweight(
original_weights,
loss_matrix,
@@ -142,6 +189,7 @@ def generate(self):
sim = Microsimulation(dataset=self.input_dataset)
data = sim.dataset.load_dataset()
+ base_year = int(sim.default_calculation_period)
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
original_weights = original_weights.values + np.random.normal(
@@ -216,6 +264,52 @@ def generate(self):
f"{int(np.sum(w > 0))} non-zero"
)
+ if 2025 in ACA_POST_CALIBRATION_PERSON_TARGETS:
+ sim.set_input(
+ "household_weight",
+ base_year,
+ _get_period_array(data["household_weight"], base_year).astype(
+ np.float32
+ ),
+ )
+ sim.set_input(
+ "takes_up_aca_if_eligible",
+ 2025,
+ np.ones(
+ len(_get_period_array(data["tax_unit_id"], base_year)),
+ dtype=bool,
+ ),
+ )
+ sim.delete_arrays("aca_ptc")
+
+ data["takes_up_aca_if_eligible"][2025] = create_aca_2025_takeup_override(
+ base_takeup=_get_period_array(
+ data["takes_up_aca_if_eligible"],
+ base_year,
+ ),
+ person_enrolled_if_takeup=np.asarray(
+ sim.calculate(
+ "aca_ptc",
+ map_to="person",
+ period=2025,
+ use_weights=False,
+ )
+ )
+ > 0,
+ person_weights=np.asarray(
+ sim.calculate(
+ "person_weight",
+ period=2025,
+ use_weights=False,
+ )
+ ),
+ person_tax_unit_ids=_get_period_array(
+ data["person_tax_unit_id"],
+ base_year,
+ ),
+ tax_unit_ids=_get_period_array(data["tax_unit_id"], base_year),
+ )
+
logging.info("Post-generation weight validation passed")
self.save_dataset(data)
diff --git a/policyengine_us_data/datasets/cps/long_term/ASSUMPTION_COMPARISON.md b/policyengine_us_data/datasets/cps/long_term/ASSUMPTION_COMPARISON.md
new file mode 100644
index 000000000..8a1278f8e
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/ASSUMPTION_COMPARISON.md
@@ -0,0 +1,43 @@
+# Long-Run Calibration Assumption Comparison
+
+This note distinguishes between:
+
+- hard microsimulation calibration targets, which directly shape household weights
+- tax-side assumptions used to make those targets more comparable to the public Trustees/OACT methodology
+
+The current long-run baseline now adopts a named tax-side assumption,
+`trustees-core-thresholds-v1`, before hard-targeting TOB.
+
+| Component | Current `policyengine-us-data` approach | Trustees / OACT published approach | Calibration use |
+| --- | --- | --- | --- |
+| Population by age | SSA single-year age projections | SSA single-year age projections | Hard target |
+| OASDI benefits | Named long-term target source package | Trustees or OACT-patched annual OASDI path | Hard target |
+| Taxable payroll | Named long-term target source package | Trustees annual taxable payroll path | Hard target |
+| Social Security benefit-tax thresholds | Literal current-law statutory thresholds remain fixed in nominal dollars | Trustees also describe the statutory `$25k/$32k/$0` and `$34k/$44k` thresholds as remaining fixed in nominal dollars | Not separately targeted |
+| Federal income-tax brackets | Core ordinary thresholds are wage-indexed after `2034` via `trustees-core-thresholds-v1` | Trustees assume periodic future bracket adjustments; after the tenth projection year, ordinary federal income-tax brackets are assumed to rise with average wages to avoid indefinite bracket creep | Tax-side assumption |
+| Standard deduction / aged-blind addition / capital gains thresholds / AMT thresholds | Included in the same `trustees-core-thresholds-v1` bundle | Not parameterized publicly line-by-line, but these are the main additional federal thresholds most likely to affect long-run TOB | Tax-side assumption |
+| OASDI TOB | Computed under the core-threshold tax assumption and targeted in `ss-payroll-tob` profiles | Trustees/OACT publish annual revenue paths or ratios, but not a full public household-level micro rule schedule | Hard target |
+| HI TOB | Computed under the core-threshold tax assumption and targeted in `ss-payroll-tob` profiles | Trustees publish current-law HI TOB path; OACT OBBBA updates do not currently provide a full public annual HI replacement series | Hard target |
+| OBBBA OASDI update | Available through named target source `oact_2025_08_05_provisional` | August 5, 2025 OACT letter provides annual OASDI changes through 2100 | Benchmark / target-source input |
+| OBBBA HI update | Provisional bridge only in named target source | No equivalent full public annual HI replacement path located yet | Benchmark only |
+
+## Practical interpretation
+
+- `ss-payroll` remains the core non-TOB hard-target profile.
+- `ss-payroll-tob` now means: calibrate on age + OASDI benefits + taxable payroll + TOB under `trustees-core-thresholds-v1`.
+- The core-threshold bundle is a best-public approximation, not a literal public Trustees rules schedule.
+- Trustees-consistent long-run TOB requires keeping two different tax-side ideas separate:
+ - the Social Security benefit-tax thresholds remain fixed in nominal dollars
+ - ordinary federal income-tax brackets are assumed to rise with average wages after the tenth projection year
+
+## Primary-source references
+
+- [SSA 2025 Trustees Report, V.C.7](https://www.ssa.gov/oact/tr/2025/V_C_prog.html)
+ - States that the law specifies fixed threshold amounts for taxation of Social Security benefits and that those thresholds remain constant in future years.
+ - Also states that, after the tenth year of the projection period, income-tax brackets are assumed to rise with average wages rather than with `C-CPI-U`.
+- [26 U.S.C. § 86](https://www.law.cornell.edu/uscode/text/26/86)
+ - Statutory basis for the Social Security benefit-tax threshold structure.
+- [SSA 2025 Trustees Report, Table VI.G6](https://www.ssa.gov/OACT/TR/2025/VI_G3_OASDHI_dollars.html)
+ - Published annual average wage index path through `2100`.
+- [42 U.S.C. § 430](https://www.law.cornell.edu/uscode/text/42/430) and [20 CFR § 404.1048](https://www.law.cornell.edu/cfr/text/20/404.1048)
+ - Statutory and regulatory basis for deriving the Social Security contribution and benefit base from the wage index.
diff --git a/policyengine_us_data/datasets/cps/long_term/README.md b/policyengine_us_data/datasets/cps/long_term/README.md
index c20216dc8..e88d4d6fb 100644
--- a/policyengine_us_data/datasets/cps/long_term/README.md
+++ b/policyengine_us_data/datasets/cps/long_term/README.md
@@ -6,24 +6,72 @@
Run projections using `run_household_projection.py`:
```bash
-# Recommended: GREG with all constraint types
-python run_household_projection.py 2100 --greg --use-ss --use-payroll --use-tob --save-h5
+# Recommended: named profile with core-threshold tax assumption and TOB targeted
+python run_household_projection.py 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --save-h5
+
+# Experimental: donor-backed late-year support augmentation for tail-year runs
+python run_household_projection.py 2075 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-synthetic-v1 --support-augmentation-target-year 2100 --allow-validation-failures
+
+# Experimental: role-based donor composites assembled into late-year support
+python run_household_projection.py 2075 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --allow-validation-failures
+
+# Experimental: target-year blueprint calibration over donor-composite support
+python run_household_projection.py 2100 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --support-augmentation-blueprint-base-weight-scale 0.5 --allow-validation-failures --save-h5
# IPF with only age distribution constraints (faster, less accurate)
-python run_household_projection.py 2050
+python run_household_projection.py 2050 --profile age-only
# GREG with age + Social Security only
-python run_household_projection.py 2100 --greg --use-ss
+python run_household_projection.py 2100 --profile ss
```
**Arguments:**
- `END_YEAR`: Target year for projection (default: 2035)
+- `--profile`: Named calibration contract. Recommended over legacy flags.
+- `--target-source`: Named long-term target source package.
+- `--tax-assumption`: Long-run federal tax assumption. Defaults to `trustees-core-thresholds-v1`; use `current-law-literal` to opt out.
+- `--output-dir`: Output directory for generated H5 files and metadata sidecars.
+- `--support-augmentation-profile`: Experimental late-year support expansion mode. Currently supports `donor-backed-synthetic-v1` and `donor-backed-composite-v1`.
+- `--support-augmentation-target-year`: Extreme year used to build the donor-backed supplement (defaults to `END_YEAR`).
+- `--support-augmentation-align-to-run-year`: Rebuild the donor-backed supplement separately for each run year instead of reusing one target-year support snapshot.
+- `--support-augmentation-start-year`: Earliest run year allowed for augmentation (defaults to `2075`).
+- `--support-augmentation-top-n-targets`: Number of dominant synthetic target types to map back to real donors (default `20`).
+- `--support-augmentation-donors-per-target`: Number of nearest real donor tax units per synthetic target (default `5`).
+- `--support-augmentation-max-distance`: Maximum donor-match distance retained for cloning (default `3.0`).
+- `--support-augmentation-clone-weight-scale`: Baseline weight multiplier applied to each donor-backed clone (default `0.1`).
+- `--support-augmentation-blueprint-base-weight-scale`: When donor-composite augmentation is active at its target year, scales the original household priors before replacing clone priors with synthetic blueprint shares (default `0.5`).
- `--greg`: Use GREG calibration instead of IPF
- `--use-ss`: Include Social Security benefit totals as calibration target (requires `--greg`)
- `--use-payroll`: Include taxable payroll totals as calibration target (requires `--greg`)
- `--use-tob`: Include TOB (Taxation of Benefits) revenue as calibration target (requires `--greg`)
- `--save-h5`: Save year-specific .h5 files to `./projected_datasets/` directory
+**Named profiles:**
+- `age-only`: IPF age-only calibration
+- `ss`: positive entropy calibration with age + Social Security
+- `ss-payroll`: positive entropy calibration with age + Social Security + taxable payroll
+- `ss-payroll-tob`: positive entropy calibration with age + Social Security + taxable payroll + TOB under the long-run core-threshold tax assumption
+- `ss-payroll-tob-h6`: positive entropy calibration with age + Social Security + taxable payroll + TOB + H6 under the long-run core-threshold tax assumption
+
+**Validation contract:**
+- Economic-targeted profiles no longer silently pretend an IPF fallback is equivalent to GREG.
+- Named economic profiles must produce non-negative weights.
+- Each generated H5 now gets a `YYYY.h5.metadata.json` sidecar with profile and calibration audit details.
+- Each generated H5 sidecar now records the named long-term target source used for the build.
+- Each output directory now gets a `calibration_manifest.json` file describing the
+ profile/base dataset contract for the full artifact set.
+- Profiles validate achieved constraint errors before writing output.
+- Experimental donor-backed augmentation is stamped into each year sidecar and the directory manifest via `support_augmentation`.
+- The active long-run tax assumption is stamped into each year sidecar and the directory manifest via `tax_assumption`.
+- Donor-backed runs now also write a shared `support_augmentation_report.json` artifact with per-clone provenance so late-year translation failures can be inspected directly.
+- Long-run payroll calibration now guards against a flat Social Security wage base after 2035. If `policyengine-us` is missing the NAWI / payroll-cap extension, late-year payroll runs fail fast instead of silently mis-targeting taxable payroll.
+- Trustees/OACT tax-side assumptions are documented in [ASSUMPTION_COMPARISON.md](./ASSUMPTION_COMPARISON.md). The active long-run baseline adopts a core-threshold bundle:
+ - Social Security benefit-tax thresholds remain fixed in nominal dollars under Trustees current law.
+ - Core ordinary federal thresholds are assumed to rise with average wages after the tenth projection year.
+- The corrected apples-to-apples TOB share comparison is documented in
+ [TOB_ALIGNMENT_NOTE.md](./TOB_ALIGNMENT_NOTE.md), with a small reproduction
+ script in [compare_tob_shares.py](./compare_tob_shares.py).
+
**Estimated runtime:** ~2 minutes/year without `--save-h5`, ~3 minutes/year with `--save-h5`
---
@@ -35,11 +83,46 @@ python run_household_projection.py 2100 --greg --use-ss
- Fast and simple, but cannot enforce Social Security or payroll totals
- Converges iteratively (typically 20-40 iterations)
+**Positive Entropy Calibration**
+- Solves for strictly positive weights matching multiple constraints simultaneously
+- Can enforce age distribution + Social Security benefits + taxable payroll
+- Uses dual optimization to minimize divergence from baseline weights
+- **Recommended** for publishable long-term projections
+
+**Long-Run Core-Threshold Tax Assumption**
+- Default long-run tax assumption in this runner
+- Keeps Social Security benefit-tax thresholds fixed
+- Wage-indexes a core set of federal thresholds after `2034`:
+ - ordinary income-tax brackets
+ - standard deduction
+ - aged/blind additional standard deduction
+ - capital-gains thresholds
+ - AMT thresholds / exemptions
+- Intended as the best public Trustees approximation before TOB is hard-targeted again
+
+**Donor-Backed Late-Year Support Augmentation**
+- Experimental late-tail option for `2075+` runs
+- Uses the `2100` synthetic-support prototype to identify dominant missing household types
+- Maps those synthetic targets back to nearest real 2024 donor tax units
+- Clones and perturbs the donor tax units to create a small augmented support without replacing the base CPS sample
+- Intended to test whether donor-backed synthetic support improves late-year microsim feasibility without resorting to fully free synthetic records
+- Current status: still diagnostic. The simple nearest-neighbor donor supplement does not materially improve the late-tail fit once the calibration uses SSA taxable payroll rather than uncapped wages.
+
+**Role-Based Donor Composites**
+- Experimental structural extension of the donor-backed approach
+- Recombines older-beneficiary donors, payroll-rich worker donors, and dependent structure into synthetic household candidates before assembling actual augmented rows
+- The actual-row augmented dataset builder is now available in the runner as `donor-backed-composite-v1`
+- Current status:
+ - Fixing the long-run payroll-cap bug in `policyengine-us` changed the picture materially. With the correct SSA wage base extended through `2100`, the donor-composite synthetic support is exact-feasible and dense at the archetype level.
+ - The runner now supports a target-year calibration blueprint for donor-composite augmentation. At the augmentation target year, it can calibrate against the exact clone blueprints and synthetic prior shares while still auditing the realized rows.
+ - In the current `2100` probe, that blueprint path gets actual age + SS + taxable payroll very close while keeping support quality in range: with `--support-augmentation-blueprint-base-weight-scale 0.5`, actual payroll miss is about `-0.86%`, ESS about `102.5`, top-10 weight share about `24.4%`, and top-100 share about `68.4%`.
+ - The runner now also has a dynamic mode, `--support-augmentation-align-to-run-year`, that rebuilds donor-composite support for each run year and writes per-year augmentation reports.
+ - This is still experimental. The blueprint path is now structurally capable of handling year-specific support, but the full `2075-2100` production sweep still needs runtime tuning and caching work.
+
**GREG (Generalized Regression Estimator)**
-- Solves for weights matching multiple constraints simultaneously
-- Can enforce age distribution + Social Security benefits + taxable payroll + TOB revenue
-- One-shot solution using `samplics` package
-- **Recommended** for accurate long-term projections
+- Legacy linear calibration path retained for explicit flag-based runs
+- Can hit constraints exactly, but may produce negative weights in far-horizon years
+- No longer the default for named economic calibration profiles
---
@@ -51,18 +134,21 @@ python run_household_projection.py 2100 --greg --use-ss
2. **Social Security Benefits** (`--use-ss`, GREG only)
- Total OASDI benefit payments (nominal dollars)
- - Source: SSA Trustee Report 2024 (`social_security_aux.csv`)
+ - Source: selected long-term target source package
3. **Taxable Payroll** (`--use-payroll`, GREG only)
- W-2 wages capped at wage base + SE income within remaining cap room
- Calculated as: `taxable_earnings_for_social_security` + `social_security_taxable_self_employment_income`
- - Source: SSA Trustee Report 2024 (`social_security_aux.csv`)
+ - Source: selected long-term target source package
+ - Guardrail: the runner checks that the Social Security taxable earnings cap continues to rise after 2035. A flat cap indicates an invalid `policyengine-us` parameter baseline for long-run payroll work.
-4. **TOB Revenue** (`--use-tob`, GREG only)
+4. **TOB Revenue** (`ss-payroll-tob`, `ss-payroll-tob-h6`, or legacy `--use-tob`)
- Taxation of Benefits revenue for OASDI and Medicare HI trust funds
- OASDI: `tob_revenue_oasdi` (tier 1 taxation, 0-50% of benefits)
- HI: `tob_revenue_medicare_hi` (tier 2 taxation, 50-85% of benefits)
- - Source: SSA Trustee Report 2024 (`social_security_aux.csv`)
+ - Source: selected long-term target source package
+ - In the current branch contract, TOB is hard-targeted under the long-run core-threshold tax assumption rather than under literal CPI-style current-law bracket indexing.
+ - Primary-source note: Trustees do not appear to model long-run TOB by indexing the SS-specific `$25k/$32k/$0` and `$34k/$44k` thresholds. Those remain fixed under current law; the long-run divergence comes from broader tax-side assumptions, including ordinary bracket treatment after the tenth projection year.
---
@@ -72,6 +158,7 @@ python run_household_projection.py 2100 --greg --use-ss
- URL: https://www.ssa.gov/OACT/TR/2025/
- File: `SingleYearTRTables_TR2025.xlsx`
- Tables: IV.B2 (OASDI TOB % of taxable payroll), VI.G6 (taxable payroll in billions), VI.G9 (OASDI costs)
+- Program assumptions: [V.C.7](https://www.ssa.gov/oact/tr/2025/V_C_prog.html) documents fixed Social Security benefit-tax thresholds and the long-run ordinary-bracket assumption.
**CMS 2025 Medicare Trustees Report**
- URL: https://www.cms.gov/data-research/statistics-trends-and-reports/trustees-report-trust-funds
@@ -80,7 +167,9 @@ python run_household_projection.py 2100 --greg --use-ss
**Local files** (in `policyengine_us_data/storage/`):
- `SSPopJul_TR2024.csv` - Population projections 2025-2100 by single year of age
-- `social_security_aux.csv` - OASDI costs, taxable payroll, and TOB revenue projections 2025-2100
+- `long_term_target_sources/trustees_2025_current_law.csv` - explicit frozen Trustees/current-law package
+- `long_term_target_sources/sources.json` - provenance metadata for named source packages
+- `ASSUMPTION_COMPARISON.md` - side-by-side summary of our calibration assumptions versus Trustees/OACT
---
@@ -102,7 +191,7 @@ python run_household_projection.py 2100 --greg --use-ss
- **`run_household_projection.py`** - Main projection script (see Quick Start)
- **`calibration.py`** - IPF and GREG weight calibration implementations
-- **`ssa_data.py`** - Load SSA population, benefit, and payroll projections
+- **`ssa_data.py`** - Load SSA population and named long-term target source projections
- **`projection_utils.py`** - Utility functions (age matrix builder, H5 file creator)
- **`extract_ssa_costs.py`** - One-time script to extract SSA data from Excel (already run)
@@ -116,7 +205,20 @@ For each projection year (2025-2100):
2. **Uprate variables** - PolicyEngine automatically uprates income, thresholds, etc. to target year
3. **Calculate values** - Income tax, Social Security, taxable payroll at household level
4. **Calibrate weights** - Adjust household weights to match SSA demographic/economic targets
-5. **Aggregate results** - Apply calibrated weights to calculate national totals
+5. **Target or benchmark TOB** - Under `ss-payroll-tob`, match modeled OASDI/HI TOB to the selected target source using the core-threshold tax assumption
+6. **Aggregate results** - Apply calibrated weights to calculate national totals
+
+When donor-backed augmentation is enabled, step 1 uses the original 2024 CPS support and step 2 inserts a tagged late-year supplement derived from nearest real donors before the calibration loop begins. The underlying base dataset path remains unchanged in metadata; the augmentation details are recorded separately in `support_augmentation`, and the full augmentation build report is written once per output directory to `support_augmentation_report.json`.
+
+When donor-composite augmentation is enabled and the run year equals the augmentation target year, the runner can also replace the clone rows' calibration constraints with their exact synthetic blueprint values and use the synthetic-support solution as clone priors. The calibration audit still reports achieved constraints on the realized rows, so any blueprint-to-row translation gap remains visible in metadata.
+
+To compare the intended clone targets with the realized output H5, run:
+
+```bash
+uv run python policyengine_us_data/datasets/cps/long_term/diagnose_support_augmentation_translation.py \
+ ./projected_datasets/2100.h5 \
+ --year 2100
+```
**Key innovation:** Household-level calculations avoid person→household aggregation issues, maintaining consistency across all variables.
diff --git a/policyengine_us_data/datasets/cps/long_term/TOB_ALIGNMENT_NOTE.md b/policyengine_us_data/datasets/cps/long_term/TOB_ALIGNMENT_NOTE.md
new file mode 100644
index 000000000..ede8c578a
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/TOB_ALIGNMENT_NOTE.md
@@ -0,0 +1,224 @@
+# Long-Run TOB Alignment Note
+
+This note records the current apples-to-apples comparison point for long-run
+taxation of benefits (TOB).
+
+## Why this note exists
+
+Our earlier comparison mixed two different objects:
+
+- `OASDI-only` income from taxation of benefits as a share of total OASDI
+ benefits
+- combined `OASDI + HI` credited taxes as a share of total OASDI benefits
+
+The public Urban / DYNASIM and SSA Trustees discussion is about the former,
+not the latter.
+
+## Primary-source baseline
+
+- SSA 2025 Trustees Report, [V.C.7](https://www.ssa.gov/oact/tr/2025/V_C_prog.html)
+ says the benefit-tax thresholds are "constant in the future" and, after the
+ tenth projection year, ordinary income-tax brackets "rise with average
+ wages."
+- Urban's 2024 DYNASIM appendix says it continues current indexing of income
+ tax parameters indefinitely and keeps the Social Security benefit-tax
+ thresholds at current nominal levels throughout the projection period. Source:
+ [Urban 2024 appendix](https://www.urban.org/sites/default/files/2024-10/Does-the-2023-Social-Security-Expansion-Act-Improve-Equity-in-Key-Outcomes.pdf).
+- SSA's published OASDI long-run target series in
+ [Table IV.B2](https://www.ssa.gov/oact/tr/2025/lr4b2.html) and our local
+ `trustees_2025_current_law.csv` target package imply OASDI-only TOB shares
+ of about `6.0%` to `6.1%` of OASDI benefits in the late horizon.
+
+## Current contract decision
+
+As of `2026-04-02`, the branch adopts the `trustees-core-thresholds-v1`
+tax-side assumption for long-run TOB work and re-enables TOB as a hard target
+in the `ss-payroll-tob` profiles.
+
+That assumption:
+
+- keeps the Social Security benefit-tax thresholds fixed
+- wage-indexes a core federal threshold bundle after `2034`
+ - ordinary income-tax brackets
+ - standard deduction
+ - aged/blind additional standard deduction
+ - capital-gains thresholds
+ - AMT thresholds / exemptions
+
+On a `2100` smoke run with donor-composite support augmentation and the local
+`policyengine-us` wage-base fix, this produces a validation-passing artifact
+with:
+
+- `ss_total`: essentially exact
+- `oasdi_tob`: essentially exact
+- `hi_tob`: essentially exact
+- `payroll_total`: `-2.95%`
+- `ESS`: `173.1`
+- `top-10 weight share`: `17.7%`
+
+## Current branch-local comparison
+
+The table below uses one-year probe outputs produced on `2026-04-02` with:
+
+- `policyengine-us-data` branch `codex/us-data-calibration-contract`
+- `policyengine-us` branch `codex/extend-ss-cap-2100` or equivalent fix from
+ [PR #7912](https://github.com/PolicyEngine/policyengine-us/pull/7912)
+- profile `ss-payroll-tob`
+- target source `trustees_2025_current_law`
+- donor-composite support augmentation enabled
+
+| Year | OASDI actual | OASDI target | OASDI gap | Combined actual | Combined target | Combined gap |
+| --- | ---: | ---: | ---: | ---: | ---: | ---: |
+| 2075 | 9.43% | 6.01% | +3.42 pp | 12.58% | 11.09% | +1.50 pp |
+| 2090 | 10.52% | 6.08% | +4.44 pp | 16.66% | 11.20% | +5.46 pp |
+| 2100 | 11.16% | 6.10% | +5.06 pp | 19.35% | 11.22% | +8.14 pp |
+
+Interpretation:
+
+- The `OASDI-only` comparison is the right one for evaluating alignment to the
+ public Urban / SSA discussion.
+- On that comparable metric, the current corrected baseline is above both the
+ Trustees-style target path (`~6.1%`) and Urban's public DYNASIM endpoint
+ (`8.5%` in `2095`).
+- The larger combined `OASDI + HI` shares are still useful internal diagnostics,
+ but they should not be compared directly to the public `5.6%` / `8.5%`
+ figures.
+
+## Trustees-style bracket-growth sensitivity
+
+We also ran a narrow tax-side sensitivity that keeps the calibrated household
+weights fixed and changes only one assumption in the tax model:
+
+- after `2034`, ordinary federal income-tax bracket thresholds are uprated with
+ `NAWI` instead of `C-CPI-U`
+- the Social Security benefit-tax thresholds remain fixed
+
+This is intended as a best-public approximation to the Trustees statement that
+ordinary income-tax brackets "rise with average wages" after the tenth
+projection year.
+
+| Year | Baseline OASDI | Wage-indexed-brackets OASDI | Trustees target | Remaining gap |
+| --- | ---: | ---: | ---: | ---: |
+| 2075 | 9.43% | 8.41% | 6.01% | +2.40 pp |
+| 2090 | 10.52% | 7.85% | 6.08% | +1.76 pp |
+| 2100 | 11.16% | 9.46% | 6.10% | +3.36 pp |
+
+Interpretation:
+
+- This tax-side assumption moves the modeled OASDI-only TOB share materially in
+ the right direction.
+- It explains a substantial share of the excess over the Trustees target,
+ especially around `2090`.
+- It does not explain the whole gap. Even with wage-indexed ordinary brackets,
+ the long-run `2100` OASDI-only share remains well above the Trustees-style
+ `~6.1%` path.
+- Relative to Urban's public DYNASIM endpoint of `8.5%` in `2095`, the
+ wage-indexed-brackets sensitivity lands in the same rough range by `2090`,
+ but is still above that public number by `2100`.
+
+## Broader core-threshold sensitivity
+
+We also ran a broader but still targeted tax-side sensitivity that switches a
+core set of federal thresholds from `C-CPI-U` to `NAWI` after `2034`:
+
+- ordinary income-tax brackets
+- standard deduction
+- aged/blind additional standard deduction
+- capital-gains rate thresholds
+- AMT bracket threshold and exemption thresholds
+
+This is broader than the minimum public Trustees approximation, but still
+narrower than switching the entire `gov.irs.uprating` family to wages.
+
+| Year | Baseline OASDI | Core-threshold OASDI | Trustees target | Remaining gap |
+| --- | ---: | ---: | ---: | ---: |
+| 2075 | 9.43% | 7.65% | 6.01% | +1.64 pp |
+| 2090 | 10.52% | 7.31% | 6.08% | +1.22 pp |
+| 2100 | 11.16% | 8.15% | 6.10% | +2.05 pp |
+
+Interpretation:
+
+- The broader threshold bundle explains more of the TOB gap than brackets
+ alone.
+- The additional movement is meaningful, especially in `2100`, where the
+ OASDI-only share falls from `9.46%` under brackets-only to `8.15%` under the
+ broader core-threshold sensitivity.
+- Even this broader sensitivity still does not fully reconcile the modeled TOB
+ path to the Trustees target, so some remaining gap likely reflects
+ beneficiary income mix, filing composition, or other Treasury-ratio modeling
+ differences.
+
+## Full IRS-uprating upper bound
+
+Finally, we ran an upper-bound sensitivity that rewrites every materialized IRS
+parameter leaf that currently inherits from `gov.irs.uprating`, replacing
+post-`2034` `C-CPI-U` growth with `NAWI` growth.
+
+This is broader than the public Trustees text justifies, but it provides a
+useful ceiling on how much of the TOB gap could plausibly be explained by the
+IRS uprating family alone.
+
+| Year | Baseline OASDI | Full IRS-uprating OASDI | Trustees target | Remaining gap |
+| --- | ---: | ---: | ---: | ---: |
+| 2075 | 9.43% | 7.46% | 6.01% | +1.45 pp |
+| 2090 | 10.52% | 7.17% | 6.08% | +1.09 pp |
+| 2100 | 11.16% | 8.16% | 6.10% | +2.06 pp |
+
+Interpretation:
+
+- The full IRS-uprating upper bound is only slightly lower than the narrower
+ core-threshold bundle.
+- That implies most of the tax-side movement is already coming from the core
+ federal threshold families, not from the rest of the CPI-uprated IRS
+ parameter tree.
+- Even under this broad upper bound, the model still remains above the
+ Trustees OASDI-only TOB path, especially in `2100`.
+
+## DYNASIM public benchmark
+
+Urban's 2024 appendix says DYNASIM's revenue from taxing Social Security
+benefits rises from `5 percent` in `2027` to `8.5 percent` in `2095`, while
+the Social Security actuaries' corresponding share rises from `5 percent` to
+`5.6 percent` over the same period.
+
+Important caveats:
+
+- We have not found a public annual DYNASIM TOB series, only these endpoint
+ shares.
+- The Urban paper is tied to a 2023-vintage Trustees baseline. Comparing the
+ 2023 and 2025 Trustees reports, the long-run CPI and average-wage growth
+ assumptions are effectively unchanged, so that vintage difference does not
+ explain most of our remaining gap.
+
+## Reproducing the tables
+
+The comparison scripts are:
+
+- [compare_tob_shares.py](./compare_tob_shares.py)
+- [benchmark_trustees_bracket_indexing.py](./benchmark_trustees_bracket_indexing.py)
+
+Example:
+
+```bash
+uv run python policyengine_us_data/datasets/cps/long_term/compare_tob_shares.py \
+ /path/to/2075-output-dir \
+ /path/to/2090-output-dir \
+ /path/to/2100-output-dir
+
+uv run python policyengine_us_data/datasets/cps/long_term/benchmark_trustees_bracket_indexing.py \
+ /path/to/2075-output-dir \
+ /path/to/2090-output-dir \
+ /path/to/2100-output-dir \
+ --policyengine-us-path /path/to/local/policyengine-us
+```
+
+The scripts expect metadata sidecars containing:
+
+- `calibration_audit.constraints.ss_total`
+- `calibration_audit.constraints.oasdi_tob` or `calibration_audit.benchmarks.oasdi_tob`
+- `calibration_audit.constraints.hi_tob` or `calibration_audit.benchmarks.hi_tob`
+
+To regenerate the underlying sidecars from scratch, first ensure that
+`policyengine-us` includes the Social Security wage-base extension from
+[PR #7912](https://github.com/PolicyEngine/policyengine-us/pull/7912) or later.
+Without that fix, late-year taxable-payroll calibration is materially wrong.
diff --git a/policyengine_us_data/datasets/cps/long_term/assess_calibration_frontier.py b/policyengine_us_data/datasets/cps/long_term/assess_calibration_frontier.py
new file mode 100644
index 000000000..f9fefd8fe
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/assess_calibration_frontier.py
@@ -0,0 +1,248 @@
+from __future__ import annotations
+
+import argparse
+import csv
+import gc
+from pathlib import Path
+
+import numpy as np
+from policyengine_us import Microsimulation
+
+from calibration import (
+ _build_constraint_dataframe_and_controls,
+ assess_nonnegative_feasibility,
+)
+from calibration_profiles import get_profile
+from projection_utils import build_household_age_matrix
+from ssa_data import (
+ load_hi_tob_projections,
+ load_oasdi_tob_projections,
+ load_ssa_age_projections,
+ load_ssa_benefit_projections,
+ load_taxable_payroll_projections,
+)
+
+
+DATASET_OPTIONS = {
+ "enhanced_cps_2024": {
+ "path": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+ "base_year": 2024,
+ },
+}
+SELECTED_DATASET = "enhanced_cps_2024"
+BASE_DATASET_PATH = DATASET_OPTIONS[SELECTED_DATASET]["path"]
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description=(
+ "Assess the nonnegative calibration frontier for a named long-term "
+ "profile by solving the minimax relative-error LP."
+ )
+ )
+ parser.add_argument(
+ "--profile",
+ default="ss-payroll",
+ help="Named calibration profile to assess.",
+ )
+ parser.add_argument(
+ "--start-year",
+ type=int,
+ default=2035,
+ help="First year to assess when --years is not provided.",
+ )
+ parser.add_argument(
+ "--end-year",
+ type=int,
+ default=2100,
+ help="Last year to assess when --years is not provided.",
+ )
+ parser.add_argument(
+ "--step",
+ type=int,
+ default=5,
+ help="Year increment when --years is not provided.",
+ )
+ parser.add_argument(
+ "--years",
+ help="Optional comma-separated list of explicit years to assess.",
+ )
+ parser.add_argument(
+ "--output",
+ help="Optional CSV path for the frontier table.",
+ )
+ return parser.parse_args()
+
+
+def parse_years(args: argparse.Namespace) -> list[int]:
+ if args.years:
+ return [int(value.strip()) for value in args.years.split(",") if value.strip()]
+ return list(range(args.start_year, args.end_year + 1, args.step))
+
+
+def reorder_to_households(values, order, n_households: int) -> np.ndarray:
+ ordered = np.zeros(n_households, dtype=float)
+ ordered[order] = np.asarray(values, dtype=float)
+ return ordered
+
+
+def build_constraint_inputs(
+ year: int, hh_id_to_idx: dict, n_households: int, profile
+) -> dict:
+ sim = Microsimulation(dataset=BASE_DATASET_PATH)
+ if profile.use_h6_reform:
+ raise NotImplementedError(
+ "Frontier assessment for H6-enabled profiles is not yet implemented."
+ )
+ household_ids = sim.calculate(
+ "household_id", period=year, map_to="household"
+ ).values
+ if len(household_ids) != n_households:
+ raise ValueError(
+ f"Household count mismatch for {year}: {len(household_ids)} vs {n_households}"
+ )
+ order = np.fromiter(
+ (hh_id_to_idx[hh_id] for hh_id in household_ids),
+ dtype=int,
+ count=len(household_ids),
+ )
+ inputs: dict[str, np.ndarray | float | None] = {
+ "ss_values": None,
+ "ss_target": None,
+ "payroll_values": None,
+ "payroll_target": None,
+ "h6_income_values": None,
+ "h6_revenue_target": None,
+ "oasdi_tob_values": None,
+ "oasdi_tob_target": None,
+ "hi_tob_values": None,
+ "hi_tob_target": None,
+ }
+
+ if profile.use_ss:
+ inputs["ss_values"] = reorder_to_households(
+ sim.calculate("social_security", period=year, map_to="household").values,
+ order,
+ n_households,
+ )
+ inputs["ss_target"] = load_ssa_benefit_projections(year)
+
+ if profile.use_payroll:
+ inputs["payroll_values"] = reorder_to_households(
+ sim.calculate(
+ "taxable_earnings_for_social_security",
+ period=year,
+ map_to="household",
+ ).values
+ + sim.calculate(
+ "social_security_taxable_self_employment_income",
+ period=year,
+ map_to="household",
+ ).values,
+ order,
+ n_households,
+ )
+ inputs["payroll_target"] = load_taxable_payroll_projections(year)
+
+ if profile.use_tob:
+ inputs["oasdi_tob_values"] = reorder_to_households(
+ sim.calculate(
+ "tob_revenue_oasdi",
+ period=year,
+ map_to="household",
+ ).values,
+ order,
+ n_households,
+ )
+ inputs["hi_tob_values"] = reorder_to_households(
+ sim.calculate(
+ "tob_revenue_medicare_hi",
+ period=year,
+ map_to="household",
+ ).values,
+ order,
+ n_households,
+ )
+ inputs["oasdi_tob_target"] = load_oasdi_tob_projections(year)
+ inputs["hi_tob_target"] = load_hi_tob_projections(year)
+
+ del sim
+ gc.collect()
+ return inputs
+
+
+def main() -> int:
+ args = parse_args()
+ years = parse_years(args)
+ if not years:
+ raise ValueError("No years requested.")
+
+ profile = get_profile(args.profile)
+ start_year = min(years)
+ end_year = max(years)
+ target_matrix = load_ssa_age_projections(start_year=start_year, end_year=end_year)
+
+ base_sim = Microsimulation(dataset=BASE_DATASET_PATH)
+ X, household_ids_unique, hh_id_to_idx = build_household_age_matrix(
+ base_sim,
+ n_ages=target_matrix.shape[0],
+ )
+ del base_sim
+ gc.collect()
+
+ rows: list[dict[str, object]] = []
+ print(
+ f"Assessing profile {profile.name!r} for {len(years)} years "
+ f"using {len(household_ids_unique):,} fixed households."
+ )
+ for year in years:
+ year_idx = year - start_year
+ y_target = target_matrix[:, year_idx]
+ inputs = build_constraint_inputs(
+ year,
+ hh_id_to_idx,
+ len(household_ids_unique),
+ profile,
+ )
+ aux_df, controls = _build_constraint_dataframe_and_controls(
+ X,
+ y_target,
+ n_ages=target_matrix.shape[0],
+ **inputs,
+ )
+ targets = np.array(list(controls.values()), dtype=float)
+ feasibility = assess_nonnegative_feasibility(
+ aux_df.to_numpy(dtype=float),
+ targets,
+ )
+ best_case = feasibility["best_case_max_pct_error"]
+ within_tolerance = (
+ best_case is not None and best_case <= profile.max_constraint_error_pct
+ )
+ row = {
+ "year": year,
+ "profile": profile.name,
+ "best_case_max_pct_error": best_case,
+ "within_profile_tolerance": within_tolerance,
+ "quality": "exact" if within_tolerance else "approximate",
+ "status": feasibility["status"],
+ "message": feasibility["message"],
+ }
+ rows.append(row)
+ best_case_display = "n/a" if best_case is None else f"{best_case:.3f}%"
+ print(f"{year}: best-case max error {best_case_display} -> {row['quality']}")
+
+ if args.output:
+ output_path = Path(args.output)
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with output_path.open("w", encoding="utf-8", newline="") as file:
+ writer = csv.DictWriter(file, fieldnames=list(rows[0].keys()))
+ writer.writeheader()
+ writer.writerows(rows)
+ print(f"Wrote {output_path}")
+
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/assess_publishable_horizon.py b/policyengine_us_data/datasets/cps/long_term/assess_publishable_horizon.py
new file mode 100644
index 000000000..dcf5cd08c
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/assess_publishable_horizon.py
@@ -0,0 +1,441 @@
+from __future__ import annotations
+
+import argparse
+import csv
+import gc
+from pathlib import Path
+import re
+import sys
+
+import numpy as np
+
+from policyengine_us import Microsimulation
+
+from calibration import build_calibration_audit, calibrate_weights
+from calibration_profiles import (
+ approximate_window_for_year,
+ classify_calibration_quality,
+ get_profile,
+ validate_calibration_audit,
+)
+from projection_utils import (
+ aggregate_age_targets,
+ aggregate_household_age_matrix,
+ build_age_bins,
+ build_household_age_matrix,
+)
+from ssa_data import (
+ get_long_term_target_source,
+ load_hi_tob_projections,
+ load_oasdi_tob_projections,
+ load_ssa_age_projections,
+ load_ssa_benefit_projections,
+ load_taxable_payroll_projections,
+ set_long_term_target_source,
+)
+
+try:
+ from samplics.weighting import SampleWeight
+except ImportError: # pragma: no cover - only needed for greg profiles
+ SampleWeight = None
+
+
+DEFAULT_BASE_DATASET_PATH = (
+ "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"
+)
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description=(
+ "Assess publishable microsimulation horizon quality for selected years."
+ )
+ )
+ parser.add_argument(
+ "--profile",
+ default="ss-payroll-tob",
+ help="Named calibration profile to assess.",
+ )
+ parser.add_argument(
+ "--target-source",
+ default=get_long_term_target_source(),
+ help="Named long-run target source package.",
+ )
+ parser.add_argument(
+ "--years",
+ default="2075,2080,2085,2090,2095,2100",
+ help="Comma-separated years to assess.",
+ )
+ parser.add_argument(
+ "--base-dataset",
+ default=DEFAULT_BASE_DATASET_PATH,
+ help="Base microsimulation dataset path.",
+ )
+ parser.add_argument(
+ "--output",
+ type=Path,
+ help="Optional CSV output path. Defaults to stdout.",
+ )
+ return parser.parse_args()
+
+
+def parse_years(raw: str) -> list[int]:
+ years = [int(value.strip()) for value in raw.split(",") if value.strip()]
+ if not years:
+ raise ValueError("At least one year must be provided.")
+ return sorted(set(years))
+
+
+def maybe_build_calibrator(method: str):
+ if method != "greg":
+ return None
+ if SampleWeight is None:
+ raise ImportError(
+ "samplics is required for GREG calibration. "
+ "Install with: pip install policyengine-us-data[calibration]"
+ )
+ return SampleWeight()
+
+
+def benchmark_tob_values(
+ year: int,
+ weights: np.ndarray,
+ *,
+ oasdi_tob_values: np.ndarray | None,
+ hi_tob_values: np.ndarray | None,
+) -> dict[str, float] | None:
+ if oasdi_tob_values is None or hi_tob_values is None:
+ return None
+
+ oasdi_target = float(load_oasdi_tob_projections(year))
+ oasdi_achieved = float(np.sum(oasdi_tob_values * weights))
+ hi_target = float(load_hi_tob_projections(year))
+ hi_achieved = float(np.sum(hi_tob_values * weights))
+
+ return {
+ "oasdi_tob_benchmark_pct_error": (
+ 0.0
+ if oasdi_target == 0
+ else (oasdi_achieved - oasdi_target) / oasdi_target * 100
+ ),
+ "hi_tob_benchmark_pct_error": (
+ 0.0 if hi_target == 0 else (hi_achieved - hi_target) / hi_target * 100
+ ),
+ }
+
+
+def assess_years(
+ *,
+ years: list[int],
+ profile_name: str,
+ target_source: str,
+ base_dataset_path: str,
+) -> list[dict[str, object]]:
+ profile = get_profile(profile_name)
+ if profile.use_h6_reform:
+ raise NotImplementedError(
+ "assess_publishable_horizon.py does not yet support H6-calibrated profiles."
+ )
+
+ set_long_term_target_source(target_source)
+ calibrator = maybe_build_calibrator(profile.calibration_method)
+
+ start_year = min(years)
+ end_year = max(years)
+ target_matrix = load_ssa_age_projections(start_year=start_year, end_year=end_year)
+ n_ages = target_matrix.shape[0]
+
+ sim = Microsimulation(dataset=base_dataset_path)
+ X, _, _ = build_household_age_matrix(sim, n_ages)
+ del sim
+ gc.collect()
+
+ aggregated_age_cache: dict[int, tuple[np.ndarray, np.ndarray]] = {}
+ rows: list[dict[str, object]] = []
+
+ for year in years:
+ print(f"[assess_publishable_horizon] year={year}", file=sys.stderr, flush=True)
+ year_idx = year - start_year
+ sim = Microsimulation(dataset=base_dataset_path)
+
+ household_microseries = sim.calculate("household_id", map_to="household")
+ baseline_weights = household_microseries.weights.values
+
+ ss_values = None
+ ss_target = None
+ if profile.use_ss:
+ ss_values = sim.calculate(
+ "social_security",
+ period=year,
+ map_to="household",
+ ).values
+ ss_target = load_ssa_benefit_projections(year)
+
+ payroll_values = None
+ payroll_target = None
+ if profile.use_payroll:
+ taxable_wages = sim.calculate(
+ "taxable_earnings_for_social_security",
+ period=year,
+ map_to="household",
+ ).values
+ taxable_self_employment = sim.calculate(
+ "social_security_taxable_self_employment_income",
+ period=year,
+ map_to="household",
+ ).values
+ payroll_values = taxable_wages + taxable_self_employment
+ payroll_target = load_taxable_payroll_projections(year)
+
+ oasdi_tob_values = None
+ hi_tob_values = None
+ if profile.use_tob or profile.benchmark_tob:
+ oasdi_tob_values = sim.calculate(
+ "tob_revenue_oasdi",
+ period=year,
+ map_to="household",
+ ).values
+ hi_tob_values = sim.calculate(
+ "tob_revenue_medicare_hi",
+ period=year,
+ map_to="household",
+ ).values
+
+ approximate_window = approximate_window_for_year(profile, year)
+ age_bucket_size = (
+ approximate_window.age_bucket_size
+ if approximate_window is not None and approximate_window.age_bucket_size
+ else 1
+ )
+
+ if age_bucket_size > 1:
+ if age_bucket_size not in aggregated_age_cache:
+ age_bins = build_age_bins(n_ages=n_ages, bucket_size=age_bucket_size)
+ aggregated_age_cache[age_bucket_size] = (
+ aggregate_household_age_matrix(X, age_bins),
+ aggregate_age_targets(target_matrix, age_bins),
+ )
+ X_current, aggregated_target_matrix = aggregated_age_cache[age_bucket_size]
+ y_target = aggregated_target_matrix[:, year_idx]
+ else:
+ X_current = X
+ y_target = target_matrix[:, year_idx]
+
+ try:
+ weights, iterations, calibration_event = calibrate_weights(
+ X=X_current,
+ y_target=y_target,
+ baseline_weights=baseline_weights,
+ method=profile.calibration_method,
+ calibrator=calibrator,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=oasdi_tob_values if profile.use_tob else None,
+ oasdi_tob_target=load_oasdi_tob_projections(year)
+ if profile.use_tob
+ else None,
+ hi_tob_values=hi_tob_values if profile.use_tob else None,
+ hi_tob_target=load_hi_tob_projections(year)
+ if profile.use_tob
+ else None,
+ n_ages=X_current.shape[1],
+ max_iters=100,
+ tol=1e-6,
+ verbose=False,
+ allow_fallback_to_ipf=profile.allow_greg_fallback,
+ allow_approximate_entropy=approximate_window is not None,
+ approximate_max_error_pct=(
+ approximate_window.max_constraint_error_pct
+ if approximate_window is not None
+ else None
+ ),
+ )
+ except RuntimeError as error:
+ row: dict[str, object] = {
+ "year": year,
+ "target_source": target_source,
+ "profile": profile.name,
+ "calibration_quality": "failed",
+ "approximation_method": "runtime_error",
+ "iterations": None,
+ "age_bucket_size": age_bucket_size,
+ "window_max_constraint_error_pct": (
+ approximate_window.max_constraint_error_pct
+ if approximate_window is not None
+ else profile.max_constraint_error_pct
+ ),
+ "window_max_age_error_pct": (
+ approximate_window.max_age_error_pct
+ if approximate_window is not None
+ else profile.max_age_error_pct
+ ),
+ "max_constraint_pct_error": None,
+ "age_max_pct_error": None,
+ "positive_weight_count": None,
+ "effective_sample_size": None,
+ "top_10_weight_share_pct": None,
+ "top_100_weight_share_pct": None,
+ "negative_weight_pct": None,
+ "validation_passed": False,
+ "validation_issue_count": 1,
+ "validation_issues": str(error),
+ "runtime_error": str(error),
+ }
+ best_case_match = re.search(r"([0-9.]+)%\s*>\s*([0-9.]+)%", str(error))
+ if best_case_match:
+ row["reported_best_case_constraint_error_pct"] = float(
+ best_case_match.group(1)
+ )
+ row["reported_allowed_constraint_error_pct"] = float(
+ best_case_match.group(2)
+ )
+ rows.append(row)
+ del sim
+ gc.collect()
+ continue
+
+ audit = build_calibration_audit(
+ X=X_current,
+ y_target=y_target,
+ weights=weights,
+ baseline_weights=baseline_weights,
+ calibration_event=calibration_event,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=oasdi_tob_values if profile.use_tob else None,
+ oasdi_tob_target=load_oasdi_tob_projections(year)
+ if profile.use_tob
+ else None,
+ hi_tob_values=hi_tob_values if profile.use_tob else None,
+ hi_tob_target=load_hi_tob_projections(year) if profile.use_tob else None,
+ )
+ audit["calibration_quality"] = classify_calibration_quality(
+ audit,
+ profile,
+ year=year,
+ )
+ audit["age_bucket_size"] = age_bucket_size
+ audit["age_bucket_count"] = int(X_current.shape[1])
+
+ validation_issues = validate_calibration_audit(
+ audit,
+ profile,
+ year=year,
+ )
+ audit["validation_issues"] = validation_issues
+ audit["validation_passed"] = not bool(validation_issues)
+
+ row: dict[str, object] = {
+ "year": year,
+ "target_source": target_source,
+ "profile": profile.name,
+ "calibration_quality": audit["calibration_quality"],
+ "approximation_method": audit.get("approximation_method")
+ or audit.get("method_used"),
+ "iterations": iterations,
+ "age_bucket_size": age_bucket_size,
+ "window_max_constraint_error_pct": (
+ approximate_window.max_constraint_error_pct
+ if approximate_window is not None
+ else profile.max_constraint_error_pct
+ ),
+ "window_max_age_error_pct": (
+ approximate_window.max_age_error_pct
+ if approximate_window is not None
+ else profile.max_age_error_pct
+ ),
+ "max_constraint_pct_error": audit.get("max_constraint_pct_error"),
+ "age_max_pct_error": audit.get("age_max_pct_error"),
+ "positive_weight_count": audit.get("positive_weight_count"),
+ "effective_sample_size": audit.get("effective_sample_size"),
+ "top_10_weight_share_pct": audit.get("top_10_weight_share_pct"),
+ "top_100_weight_share_pct": audit.get("top_100_weight_share_pct"),
+ "negative_weight_pct": audit.get("negative_weight_pct"),
+ "validation_passed": audit["validation_passed"],
+ "validation_issue_count": len(validation_issues),
+ "validation_issues": "; ".join(validation_issues),
+ }
+
+ tob_benchmarks = benchmark_tob_values(
+ year,
+ weights,
+ oasdi_tob_values=oasdi_tob_values,
+ hi_tob_values=hi_tob_values,
+ )
+ if tob_benchmarks is not None:
+ row.update(tob_benchmarks)
+
+ rows.append(row)
+
+ del sim
+ gc.collect()
+
+ return rows
+
+
+def write_rows(rows: list[dict[str, object]], output: Path | None) -> None:
+ if not rows:
+ raise SystemExit("No rows to write.")
+
+ fieldnames = [
+ "year",
+ "target_source",
+ "profile",
+ "calibration_quality",
+ "approximation_method",
+ "iterations",
+ "age_bucket_size",
+ "window_max_constraint_error_pct",
+ "window_max_age_error_pct",
+ "max_constraint_pct_error",
+ "age_max_pct_error",
+ "positive_weight_count",
+ "effective_sample_size",
+ "top_10_weight_share_pct",
+ "top_100_weight_share_pct",
+ "negative_weight_pct",
+ "validation_passed",
+ "validation_issue_count",
+ "validation_issues",
+ "runtime_error",
+ "reported_best_case_constraint_error_pct",
+ "reported_allowed_constraint_error_pct",
+ "oasdi_tob_benchmark_pct_error",
+ "hi_tob_benchmark_pct_error",
+ ]
+
+ if output is None:
+ writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
+ writer.writeheader()
+ writer.writerows(rows)
+ return
+
+ output.parent.mkdir(parents=True, exist_ok=True)
+ with output.open("w", encoding="utf-8", newline="") as file:
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
+ writer.writeheader()
+ writer.writerows(rows)
+
+
+def main() -> int:
+ args = parse_args()
+ rows = assess_years(
+ years=parse_years(args.years),
+ profile_name=args.profile,
+ target_source=args.target_source,
+ base_dataset_path=args.base_dataset,
+ )
+ write_rows(rows, args.output)
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/benchmark_trustees_bracket_indexing.py b/policyengine_us_data/datasets/cps/long_term/benchmark_trustees_bracket_indexing.py
new file mode 100644
index 000000000..0929c1c2e
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/benchmark_trustees_bracket_indexing.py
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+try:
+ from .tax_assumptions import (
+ create_wage_indexed_core_thresholds_reform,
+ create_wage_indexed_full_irs_uprating_reform,
+ )
+except ImportError: # pragma: no cover - script execution fallback
+ from tax_assumptions import (
+ create_wage_indexed_core_thresholds_reform,
+ create_wage_indexed_full_irs_uprating_reform,
+ )
+
+
+FILING_STATUSES = [
+ "SINGLE",
+ "JOINT",
+ "SEPARATE",
+ "HEAD_OF_HOUSEHOLD",
+ "SURVIVING_SPOUSE",
+]
+
+
+def round_down(amount: float, interval: float) -> float:
+ return math.floor(amount / interval) * interval
+
+
+def create_wage_indexed_brackets_reform(
+ start_year: int = 2035,
+ end_year: int = 2100,
+):
+ from policyengine_us.model_api import Reform
+
+ def modify_parameters(parameters):
+ nawi = parameters.gov.ssa.nawi
+ thresholds = parameters.gov.irs.income.bracket.thresholds
+
+ for bracket in map(str, range(1, 7)):
+ bracket_node = thresholds.get_child(bracket)
+ for filing_status in FILING_STATUSES:
+ parameter = bracket_node.get_child(filing_status)
+ interval = float(parameter.metadata["uprating"]["rounding"]["interval"])
+
+ for year in range(start_year, end_year + 1):
+ previous_value = float(parameter(f"{year - 1}-01-01"))
+ wage_growth = float(nawi(f"{year - 1}-01-01")) / float(
+ nawi(f"{year - 2}-01-01")
+ )
+ updated_value = round_down(previous_value * wage_growth, interval)
+ parameter.update(
+ period=f"year:{year}-01-01:1",
+ value=updated_value,
+ )
+ return parameters
+
+ class reform(Reform):
+ def apply(self):
+ self.modify_parameters(modify_parameters)
+
+ return reform
+
+
+def _coerce_h5_path(raw: str) -> Path:
+ path = Path(raw).expanduser()
+ if path.is_dir():
+ matches = sorted(path.glob("*.h5"))
+ if len(matches) != 1:
+ raise ValueError(
+ f"Expected exactly one .h5 file in {path}, found {len(matches)}"
+ )
+ return matches[0]
+ if path.suffix == ".metadata.json":
+ return path.with_suffix("").with_suffix(".h5")
+ return path
+
+
+def _load_metadata(h5_path: Path) -> dict | None:
+ metadata_path = h5_path.with_suffix(".h5.metadata.json")
+ if not metadata_path.exists():
+ return None
+ return json.loads(metadata_path.read_text(encoding="utf-8"))
+
+
+def _baseline_record(h5_path: Path, metadata: dict | None) -> dict:
+ if metadata is None:
+ return {
+ "year": int(h5_path.stem),
+ "baseline_oasdi_share_pct": None,
+ "baseline_combined_share_pct": None,
+ "target_oasdi_share_pct": None,
+ }
+
+ audit = metadata["calibration_audit"]
+ ss_total = float(audit["constraints"]["ss_total"]["achieved"])
+ ss_target = float(audit["constraints"]["ss_total"]["target"])
+ tob_section = audit.get("benchmarks") or audit.get("constraints")
+ oasdi_actual = float(tob_section["oasdi_tob"]["achieved"])
+ hi_actual = float(tob_section["hi_tob"]["achieved"])
+ oasdi_target = float(tob_section["oasdi_tob"]["target"])
+
+ return {
+ "year": int(metadata["year"]),
+ "baseline_oasdi_share_pct": 100 * oasdi_actual / ss_total,
+ "baseline_combined_share_pct": 100 * (oasdi_actual + hi_actual) / ss_total,
+ "target_oasdi_share_pct": 100 * oasdi_target / ss_target,
+ }
+
+
+def _compute_reformed_shares(
+ h5_path: Path,
+ start_year: int,
+ end_year: int,
+ scenario: str,
+) -> dict:
+ from policyengine_us import Microsimulation
+
+ if scenario == "brackets":
+ reform = create_wage_indexed_brackets_reform(
+ start_year=start_year,
+ end_year=end_year,
+ )
+ elif scenario == "core-thresholds":
+ reform = create_wage_indexed_core_thresholds_reform(
+ start_year=start_year,
+ end_year=end_year,
+ )
+ elif scenario == "irs-uprating":
+ reform = create_wage_indexed_full_irs_uprating_reform(
+ start_year=start_year,
+ end_year=end_year,
+ )
+ else:
+ raise ValueError(f"Unknown scenario: {scenario}")
+
+ sim = Microsimulation(dataset=str(h5_path), reform=reform)
+ ss_total = float(sim.calculate("social_security").sum())
+ oasdi_tob = float(sim.calculate("tob_revenue_oasdi").sum())
+ hi_tob = float(sim.calculate("tob_revenue_medicare_hi").sum())
+ return {
+ "reformed_oasdi_share_pct": 100 * oasdi_tob / ss_total,
+ "reformed_combined_share_pct": 100 * (oasdi_tob + hi_tob) / ss_total,
+ }
+
+
+def _format_markdown(records: list[dict]) -> str:
+ header = (
+ "| Year | Scenario | Trustees OASDI target | Baseline OASDI | "
+ "Reformed OASDI | OASDI delta | Baseline combined | Reformed combined |\n"
+ "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |"
+ )
+ rows = []
+ for record in records:
+ baseline_oasdi = record["baseline_oasdi_share_pct"]
+ reformed_oasdi = record["reformed_oasdi_share_pct"]
+ baseline_combined = record["baseline_combined_share_pct"]
+ reformed_combined = record["reformed_combined_share_pct"]
+ rows.append(
+ "| {year} | {scenario} | {target:.2f}% | {base_o:.2f}% | {reform_o:.2f}% | {delta:+.2f} pp | "
+ "{base_c:.2f}% | {reform_c:.2f}% |".format(
+ year=record["year"],
+ scenario=record["scenario"],
+ target=record["target_oasdi_share_pct"],
+ base_o=baseline_oasdi,
+ reform_o=reformed_oasdi,
+ delta=reformed_oasdi - baseline_oasdi,
+ base_c=baseline_combined,
+ reform_c=reformed_combined,
+ )
+ )
+ return "\n".join([header, *rows])
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description=(
+ "Benchmark calibrated long-run H5s under a wage-indexed ordinary "
+ "income-tax bracket sensitivity."
+ ),
+ )
+ parser.add_argument(
+ "paths",
+ nargs="+",
+ help="H5 files, metadata files, or directories containing a single H5.",
+ )
+ parser.add_argument(
+ "--policyengine-us-path",
+ help=(
+ "Optional local policyengine-us checkout to prepend to sys.path. "
+ "Use this when the required tax-side fix is not yet released."
+ ),
+ )
+ parser.add_argument(
+ "--start-year",
+ type=int,
+ default=2035,
+ help="First year to switch ordinary federal brackets from CPI to wages.",
+ )
+ parser.add_argument(
+ "--end-year",
+ type=int,
+ help=(
+ "Last year to extend the wage-indexed sensitivity through. "
+ "Defaults to the maximum year among the input H5s."
+ ),
+ )
+ parser.add_argument(
+ "--format",
+ choices=("markdown", "json"),
+ default="markdown",
+ help="Output format.",
+ )
+ parser.add_argument(
+ "--scenario",
+ choices=("brackets", "core-thresholds", "irs-uprating"),
+ default="brackets",
+ help=(
+ "Tax-side sensitivity to run: wage-index only ordinary bracket "
+ "thresholds, wage-index a core threshold set, or wage-index the "
+ "full IRS uprating path."
+ ),
+ )
+ return parser.parse_args()
+
+
+def main() -> int:
+ args = parse_args()
+ if args.policyengine_us_path:
+ sys.path.insert(0, str(Path(args.policyengine_us_path).expanduser()))
+
+ h5_paths = [_coerce_h5_path(path) for path in args.paths]
+ end_year = args.end_year or max(int(path.stem) for path in h5_paths)
+ records = []
+
+ for h5_path in h5_paths:
+ metadata = _load_metadata(h5_path)
+ baseline = _baseline_record(h5_path, metadata)
+ print(
+ f"[{baseline['year']}] benchmarking {args.scenario} on {h5_path}",
+ file=sys.stderr,
+ flush=True,
+ )
+ reformed = _compute_reformed_shares(
+ h5_path,
+ start_year=args.start_year,
+ end_year=end_year,
+ scenario=args.scenario,
+ )
+ baseline.update(reformed)
+ baseline["scenario"] = args.scenario
+ records.append(baseline)
+
+ records.sort(key=lambda record: record["year"])
+ if args.format == "json":
+ print(json.dumps(records, indent=2, sort_keys=True))
+ else:
+ print(_format_markdown(records))
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/build_long_term_target_sources.py b/policyengine_us_data/datasets/cps/long_term/build_long_term_target_sources.py
new file mode 100644
index 000000000..20655c354
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/build_long_term_target_sources.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import pandas as pd
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+
+SOURCES_DIR = STORAGE_FOLDER / "long_term_target_sources"
+LEGACY_AUX_PATH = STORAGE_FOLDER / "social_security_aux.csv"
+OACT_DELTA_PATH = SOURCES_DIR / "oasdi_oact_20250805_nominal_delta.csv"
+TRUSTEES_OUTPUT_PATH = SOURCES_DIR / "trustees_2025_current_law.csv"
+OACT_OUTPUT_PATH = SOURCES_DIR / "oact_2025_08_05_provisional.csv"
+MANIFEST_PATH = SOURCES_DIR / "sources.json"
+
+
+def build_trustees_source() -> pd.DataFrame:
+ trustees = pd.read_csv(LEGACY_AUX_PATH).copy()
+ trustees.to_csv(TRUSTEES_OUTPUT_PATH, index=False)
+ return trustees
+
+
+def build_oact_source(trustees: pd.DataFrame) -> pd.DataFrame:
+ delta = pd.read_csv(OACT_DELTA_PATH).copy()
+ if 2100 not in set(delta.year):
+ delta = pd.concat(
+ [
+ delta,
+ pd.DataFrame(
+ {
+ "year": [2100],
+ "oasdi_nominal_delta_billions": [
+ float(delta.iloc[-1]["oasdi_nominal_delta_billions"])
+ ],
+ }
+ ),
+ ],
+ ignore_index=True,
+ )
+
+ merged = trustees.merge(delta, on="year", how="left")
+ if merged["oasdi_nominal_delta_billions"].isna().any():
+ missing_years = merged.loc[
+ merged["oasdi_nominal_delta_billions"].isna(), "year"
+ ].tolist()
+ raise ValueError(f"Missing OACT OASDI deltas for years: {missing_years}")
+
+ merged["oasdi_tob_billions_nominal_usd"] = (
+ merged["oasdi_tob_billions_nominal_usd"]
+ + merged["oasdi_nominal_delta_billions"]
+ )
+ ratio = (
+ merged["oasdi_tob_billions_nominal_usd"]
+ / trustees["oasdi_tob_billions_nominal_usd"]
+ )
+ merged["hi_tob_billions_nominal_usd"] = (
+ trustees["hi_tob_billions_nominal_usd"] * ratio
+ )
+ merged["oasdi_tob_pct_of_taxable_payroll"] = (
+ merged["oasdi_tob_billions_nominal_usd"]
+ / merged["taxable_payroll_in_billion_nominal_usd"]
+ * 100
+ )
+ merged = merged.drop(columns=["oasdi_nominal_delta_billions"])
+ merged.to_csv(OACT_OUTPUT_PATH, index=False)
+ return merged
+
+
+def write_manifest() -> None:
+ manifest = {
+ "default_source": "trustees_2025_current_law",
+ "sources": {
+ "trustees_2025_current_law": {
+ "name": "trustees_2025_current_law",
+ "file": TRUSTEES_OUTPUT_PATH.name,
+ "type": "trustees_current_law",
+ "description": (
+ "2025 Trustees current-law baseline used by the legacy "
+ "long-term calibration stack."
+ ),
+ "source_urls": [
+ "https://www.ssa.gov/oact/tr/2025/lrIndex.html",
+ "https://www.ssa.gov/oact/solvency/provisions/tables/table_run133.html",
+ ],
+ "notes": [
+ "Generated from social_security_aux.csv for explicit source selection.",
+ ],
+ },
+ "oact_2025_08_05_provisional": {
+ "name": "oact_2025_08_05_provisional",
+ "file": OACT_OUTPUT_PATH.name,
+ "type": "oact_override",
+ "description": (
+ "Post-OBBBA SSA OACT baseline overlay with provisional HI "
+ "bridge for long-term calibration experiments."
+ ),
+ "source_urls": [
+ "https://www.ssa.gov/OACT/solvency/RWyden_20250805.pdf",
+ "https://www.ssa.gov/oact/tr/2025/lrIndex.html",
+ ],
+ "notes": [
+ "OASDI TOB nominal deltas are taken from the August 5, 2025 OACT letter.",
+ "2100 OASDI delta is carried forward from 2099 because the published delta table ends at 2099.",
+ "HI TOB series is provisional: it applies the same percentage change as OASDI TOB to preserve the OASDI/HI share split until a published annual HI replacement series is available.",
+ ],
+ "derived_from": "trustees_2025_current_law",
+ "hi_method": "match_oasdi_pct_change",
+ },
+ },
+ }
+ MANIFEST_PATH.write_text(
+ json.dumps(manifest, indent=2, sort_keys=True) + "\n",
+ encoding="utf-8",
+ )
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description="Build explicit long-term target source packages.",
+ )
+ return parser.parse_args()
+
+
+def main() -> int:
+ parse_args()
+ SOURCES_DIR.mkdir(parents=True, exist_ok=True)
+ trustees = build_trustees_source()
+ build_oact_source(trustees)
+ write_manifest()
+ print(f"Wrote {TRUSTEES_OUTPUT_PATH}")
+ print(f"Wrote {OACT_OUTPUT_PATH}")
+ print(f"Wrote {MANIFEST_PATH}")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/calibration.py b/policyengine_us_data/datasets/cps/long_term/calibration.py
index e9d353727..92dc0a4ec 100644
--- a/policyengine_us_data/datasets/cps/long_term/calibration.py
+++ b/policyengine_us_data/datasets/cps/long_term/calibration.py
@@ -1,5 +1,19 @@
import numpy as np
import pandas as pd
+from scipy import optimize, sparse
+
+
+def _pct_error(achieved, target):
+ if target == 0:
+ return 0.0 if achieved == 0 else float("inf")
+ return (achieved - target) / target * 100
+
+
+def _relative_errors(achieved, target):
+ target = np.asarray(target, dtype=float)
+ achieved = np.asarray(achieved, dtype=float)
+ denominator = np.maximum(np.abs(target), 1e-10)
+ return (achieved - target) / denominator
def iterative_proportional_fitting(
@@ -47,7 +61,7 @@ def iterative_proportional_fitting(
if verbose and (iter_num % 10 == 0 or rel_change < tol):
predictions_new = X.T @ w
- rel_errors = np.abs(predictions_new - y) / y
+ rel_errors = np.abs(_relative_errors(predictions_new, y))
max_rel_error = rel_errors.max()
print(
f"Iteration {iter_num:3d}: Max relative error = {max_rel_error:.6f}, Weight change = {rel_change:.6e}"
@@ -66,8 +80,8 @@ def iterative_proportional_fitting(
"iterations": iter_num + 1,
"predictions_initial": predictions_initial,
"predictions_new": predictions_final,
- "relative_errors_initial": (predictions_initial - y) / y,
- "relative_errors_new": (predictions_final - y) / y,
+ "relative_errors_initial": _relative_errors(predictions_initial, y),
+ "relative_errors_new": _relative_errors(predictions_final, y),
"weight_ratio": w / w_initial,
}
@@ -166,6 +180,550 @@ def calibrate_greg(
return w_new, 1
+def _build_constraint_dataframe_and_controls(
+ X,
+ y_target,
+ *,
+ ss_values=None,
+ ss_target=None,
+ payroll_values=None,
+ payroll_target=None,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=None,
+ oasdi_tob_target=None,
+ hi_tob_values=None,
+ hi_tob_target=None,
+ n_ages=86,
+):
+ controls = {}
+ age_cols = {f"age_{i}": X[:, i].astype(float) for i in range(n_ages)}
+ aux_df = pd.DataFrame(age_cols)
+
+ for age_idx in range(n_ages):
+ controls[f"age_{age_idx}"] = float(y_target[age_idx])
+
+ if ss_values is not None and ss_target is not None:
+ aux_df["ss_total"] = np.asarray(ss_values, dtype=float)
+ controls["ss_total"] = float(ss_target)
+
+ if payroll_values is not None and payroll_target is not None:
+ aux_df["payroll_total"] = np.asarray(payroll_values, dtype=float)
+ controls["payroll_total"] = float(payroll_target)
+
+ if h6_income_values is not None and h6_revenue_target is not None:
+ aux_df["h6_revenue"] = np.asarray(h6_income_values, dtype=float)
+ controls["h6_revenue"] = float(h6_revenue_target)
+
+ if oasdi_tob_values is not None and oasdi_tob_target is not None:
+ aux_df["oasdi_tob"] = np.asarray(oasdi_tob_values, dtype=float)
+ controls["oasdi_tob"] = float(oasdi_tob_target)
+
+ if hi_tob_values is not None and hi_tob_target is not None:
+ aux_df["hi_tob"] = np.asarray(hi_tob_values, dtype=float)
+ controls["hi_tob"] = float(hi_tob_target)
+
+ return aux_df, controls
+
+
+def calibrate_entropy(
+ X,
+ y_target,
+ baseline_weights,
+ ss_values=None,
+ ss_target=None,
+ payroll_values=None,
+ payroll_target=None,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=None,
+ oasdi_tob_target=None,
+ hi_tob_values=None,
+ hi_tob_target=None,
+ n_ages=86,
+ max_iters=500,
+ tol=1e-10,
+):
+ """
+ Positive calibration via entropy balancing.
+
+ Finds strictly positive weights minimizing KL divergence from the baseline
+ weights while matching all requested calibration constraints.
+ """
+
+ aux_df, controls = _build_constraint_dataframe_and_controls(
+ X,
+ y_target,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ )
+
+ A = aux_df.to_numpy(dtype=float)
+ targets = np.array(list(controls.values()), dtype=float)
+ scales = np.maximum(
+ np.maximum(np.abs(targets), np.abs(A.T @ baseline_weights)),
+ 1.0,
+ )
+ A_scaled = A / scales
+ targets_scaled = targets / scales
+
+ baseline_weights = np.asarray(baseline_weights, dtype=float)
+ gram = A_scaled.T @ (baseline_weights[:, None] * A_scaled)
+ gram += np.eye(gram.shape[0]) * 1e-12
+ beta0 = np.linalg.solve(gram, targets_scaled - (A_scaled.T @ baseline_weights))
+
+ def objective_gradient_hessian(beta):
+ eta = np.clip(A_scaled @ beta, -700, 700)
+ exp_eta = np.exp(eta)
+ weights = baseline_weights * exp_eta
+ objective = float(np.sum(weights) - targets_scaled @ beta)
+ gradient = A_scaled.T @ weights - targets_scaled
+ hessian = A_scaled.T @ (weights[:, None] * A_scaled)
+ return objective, gradient, hessian
+
+ def solve_with_root(beta_start):
+ _cache = {}
+
+ def _cached_ogh(z):
+ key = z.tobytes()
+ if key not in _cache:
+ _cache.clear()
+ _cache[key] = objective_gradient_hessian(z)
+ return _cache[key]
+
+ result = optimize.root(
+ lambda z: _cached_ogh(z)[1],
+ beta_start,
+ jac=lambda z: _cached_ogh(z)[2],
+ method="hybr",
+ options={"xtol": tol},
+ )
+ if not result.success:
+ return None
+ _, gradient, _ = objective_gradient_hessian(result.x)
+ max_error = float(
+ np.max(100 * np.abs(gradient) / np.maximum(np.abs(targets_scaled), 1e-12))
+ )
+ if max_error > tol * 100:
+ return None
+ return result.x, result.nfev
+
+ def infeasibility_error(prefix):
+ feasibility = assess_nonnegative_feasibility(A, targets)
+ if feasibility["success"]:
+ return RuntimeError(
+ f"{prefix}. Nonnegative exact calibration appears infeasible under current support; "
+ f"best achievable max relative constraint error is "
+ f"{feasibility['best_case_max_pct_error']:.3f}%."
+ )
+ return RuntimeError(
+ f"{prefix}. Nonnegative feasibility diagnostic could not certify a solution: "
+ f"{feasibility['message']}"
+ )
+
+ beta = beta0.copy()
+ iterations = 0
+ final_max_error = float("inf")
+
+ for iterations in range(1, max_iters + 1):
+ objective, gradient, hessian = objective_gradient_hessian(beta)
+ final_max_error = float(
+ np.max(100 * np.abs(gradient) / np.maximum(np.abs(targets_scaled), 1e-12))
+ )
+ if final_max_error <= tol * 100:
+ break
+
+ hessian += np.eye(hessian.shape[0]) * 1e-12
+ try:
+ delta = np.linalg.solve(hessian, gradient)
+ except np.linalg.LinAlgError:
+ delta = np.linalg.lstsq(hessian, gradient, rcond=None)[0]
+
+ step = 1.0
+ while step >= 1e-8:
+ candidate = beta - step * delta
+ candidate_objective, candidate_gradient, _ = objective_gradient_hessian(
+ candidate
+ )
+ candidate_max_error = float(
+ np.max(
+ 100
+ * np.abs(candidate_gradient)
+ / np.maximum(np.abs(targets_scaled), 1e-12)
+ )
+ )
+ if np.isfinite(candidate_objective) and (
+ candidate_objective <= objective + 1e-12
+ or candidate_max_error < final_max_error
+ ):
+ beta = candidate
+ break
+ step /= 2.0
+
+ if step < 1e-8:
+ root_solution = solve_with_root(beta)
+ if root_solution is not None:
+ beta, root_iterations = root_solution
+ iterations += int(root_iterations)
+ break
+ raise infeasibility_error(
+ "Entropy calibration line search failed to find a descent step"
+ )
+ else:
+ root_solution = solve_with_root(beta)
+ if root_solution is None:
+ raise infeasibility_error(
+ "Entropy calibration failed: "
+ f"max constraint error remained {final_max_error:.6f}% "
+ f"after {max_iters} iterations"
+ )
+ beta, root_iterations = root_solution
+ iterations += int(root_iterations)
+
+ eta = np.clip(A_scaled @ beta, -700, 700)
+ weights = baseline_weights * np.exp(eta)
+ return weights, iterations
+
+
+def calibrate_entropy_bounded(
+ X,
+ y_target,
+ baseline_weights,
+ ss_values=None,
+ ss_target=None,
+ payroll_values=None,
+ payroll_target=None,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=None,
+ oasdi_tob_target=None,
+ hi_tob_values=None,
+ hi_tob_target=None,
+ n_ages=86,
+ max_constraint_error_pct=0.0,
+ max_iters=500,
+ tol=1e-9,
+ warm_weights=None,
+):
+ """
+ Approximate positive calibration via entropy balancing inside an error box.
+
+ This keeps the entropy objective, but relaxes the constraints to
+    `|A'w - y| / scale <= epsilon`, where `epsilon` is the allowed maximum
+    percent error divided by 100 and `scale = max(|target|, 1)`. Its weights
+    are much denser than the sparse LP minimax solution, making it a better
+    approximate fallback for microsimulation when exact calibration is infeasible.
+ """
+
+ aux_df, controls = _build_constraint_dataframe_and_controls(
+ X,
+ y_target,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ )
+
+ A = aux_df.to_numpy(dtype=float)
+ targets = np.array(list(controls.values()), dtype=float)
+ baseline_weights = np.asarray(baseline_weights, dtype=float)
+
+ epsilon = max(float(max_constraint_error_pct) / 100.0, 0.0)
+ scales = np.maximum(np.abs(targets), 1.0)
+ A_scaled = A / scales
+ targets_scaled = targets / scales
+ lower_bounds = targets_scaled - epsilon
+ upper_bounds = targets_scaled + epsilon
+
+ gram = A_scaled.T @ (baseline_weights[:, None] * A_scaled)
+ gram += np.eye(gram.shape[0]) * 1e-12
+ beta0 = np.linalg.solve(
+ gram,
+ targets_scaled - (A_scaled.T @ baseline_weights),
+ )
+
+ def objective_and_gradient(z):
+ n_constraints = len(targets_scaled)
+ alpha = z[:n_constraints]
+ gamma = z[n_constraints:]
+ beta = gamma - alpha
+ eta = np.clip(A_scaled @ beta, -700, 700)
+ exp_eta = np.exp(eta)
+ weights = baseline_weights * exp_eta
+ achieved = A_scaled.T @ weights
+ objective = float(np.sum(weights) + upper_bounds @ alpha - lower_bounds @ gamma)
+ gradient = np.concatenate(
+ [
+ upper_bounds - achieved,
+ achieved - lower_bounds,
+ ]
+ )
+ return objective, gradient, weights
+
+ starts = [
+ np.zeros(len(targets_scaled) * 2, dtype=float),
+ np.concatenate(
+ [
+ np.maximum(-beta0, 0.0),
+ np.maximum(beta0, 0.0),
+ ]
+ ),
+ ]
+
+ for weights_start in warm_weights or []:
+ weights_start = np.asarray(weights_start, dtype=float)
+ if weights_start.shape != baseline_weights.shape:
+ continue
+ ratios = np.clip(
+ weights_start / np.maximum(baseline_weights, 1e-300),
+ 1e-300,
+ 1e300,
+ )
+ beta_start, *_ = np.linalg.lstsq(
+ A_scaled,
+ np.log(ratios),
+ rcond=None,
+ )
+ starts.append(
+ np.concatenate(
+ [
+ np.maximum(-beta_start, 0.0),
+ np.maximum(beta_start, 0.0),
+ ]
+ )
+ )
+
+ def objective_with_gradient(z):
+ objective, gradient, _ = objective_and_gradient(z)
+ return objective, gradient
+
+ best_result = None
+ best_weights = None
+ best_max_error_pct = float("inf")
+
+ for start in starts:
+ result = optimize.minimize(
+ objective_with_gradient,
+ start,
+ jac=True,
+ method="L-BFGS-B",
+ bounds=[(0.0, None)] * len(start),
+ options={"maxiter": max_iters, "ftol": tol},
+ )
+
+ objective, gradient, weights = objective_and_gradient(result.x)
+ achieved = A_scaled.T @ weights
+ max_error_pct = float(np.max(np.abs(achieved - targets_scaled)) * 100)
+
+ if max_error_pct < best_max_error_pct:
+ best_result = result
+ best_weights = weights
+ best_max_error_pct = max_error_pct
+
+ if result.success and max_error_pct <= max_constraint_error_pct + 1e-6:
+ return (
+ np.asarray(weights, dtype=float),
+ int(result.nit),
+ {
+ "success": True,
+ "best_case_max_pct_error": max_error_pct,
+ "status": int(result.status),
+ "message": result.message,
+ },
+ )
+
+ if best_result is None or best_weights is None:
+ raise RuntimeError("Approximate bounded entropy calibration did not run.")
+
+ raise RuntimeError(
+ "Approximate bounded entropy calibration failed: "
+ f"best achieved max relative constraint error was "
+ f"{best_max_error_pct:.3f}%"
+ )
+
+
+def densify_lp_solution(
+ A,
+ targets,
+ baseline_weights,
+ lp_weights,
+ max_constraint_error_pct,
+ *,
+ iterations=30,
+):
+ """
+ Blend an LP solution back toward the baseline while staying inside the
+ allowed error band.
+
+ This preserves feasibility while avoiding the most extreme support collapse
+ of a pure LP basic-feasible-point solution.
+ """
+
+ A = np.asarray(A, dtype=float)
+ targets = np.asarray(targets, dtype=float)
+ baseline_weights = np.asarray(baseline_weights, dtype=float)
+ lp_weights = np.asarray(lp_weights, dtype=float)
+
+ scales = np.maximum(np.abs(targets), 1.0)
+ A_scaled = A / scales
+ targets_scaled = targets / scales
+
+ best_lambda = 0.0
+ best_weights = lp_weights.copy()
+ best_error_pct = float(
+ np.max(np.abs(A_scaled.T @ best_weights - targets_scaled)) * 100
+ )
+
+ lo = 0.0
+ hi = 1.0
+ for _ in range(iterations):
+ lam = (lo + hi) / 2.0
+ candidate_weights = (1.0 - lam) * lp_weights + lam * baseline_weights
+ candidate_error_pct = float(
+ np.max(np.abs(A_scaled.T @ candidate_weights - targets_scaled)) * 100
+ )
+ if candidate_error_pct <= max_constraint_error_pct + 1e-6:
+ best_lambda = lam
+ best_weights = candidate_weights
+ best_error_pct = candidate_error_pct
+ lo = lam
+ else:
+ hi = lam
+
+ return best_weights, {
+ "blend_lambda": best_lambda,
+ "best_case_max_pct_error": best_error_pct,
+ "densification_effective": best_lambda > 0.0,
+ }
+
+
+def calibrate_lp_minimax(
+ X,
+ y_target,
+ baseline_weights,
+ ss_values=None,
+ ss_target=None,
+ payroll_values=None,
+ payroll_target=None,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=None,
+ oasdi_tob_target=None,
+ hi_tob_values=None,
+ hi_tob_target=None,
+ n_ages=86,
+):
+ """
+ Approximate nonnegative calibration via minimax relative-error LP.
+
+ This is a robust feasibility fallback and certificate, but it tends to
+ produce sparse extreme-point solutions. Prefer bounded entropy when we want
+ approximate weights to remain usable as microsimulation support.
+ """
+
+ aux_df, controls = _build_constraint_dataframe_and_controls(
+ X,
+ y_target,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ )
+
+ A = aux_df.to_numpy(dtype=float)
+ targets = np.array(list(controls.values()), dtype=float)
+ feasibility = assess_nonnegative_feasibility(A, targets, return_weights=True)
+ weights = feasibility.get("weights")
+ if not feasibility["success"] or weights is None:
+ raise RuntimeError(
+ f"Approximate nonnegative calibration failed: {feasibility['message']}"
+ )
+
+ return np.asarray(weights, dtype=float), 1, feasibility
+
+
+def assess_nonnegative_feasibility(A, targets, *, return_weights=False):
+ """
+ Solve for the minimum uniform relative error achievable with nonnegative weights.
+
+    Returns a dict with `success`, `best_case_max_pct_error`, `status`, and
+ """
+ A = np.asarray(A, dtype=float)
+ targets = np.asarray(targets, dtype=float)
+ if A.shape[1] == len(targets):
+ constraint_by_unit = A.T
+ elif A.shape[0] == len(targets):
+ constraint_by_unit = A
+ else:
+ raise ValueError(
+ "Constraint matrix shape does not match targets: "
+ f"{A.shape} vs {targets.shape}"
+ )
+
+ scales = np.maximum(np.abs(targets), 1.0)
+ A_rel = constraint_by_unit / scales[:, None]
+ b_rel = targets / scales
+
+ constraint_matrix = sparse.csr_matrix(A_rel)
+ epsilon_column = sparse.csc_matrix(np.ones((constraint_matrix.shape[0], 1)))
+ A_ub = sparse.vstack(
+ [
+ sparse.hstack([constraint_matrix, -epsilon_column]),
+ sparse.hstack([-constraint_matrix, -epsilon_column]),
+ ],
+ format="csc",
+ )
+ b_ub = np.concatenate([b_rel, -b_rel])
+ c = np.zeros(constraint_matrix.shape[1] + 1)
+ c[-1] = 1.0
+ bounds = [(0, None)] * constraint_matrix.shape[1] + [(0, None)]
+
+ result = optimize.linprog(
+ c,
+ A_ub=A_ub,
+ b_ub=b_ub,
+ bounds=bounds,
+ method="highs",
+ )
+
+ result_dict = {
+ "success": bool(result.success),
+ "best_case_max_pct_error": (
+ float(result.x[-1] * 100) if result.success else None
+ ),
+ "status": int(result.status),
+ "message": result.message,
+ }
+ if return_weights:
+ result_dict["weights"] = (
+ np.asarray(result.x[:-1], dtype=float) if result.success else None
+ )
+ return result_dict
+
+
def calibrate_weights(
X,
y_target,
@@ -186,6 +744,9 @@ def calibrate_weights(
max_iters=100,
tol=1e-6,
verbose=False,
+ allow_fallback_to_ipf=True,
+ allow_approximate_entropy=False,
+ approximate_max_error_pct=None,
):
"""
Unified interface for weight calibration.
@@ -214,12 +775,27 @@ def calibrate_weights(
Returns:
w_new: Calibrated weights
iterations: Number of iterations
+        audit: Metadata about calibration method selection, fallbacks, and approximation quality
"""
+ audit = {
+ "method_requested": method,
+ "method_used": method,
+ "greg_attempted": method == "greg",
+ "greg_error": None,
+ "entropy_error": None,
+ "approximate_entropy_error": None,
+ "fell_back_to_ipf": False,
+ "lp_fallback_used": False,
+ "approximate_solution_used": False,
+ "approximation_method": None,
+ "approximate_solution_error_pct": None,
+ }
+
if method == "greg":
if calibrator is None:
raise ValueError("calibrator required for GREG method")
try:
- return calibrate_greg(
+ w_new, iterations = calibrate_greg(
calibrator,
X,
y_target,
@@ -236,15 +812,260 @@ def calibrate_weights(
hi_tob_target,
n_ages,
)
+ return w_new, iterations, audit
except Exception as e:
+ audit["greg_error"] = str(e)
+ if not allow_fallback_to_ipf:
+ raise RuntimeError(
+ "GREG calibration failed while fallback was disabled"
+ ) from e
if verbose:
print(f"GREG failed: {e}, falling back to IPF")
w_new, info = iterative_proportional_fitting(
X, y_target, baseline_weights, max_iters, tol, verbose
)
- return w_new, info["iterations"]
+ audit["method_used"] = "ipf"
+ audit["fell_back_to_ipf"] = True
+ return w_new, info["iterations"], audit
+ elif method == "entropy":
+ try:
+ w_new, iterations = calibrate_entropy(
+ X,
+ y_target,
+ baseline_weights,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ max_iters=max_iters * 5,
+ tol=max(tol, 1e-10),
+ )
+ return w_new, iterations, audit
+ except RuntimeError as error:
+ audit["entropy_error"] = str(error)
+ w_new, iterations, feasibility = calibrate_lp_minimax(
+ X,
+ y_target,
+ baseline_weights,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ )
+ approximate_error_pct = float(feasibility["best_case_max_pct_error"])
+ if approximate_error_pct <= max(tol * 100, 1e-6):
+ audit["lp_fallback_used"] = True
+ audit["approximation_method"] = "lp_minimax_exact"
+ audit["approximate_solution_error_pct"] = approximate_error_pct
+ return w_new, iterations, audit
+
+ if not allow_approximate_entropy:
+ raise
+
+ if (
+ approximate_max_error_pct is not None
+ and approximate_error_pct > approximate_max_error_pct
+ ):
+ raise RuntimeError(
+ "Approximate entropy fallback exceeded allowable error: "
+ f"{approximate_error_pct:.3f}% > {approximate_max_error_pct:.3f}%"
+ ) from error
+
+ dense_lp_weights = None
+ dense_lp_info = None
+ try:
+ aux_df, controls = _build_constraint_dataframe_and_controls(
+ X,
+ y_target,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ )
+ dense_lp_weights, dense_lp_info = densify_lp_solution(
+ aux_df.to_numpy(dtype=float),
+ np.array(list(controls.values()), dtype=float),
+ baseline_weights,
+ w_new,
+ approximate_max_error_pct,
+ )
+ except Exception:
+ dense_lp_weights = None
+ dense_lp_info = None
+
+ if approximate_max_error_pct is not None:
+ try:
+ warm_weights = [w_new]
+ if dense_lp_weights is not None:
+ warm_weights.insert(0, dense_lp_weights)
+ (
+ bounded_weights,
+ bounded_iterations,
+ bounded_feasibility,
+ ) = calibrate_entropy_bounded(
+ X,
+ y_target,
+ baseline_weights,
+ ss_values=ss_values,
+ ss_target=ss_target,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values,
+ oasdi_tob_target=oasdi_tob_target,
+ hi_tob_values=hi_tob_values,
+ hi_tob_target=hi_tob_target,
+ n_ages=n_ages,
+ max_constraint_error_pct=approximate_max_error_pct,
+ max_iters=max_iters * 10,
+ tol=max(tol, 1e-10),
+ warm_weights=warm_weights,
+ )
+ audit["approximate_solution_used"] = True
+ audit["approximation_method"] = "bounded_entropy"
+ audit["approximate_solution_error_pct"] = float(
+ bounded_feasibility["best_case_max_pct_error"]
+ )
+ return bounded_weights, bounded_iterations, audit
+ except RuntimeError as bounded_error:
+ audit["approximate_entropy_error"] = str(bounded_error)
+
+ if dense_lp_weights is not None and dense_lp_info is not None:
+ audit["lp_fallback_used"] = True
+ audit["approximate_solution_used"] = True
+ densified = dense_lp_info.get("densification_effective", False)
+ audit["approximation_method"] = (
+ "lp_blend" if densified else "lp_minimax"
+ )
+ audit["approximate_solution_error_pct"] = float(
+ dense_lp_info["best_case_max_pct_error"]
+ )
+ audit["lp_blend_lambda"] = float(dense_lp_info["blend_lambda"])
+ return dense_lp_weights, iterations, audit
+
+ audit["lp_fallback_used"] = True
+ audit["approximate_solution_used"] = True
+ audit["approximation_method"] = "lp_minimax"
+ audit["approximate_solution_error_pct"] = approximate_error_pct
+ return w_new, iterations, audit
else:
w_new, info = iterative_proportional_fitting(
X, y_target, baseline_weights, max_iters, tol, verbose
)
- return w_new, info["iterations"]
+ return w_new, info["iterations"], audit
+
+
+def build_calibration_audit(
+ *,
+ X,
+ y_target,
+ weights,
+ baseline_weights,
+ calibration_event,
+ ss_values=None,
+ ss_target=None,
+ payroll_values=None,
+ payroll_target=None,
+ h6_income_values=None,
+ h6_revenue_target=None,
+ oasdi_tob_values=None,
+ oasdi_tob_target=None,
+ hi_tob_values=None,
+ hi_tob_target=None,
+):
+ achieved_ages = X.T @ weights
+ age_errors = (
+ np.abs(achieved_ages - y_target) / np.maximum(np.abs(y_target), 1e-10) * 100
+ )
+
+ neg_mask = weights < 0
+ negative_values = np.abs(weights[neg_mask])
+ positive_mask = weights > 0
+ weight_sum = float(np.sum(weights))
+ abs_weight_sum = float(np.sum(np.abs(weights)))
+ if weight_sum > 0:
+ sorted_weights = np.sort(weights)
+ top_10_weight_share_pct = float(sorted_weights[-10:].sum() / weight_sum * 100)
+ top_100_weight_share_pct = float(sorted_weights[-100:].sum() / weight_sum * 100)
+ else:
+ top_10_weight_share_pct = 0.0
+ top_100_weight_share_pct = 0.0
+
+ if weight_sum > 0 and float(np.dot(weights, weights)) > 0:
+ effective_sample_size = float(weight_sum**2 / np.dot(weights, weights))
+ else:
+ effective_sample_size = 0.0
+
+ audit = dict(calibration_event)
+ audit.update(
+ {
+ "age_max_pct_error": float(age_errors.max()),
+ "negative_weight_count": int(neg_mask.sum()),
+ "negative_weight_household_pct": float(100 * neg_mask.sum() / len(weights)),
+ "negative_weight_pct": (
+ float(100 * negative_values.sum() / abs_weight_sum)
+ if abs_weight_sum > 0
+ else 0.0
+ ),
+ "largest_negative_weight": (
+ float(negative_values.max()) if negative_values.size else 0.0
+ ),
+ "positive_weight_count": int(positive_mask.sum()),
+ "positive_weight_pct": float(100 * positive_mask.sum() / len(weights)),
+ "effective_sample_size": effective_sample_size,
+ "top_10_weight_share_pct": top_10_weight_share_pct,
+ "top_100_weight_share_pct": top_100_weight_share_pct,
+ "constraints": {},
+ "baseline_weight_sum": float(np.sum(baseline_weights)),
+ "calibrated_weight_sum": weight_sum,
+ "max_constraint_pct_error": 0.0,
+ }
+ )
+
+ constraint_specs = [
+ ("ss_total", ss_values, ss_target),
+ ("payroll_total", payroll_values, payroll_target),
+ ("h6_revenue", h6_income_values, h6_revenue_target),
+ ("oasdi_tob", oasdi_tob_values, oasdi_tob_target),
+ ("hi_tob", hi_tob_values, hi_tob_target),
+ ]
+
+ for name, values, target in constraint_specs:
+ if values is None or target is None:
+ continue
+ achieved = float(np.sum(values * weights))
+ audit["constraints"][name] = {
+ "target": float(target),
+ "achieved": achieved,
+ "error": achieved - float(target),
+ "pct_error": float(_pct_error(achieved, float(target))),
+ }
+
+ if audit["constraints"]:
+ audit["max_constraint_pct_error"] = float(
+ max(abs(stats["pct_error"]) for stats in audit["constraints"].values())
+ )
+
+ return audit
diff --git a/policyengine_us_data/datasets/cps/long_term/calibration_artifacts.py b/policyengine_us_data/datasets/cps/long_term/calibration_artifacts.py
new file mode 100644
index 000000000..3dcb86f2e
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/calibration_artifacts.py
@@ -0,0 +1,296 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+import json
+from pathlib import Path
+from typing import Any
+
+try:
+ from .calibration_profiles import (
+ classify_calibration_quality,
+ get_profile,
+ validate_calibration_audit,
+ )
+except ImportError: # pragma: no cover - script execution fallback
+ from calibration_profiles import (
+ classify_calibration_quality,
+ get_profile,
+ validate_calibration_audit,
+ )
+
+
+CONTRACT_VERSION = 1
+MANIFEST_FILENAME = "calibration_manifest.json"
+SUPPORT_AUGMENTATION_REPORT_FILENAME = "support_augmentation_report.json"
+
+
+def metadata_path_for(h5_path: str | Path) -> Path:
+ return Path(f"{Path(h5_path)}.metadata.json")
+
+
+def normalize_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+ normalized = json.loads(json.dumps(metadata))
+ normalized.setdefault("contract_version", CONTRACT_VERSION)
+
+ profile_data = normalized.get("profile", {})
+ audit = normalized.setdefault("calibration_audit", {})
+ constraints = audit.get("constraints", {})
+
+ if "max_constraint_pct_error" not in audit:
+ audit["max_constraint_pct_error"] = float(
+ max(
+ (abs(stats.get("pct_error", 0.0)) for stats in constraints.values()),
+ default=0.0,
+ )
+ )
+
+ if audit.get("lp_fallback_used"):
+ realized_error = float(audit.get("max_constraint_pct_error", 0.0))
+ stored_error = audit.get("approximate_solution_error_pct")
+ if stored_error is None or float(stored_error) < realized_error:
+ audit["approximate_solution_error_pct"] = realized_error
+
+ if "calibration_quality" not in audit and profile_data.get("name"):
+ try:
+ profile = get_profile(profile_data["name"])
+ except ValueError:
+ profile = None
+ if profile is not None:
+ canonical_profile = profile.to_dict()
+ merged_profile = json.loads(json.dumps(canonical_profile))
+ merged_profile.update(profile_data)
+ normalized["profile"] = merged_profile
+ audit["calibration_quality"] = classify_calibration_quality(
+ audit,
+ profile,
+ year=normalized.get("year"),
+ )
+
+ if audit.get("lp_fallback_used"):
+ quality = audit.get("calibration_quality")
+ if quality == "exact":
+ audit["approximation_method"] = "lp_minimax_exact"
+ audit["approximate_solution_used"] = False
+ elif quality == "approximate":
+ audit["approximation_method"] = "lp_minimax"
+ audit["approximate_solution_used"] = True
+
+ if "validation_passed" not in audit and profile_data.get("name"):
+ try:
+ profile = get_profile(profile_data["name"])
+ except ValueError:
+ profile = None
+ if profile is not None:
+ issues = validate_calibration_audit(
+ audit,
+ profile,
+ year=normalized.get("year"),
+ )
+ audit["validation_passed"] = not bool(issues)
+ audit.setdefault("validation_issues", issues)
+
+ return normalized
+
+
+def write_year_metadata(
+ h5_path: str | Path,
+ *,
+ year: int,
+ base_dataset_path: str,
+ profile: dict[str, Any],
+ calibration_audit: dict[str, Any],
+ target_source: dict[str, Any] | None = None,
+ tax_assumption: dict[str, Any] | None = None,
+ support_augmentation: dict[str, Any] | None = None,
+) -> Path:
+ metadata = {
+ "contract_version": CONTRACT_VERSION,
+ "year": year,
+ "base_dataset_path": base_dataset_path,
+ "profile": profile,
+ "calibration_audit": calibration_audit,
+ }
+ if target_source is not None:
+ metadata["target_source"] = target_source
+ if tax_assumption is not None:
+ metadata["tax_assumption"] = tax_assumption
+ if support_augmentation is not None:
+ metadata["support_augmentation"] = support_augmentation
+ metadata = normalize_metadata(metadata)
+ metadata_path = metadata_path_for(h5_path)
+ metadata_path.write_text(
+ json.dumps(metadata, indent=2, sort_keys=True) + "\n",
+ encoding="utf-8",
+ )
+ return metadata_path
+
+
+def write_support_augmentation_report(
+ output_dir: str | Path,
+ report: dict[str, Any],
+ *,
+ filename: str = SUPPORT_AUGMENTATION_REPORT_FILENAME,
+) -> Path:
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ report_path = output_dir / filename
+ report_path.write_text(
+ json.dumps(json.loads(json.dumps(report)), indent=2, sort_keys=True) + "\n",
+ encoding="utf-8",
+ )
+ return report_path
+
+
+def update_dataset_manifest(
+ output_dir: str | Path,
+ *,
+ year: int,
+ h5_path: str | Path,
+ metadata_path: str | Path,
+ base_dataset_path: str,
+ profile: dict[str, Any],
+ calibration_audit: dict[str, Any],
+ target_source: dict[str, Any] | None = None,
+ tax_assumption: dict[str, Any] | None = None,
+ support_augmentation: dict[str, Any] | None = None,
+) -> Path:
+ output_dir = Path(output_dir)
+ manifest_path = output_dir / MANIFEST_FILENAME
+ profile = json.loads(json.dumps(profile))
+ target_source = json.loads(json.dumps(target_source))
+ tax_assumption = json.loads(json.dumps(tax_assumption))
+ support_augmentation = json.loads(json.dumps(support_augmentation))
+
+ if manifest_path.exists():
+ manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+ else:
+ manifest = {
+ "contract_version": CONTRACT_VERSION,
+ "generated_at": None,
+ "base_dataset_path": base_dataset_path,
+ "profile": profile,
+ "target_source": target_source,
+ "tax_assumption": tax_assumption,
+ "support_augmentation": support_augmentation,
+ "years": [],
+ "datasets": {},
+ }
+
+ if manifest["base_dataset_path"] != base_dataset_path:
+ raise ValueError(
+ "Output directory already contains a different base dataset path: "
+ f"{manifest['base_dataset_path']} != {base_dataset_path}"
+ )
+ manifest_profile = json.loads(json.dumps(manifest["profile"]))
+ if manifest_profile != profile:
+ if manifest_profile.get("name") == profile.get("name") and manifest_profile.get(
+ "calibration_method"
+ ) == profile.get("calibration_method"):
+ manifest["profile"] = profile
+ else:
+ raise ValueError(
+ "Output directory already contains a different calibration profile: "
+ f"{manifest['profile'].get('name')} != {profile.get('name')}"
+ )
+ if manifest.get("target_source") is None and target_source is not None:
+ manifest["target_source"] = target_source
+ elif manifest.get("target_source") != target_source:
+ raise ValueError(
+ "Output directory already contains a different target source: "
+ f"{manifest.get('target_source')} != {target_source}"
+ )
+ if manifest.get("tax_assumption") is None and tax_assumption is not None:
+ manifest["tax_assumption"] = tax_assumption
+ elif manifest.get("tax_assumption") != tax_assumption:
+ raise ValueError(
+ "Output directory already contains a different tax assumption: "
+ f"{manifest.get('tax_assumption')} != {tax_assumption}"
+ )
+ if (
+ manifest.get("support_augmentation") is None
+ and support_augmentation is not None
+ ):
+ manifest["support_augmentation"] = support_augmentation
+ elif manifest.get("support_augmentation") != support_augmentation:
+ raise ValueError(
+ "Output directory already contains a different support augmentation: "
+ f"{manifest.get('support_augmentation')} != {support_augmentation}"
+ )
+
+ datasets = manifest.setdefault("datasets", {})
+ datasets[str(year)] = {
+ "h5": Path(h5_path).name,
+ "metadata": Path(metadata_path).name,
+ "calibration_quality": calibration_audit.get("calibration_quality"),
+ "method_used": calibration_audit.get("method_used"),
+ "fell_back_to_ipf": calibration_audit.get("fell_back_to_ipf"),
+ "age_max_pct_error": calibration_audit.get("age_max_pct_error"),
+ "max_constraint_pct_error": calibration_audit.get("max_constraint_pct_error"),
+ "negative_weight_pct": calibration_audit.get("negative_weight_pct"),
+ "negative_weight_household_pct": calibration_audit.get(
+ "negative_weight_household_pct"
+ ),
+ "validation_passed": calibration_audit.get("validation_passed"),
+ "validation_issue_count": len(calibration_audit.get("validation_issues", [])),
+ }
+
+ year_set = {int(value) for value in manifest.get("years", [])}
+ year_set.add(year)
+ manifest["years"] = sorted(year_set)
+ manifest["year_range"] = {
+ "start": min(year_set),
+ "end": max(year_set),
+ }
+ manifest["generated_at"] = datetime.now(timezone.utc).isoformat()
+ manifest["contains_invalid_artifacts"] = any(
+ entry.get("validation_passed") is False for entry in datasets.values()
+ )
+
+ manifest_path.write_text(
+ json.dumps(manifest, indent=2, sort_keys=True) + "\n",
+ encoding="utf-8",
+ )
+ return manifest_path
+
+
+def rebuild_dataset_manifest(output_dir: str | Path) -> Path:
+ return rebuild_dataset_manifest_with_target_source(output_dir)
+
+
+def rebuild_dataset_manifest_with_target_source(
+ output_dir: str | Path,
+ *,
+ target_source: dict[str, Any] | None = None,
+) -> Path:
+ output_dir = Path(output_dir)
+ metadata_files = sorted(output_dir.glob("*.h5.metadata.json"))
+ if not metadata_files:
+ raise FileNotFoundError(f"No metadata sidecars found in {output_dir}")
+
+ manifest_path: Path | None = None
+ for metadata_file in metadata_files:
+ metadata = json.loads(metadata_file.read_text(encoding="utf-8"))
+ metadata = normalize_metadata(metadata)
+ if target_source is not None:
+ metadata["target_source"] = target_source
+ metadata_file.write_text(
+ json.dumps(metadata, indent=2, sort_keys=True) + "\n",
+ encoding="utf-8",
+ )
+ year = int(metadata["year"])
+ h5_path = output_dir / f"{year}.h5"
+ manifest_path = update_dataset_manifest(
+ output_dir,
+ year=year,
+ h5_path=h5_path,
+ metadata_path=metadata_file,
+ base_dataset_path=metadata["base_dataset_path"],
+ profile=metadata["profile"],
+ calibration_audit=metadata["calibration_audit"],
+ target_source=metadata.get("target_source"),
+ tax_assumption=metadata.get("tax_assumption"),
+ support_augmentation=metadata.get("support_augmentation"),
+ )
+
+ assert manifest_path is not None
+ return manifest_path
diff --git a/policyengine_us_data/datasets/cps/long_term/calibration_profiles.py b/policyengine_us_data/datasets/cps/long_term/calibration_profiles.py
new file mode 100644
index 000000000..15c91f8a6
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/calibration_profiles.py
@@ -0,0 +1,532 @@
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+
@dataclass(frozen=True)
class ApproximateCalibrationWindow:
    """Year range in which looser 'approximate' calibration thresholds apply.

    ``end_year=None`` makes the window open-ended; optional fields left as
    ``None`` disable the corresponding check downstream.
    """

    start_year: int
    end_year: int | None
    max_constraint_error_pct: float
    max_age_error_pct: float
    max_negative_weight_pct: float | None = 0.0
    age_bucket_size: int | None = None
    min_positive_household_count: int | None = None
    min_effective_sample_size: float | None = None
    max_top_10_weight_share_pct: float | None = None
    max_top_100_weight_share_pct: float | None = None

    def applies(self, year: int) -> bool:
        """Whether *year* lies in [start_year, end_year] (open-ended end)."""
        started = year >= self.start_year
        not_ended = self.end_year is None or year <= self.end_year
        return started and not_ended
+
+
@dataclass(frozen=True)
class CalibrationProfile:
    """Named configuration bundle for one long-run calibration run.

    Carries the solver choice (``calibration_method``), the feature flags
    selecting which constraints are enforced, the strict ("exact")
    thresholds, and the optional per-year approximate windows.
    """

    name: str
    description: str
    calibration_method: str
    use_greg: bool
    use_ss: bool
    use_payroll: bool
    use_h6_reform: bool
    use_tob: bool
    benchmark_tob: bool = False
    allow_greg_fallback: bool = False
    max_constraint_error_pct: float = 0.1
    max_age_error_pct: float = 0.1
    max_negative_weight_pct: float | None = None
    min_positive_household_count: int | None = None
    min_effective_sample_size: float | None = None
    max_top_10_weight_share_pct: float | None = None
    max_top_100_weight_share_pct: float | None = None
    approximate_windows: tuple[ApproximateCalibrationWindow, ...] = field(
        default_factory=tuple
    )

    def to_dict(self) -> dict[str, Any]:
        """Serialize the profile (nested windows included) to plain containers."""
        return asdict(self)
+
+
# Progressively looser "approximate" tolerance windows for the deep
# out-years. Every window shares the same 5-year age bucketing and
# weight-distribution guards; only the permitted constraint/age error
# widens over time: 0.5% through 2078, 10% to 2085, 20% to 2095, and 35%
# from 2096 onward (open-ended). Years before 2075 have no window, so
# they must satisfy a profile's exact thresholds.
DEFAULT_LONG_RUN_APPROXIMATE_WINDOWS = (
    ApproximateCalibrationWindow(
        start_year=2075,
        end_year=2078,
        max_constraint_error_pct=0.5,
        max_age_error_pct=0.5,
        max_negative_weight_pct=0.0,
        age_bucket_size=5,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
    ),
    ApproximateCalibrationWindow(
        start_year=2079,
        end_year=2085,
        max_constraint_error_pct=10.0,
        max_age_error_pct=10.0,
        max_negative_weight_pct=0.0,
        age_bucket_size=5,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
    ),
    ApproximateCalibrationWindow(
        start_year=2086,
        end_year=2095,
        max_constraint_error_pct=20.0,
        max_age_error_pct=20.0,
        max_negative_weight_pct=0.0,
        age_bucket_size=5,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
    ),
    # Terminal window: open-ended (end_year=None) from 2096 on.
    ApproximateCalibrationWindow(
        start_year=2096,
        end_year=None,
        max_constraint_error_pct=35.0,
        max_age_error_pct=35.0,
        max_negative_weight_pct=0.0,
        age_bucket_size=5,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
    ),
)
+
+
# Registry of supported calibration profiles, keyed by their CLI name.
# Entries are ordered from fewest to most constraints; all entropy-based
# profiles share the same weight-distribution guards and the default
# long-run approximate windows.
NAMED_PROFILES: dict[str, CalibrationProfile] = {
    # Age targets only, solved with IPF; no monetary constraints and no
    # approximate windows (must always hit the exact thresholds).
    "age-only": CalibrationProfile(
        name="age-only",
        description="Age-only calibration using IPF.",
        calibration_method="ipf",
        use_greg=False,
        use_ss=False,
        use_payroll=False,
        use_h6_reform=False,
        use_tob=False,
        benchmark_tob=False,
        allow_greg_fallback=False,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
    ),
    # Adds the aggregate Social Security benefit constraint.
    "ss": CalibrationProfile(
        name="ss",
        description="Age plus Social Security benefits using positive entropy calibration.",
        calibration_method="entropy",
        use_greg=False,
        use_ss=True,
        use_payroll=False,
        use_h6_reform=False,
        use_tob=False,
        benchmark_tob=False,
        max_negative_weight_pct=0.0,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
        approximate_windows=DEFAULT_LONG_RUN_APPROXIMATE_WINDOWS,
    ),
    # Adds taxable payroll on top of Social Security.
    "ss-payroll": CalibrationProfile(
        name="ss-payroll",
        description="Age, Social Security, and taxable payroll using positive entropy calibration.",
        calibration_method="entropy",
        use_greg=False,
        use_ss=True,
        use_payroll=True,
        use_h6_reform=False,
        use_tob=False,
        benchmark_tob=False,
        max_negative_weight_pct=0.0,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
        approximate_windows=DEFAULT_LONG_RUN_APPROXIMATE_WINDOWS,
    ),
    # Adds taxation-of-benefits (TOB) revenue constraints.
    "ss-payroll-tob": CalibrationProfile(
        name="ss-payroll-tob",
        description="Age, Social Security, taxable payroll, and TOB using positive entropy calibration under the long-run core-threshold tax assumption.",
        calibration_method="entropy",
        use_greg=False,
        use_ss=True,
        use_payroll=True,
        use_h6_reform=False,
        use_tob=True,
        benchmark_tob=False,
        max_negative_weight_pct=0.0,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
        approximate_windows=DEFAULT_LONG_RUN_APPROXIMATE_WINDOWS,
    ),
    # Adds the H6 reform flag on top of TOB.
    "ss-payroll-tob-h6": CalibrationProfile(
        name="ss-payroll-tob-h6",
        description="Age, Social Security, taxable payroll, TOB, and H6 using positive entropy calibration under the long-run core-threshold tax assumption.",
        calibration_method="entropy",
        use_greg=False,
        use_ss=True,
        use_payroll=True,
        use_h6_reform=True,
        use_tob=True,
        benchmark_tob=False,
        max_negative_weight_pct=0.0,
        min_positive_household_count=1000,
        min_effective_sample_size=75.0,
        max_top_10_weight_share_pct=25.0,
        max_top_100_weight_share_pct=95.0,
        approximate_windows=DEFAULT_LONG_RUN_APPROXIMATE_WINDOWS,
    ),
}
+
# Ordering of calibration-quality tiers, weakest first: "aggregate" misses
# even the approximate thresholds, "approximate" satisfies a year window,
# "exact" satisfies the profile's strict thresholds.
QUALITY_RANK = {
    "aggregate": 0,
    "approximate": 1,
    "exact": 2,
}
+
+
def get_profile(name: str) -> CalibrationProfile:
    """Look up a named calibration profile.

    Raises:
        ValueError: when *name* is not in NAMED_PROFILES; the message lists
            the valid names and chains the underlying KeyError.
    """
    try:
        profile = NAMED_PROFILES[name]
    except KeyError as missing:
        known = ", ".join(sorted(NAMED_PROFILES))
        raise ValueError(
            f"Unknown calibration profile '{name}'. Valid profiles: {known}"
        ) from missing
    return profile
+
+
def approximate_window_for_year(
    profile: CalibrationProfile,
    year: int | None,
) -> ApproximateCalibrationWindow | None:
    """Select the approximate-tier window for *year*, or None.

    With a concrete year, the first declared window covering it wins; with
    ``year=None`` the loosest window (latest end, then largest tolerances)
    is assumed. Returns None when the profile has no windows or none apply.
    """
    windows = profile.approximate_windows
    if not windows:
        return None

    if year is not None:
        return next((w for w in windows if w.applies(year)), None)

    def loosest_key(window):
        end = float("inf") if window.end_year is None else window.end_year
        return (end, window.max_constraint_error_pct, window.max_age_error_pct)

    return max(windows, key=loosest_key)
+
+
def build_profile_from_flags(
    *,
    use_greg: bool,
    use_ss: bool,
    use_payroll: bool,
    use_h6_reform: bool,
    use_tob: bool,
) -> CalibrationProfile:
    """Map legacy boolean CLI flags onto a calibration profile.

    Prefers a matching entry from NAMED_PROFILES; otherwise synthesizes an
    ad-hoc "custom-*" profile, which carries no approximate windows or
    weight-distribution guards.
    """
    # TOB implies GREG for flag-derived profiles.
    # NOTE(review): because of this forcing, the named entropy-based TOB
    # profiles (use_greg=False, use_tob=True) can never be matched by the
    # loops below — any TOB flag combination falls through to a custom
    # "greg" profile. Confirm this is the intended legacy behavior.
    if use_tob and not use_greg:
        use_greg = True

    if not use_greg:
        # Non-GREG request: match named profiles on feature flags alone.
        for profile in NAMED_PROFILES.values():
            if (
                profile.use_greg is False
                and profile.use_ss == use_ss
                and profile.use_payroll == use_payroll
                and profile.use_h6_reform == use_h6_reform
                and profile.use_tob == use_tob
            ):
                return profile

    # Second pass (GREG, or no flag-only match): also require the matching
    # solver method ("greg" vs "ipf").
    for profile in NAMED_PROFILES.values():
        if (
            profile.calibration_method == ("greg" if use_greg else "ipf")
            and profile.use_greg == use_greg
            and profile.use_ss == use_ss
            and profile.use_payroll == use_payroll
            and profile.use_h6_reform == use_h6_reform
            and profile.use_tob == use_tob
        ):
            return profile

    # No named match: build a descriptive "custom-<flags>" fallback name.
    flag_names = []
    if use_greg:
        flag_names.append("greg")
    if use_ss:
        flag_names.append("ss")
    if use_payroll:
        flag_names.append("payroll")
    if use_h6_reform:
        flag_names.append("h6")
    if use_tob:
        flag_names.append("tob")

    suffix = "-".join(flag_names) if flag_names else "age-only"
    return CalibrationProfile(
        name=f"custom-{suffix}",
        description="Legacy flag-derived calibration profile.",
        calibration_method="greg" if use_greg else "ipf",
        use_greg=use_greg,
        use_ss=use_ss,
        use_payroll=use_payroll,
        use_h6_reform=use_h6_reform,
        use_tob=use_tob,
        benchmark_tob=False,
    )
+
+
def validate_calibration_audit(
    audit: dict[str, Any],
    profile: CalibrationProfile,
    *,
    year: int | None = None,
    quality: str | None = None,
) -> list[str]:
    """Return the threshold violations of *audit* for its quality tier.

    The tier comes from *quality*, the audit's own ``calibration_quality``,
    or ``classify_calibration_quality``. "exact" is judged against the
    profile thresholds (concentration guards deferring to the year window
    when one exists); "approximate" against the window; anything else is
    treated as aggregate and always flagged.
    """

    def limits(source) -> dict[str, Any]:
        # The seven threshold kwargs shared by profiles and windows.
        return {
            "max_constraint_error_pct": source.max_constraint_error_pct,
            "max_age_error_pct": source.max_age_error_pct,
            "max_negative_weight_pct": source.max_negative_weight_pct,
            "min_positive_household_count": source.min_positive_household_count,
            "min_effective_sample_size": source.min_effective_sample_size,
            "max_top_10_weight_share_pct": source.max_top_10_weight_share_pct,
            "max_top_100_weight_share_pct": source.max_top_100_weight_share_pct,
        }

    if quality is None:
        quality = audit.get("calibration_quality") or classify_calibration_quality(
            audit,
            profile,
            year=year,
        )

    window = approximate_window_for_year(profile, year)

    if quality == "exact":
        thresholds = limits(profile)
        if window is not None:
            # Weight-concentration guards follow the year window when present.
            window_limits = limits(window)
            for key in (
                "min_positive_household_count",
                "min_effective_sample_size",
                "max_top_10_weight_share_pct",
                "max_top_100_weight_share_pct",
            ):
                thresholds[key] = window_limits[key]
        return _collect_threshold_issues(audit, profile, **thresholds)

    if quality == "approximate":
        if window is None:
            issues = _collect_threshold_issues(audit, profile, **limits(profile))
            issues.append(
                "Approximate calibration is not permitted for this profile/year"
            )
            return issues
        return _collect_threshold_issues(audit, profile, **limits(window))

    # Aggregate (or unrecognized) quality: report against the loosest
    # applicable tier and always flag the downgrade.
    tier = limits(profile) if window is None else limits(window)
    issues = _collect_threshold_issues(audit, profile, **tier)
    issues.append("Calibration quality aggregate exceeds approximate thresholds")
    return issues
+
+
def _collect_threshold_issues(
    audit: dict[str, Any],
    profile: CalibrationProfile,
    *,
    max_constraint_error_pct: float | None,
    max_age_error_pct: float | None,
    max_negative_weight_pct: float | None,
    min_positive_household_count: int | None,
    min_effective_sample_size: float | None,
    max_top_10_weight_share_pct: float | None,
    max_top_100_weight_share_pct: float | None,
) -> list[str]:
    """List each audit metric that violates the supplied thresholds.

    A ``None`` threshold (or a metric absent from *audit*) disables that
    check. Messages are emitted in a fixed order: GREG fallback, age error,
    per-constraint errors, negative-weight share, support counts, then
    weight concentration.
    """
    issues: list[str] = []

    def over(value, limit):
        return limit is not None and value is not None and value > limit

    def under(value, limit):
        return limit is not None and value is not None and value < limit

    if audit.get("fell_back_to_ipf") and profile.calibration_method == "greg":
        issues.append("GREG calibration fell back to IPF")

    age_error = audit.get("age_max_pct_error")
    if over(age_error, max_age_error_pct):
        issues.append(
            f"Age max error {age_error:.3f}% exceeds {max_age_error_pct:.3f}%"
        )

    for constraint_name, stats in audit.get("constraints", {}).items():
        pct_error = stats.get("pct_error")
        if pct_error is None or max_constraint_error_pct is None:
            continue
        if abs(pct_error) > max_constraint_error_pct:
            issues.append(
                f"{constraint_name} error {pct_error:.3f}% exceeds "
                f"{max_constraint_error_pct:.3f}%"
            )

    negative_pct = audit.get("negative_weight_pct")
    if over(negative_pct, max_negative_weight_pct):
        issues.append(
            f"Negative weight share {negative_pct:.3f}% exceeds "
            f"{max_negative_weight_pct:.3f}%"
        )

    positive_count = audit.get("positive_weight_count")
    if under(positive_count, min_positive_household_count):
        issues.append(
            f"Positive household count {positive_count} is below "
            f"{min_positive_household_count}"
        )

    ess = audit.get("effective_sample_size")
    if under(ess, min_effective_sample_size):
        issues.append(
            f"Effective sample size {ess:.3f} is below {min_effective_sample_size:.3f}"
        )

    top_10_share = audit.get("top_10_weight_share_pct")
    if over(top_10_share, max_top_10_weight_share_pct):
        issues.append(
            f"Top-10 weight share {top_10_share:.3f}% exceeds "
            f"{max_top_10_weight_share_pct:.3f}%"
        )

    top_100_share = audit.get("top_100_weight_share_pct")
    if over(top_100_share, max_top_100_weight_share_pct):
        issues.append(
            f"Top-100 weight share {top_100_share:.3f}% exceeds "
            f"{max_top_100_weight_share_pct:.3f}%"
        )

    return issues
+
+
def classify_calibration_quality(
    audit: dict[str, Any],
    profile: CalibrationProfile,
    *,
    year: int | None = None,
) -> str:
    """Grade an audit as "exact", "approximate", or "aggregate".

    "exact" means the profile's strict thresholds all pass; otherwise the
    year's approximate window (when one exists) is tried; anything worse is
    "aggregate". Without a window, a known year degrades straight to
    "aggregate" while an unknown year is assumed "approximate".
    """

    def violations(source) -> list[str]:
        return _collect_threshold_issues(
            audit,
            profile,
            max_constraint_error_pct=source.max_constraint_error_pct,
            max_age_error_pct=source.max_age_error_pct,
            max_negative_weight_pct=source.max_negative_weight_pct,
            min_positive_household_count=source.min_positive_household_count,
            min_effective_sample_size=source.min_effective_sample_size,
            max_top_10_weight_share_pct=source.max_top_10_weight_share_pct,
            max_top_100_weight_share_pct=source.max_top_100_weight_share_pct,
        )

    if not violations(profile):
        return "exact"

    window = approximate_window_for_year(profile, year)
    if window is None:
        return "aggregate" if year is not None else "approximate"

    return "approximate" if not violations(window) else "aggregate"
diff --git a/policyengine_us_data/datasets/cps/long_term/compare_tob_shares.py b/policyengine_us_data/datasets/cps/long_term/compare_tob_shares.py
new file mode 100644
index 000000000..91b0e13a5
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/compare_tob_shares.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+
def _resolve_metadata_paths(inputs: list[str]) -> list[Path]:
    """Expand files and directories into a flat list of metadata paths.

    Directories contribute their sorted ``*.metadata.json`` children; file
    paths are taken as-is.

    Raises:
        FileNotFoundError: when an input path does not exist.
        ValueError: when nothing resolves to a metadata file.
    """
    resolved: list[Path] = []
    for raw in inputs:
        candidate = Path(raw).expanduser()
        if candidate.is_dir():
            resolved.extend(sorted(candidate.glob("*.metadata.json")))
        elif candidate.is_file():
            resolved.append(candidate)
        else:
            raise FileNotFoundError(f"Metadata path not found: {candidate}")
    if not resolved:
        raise ValueError("No metadata files found.")
    return resolved
+
+
def _load_record(path: Path) -> dict:
    """Summarize OASDI/HI TOB shares of Social Security for one sidecar.

    Shares are expressed as a percentage of the ``ss_total`` constraint;
    gaps are achieved-minus-target, in percentage points.
    """
    metadata = json.loads(path.read_text(encoding="utf-8"))
    audit = metadata["calibration_audit"]
    constraints = audit["constraints"]
    # TOB entries may live in a dedicated benchmarks section or directly
    # alongside the calibration constraints.
    tob_section = audit.get("benchmarks") or audit.get("constraints")

    ss_actual = float(constraints["ss_total"]["achieved"])
    ss_target = float(constraints["ss_total"]["target"])
    oasdi_actual = float(tob_section["oasdi_tob"]["achieved"])
    oasdi_target = float(tob_section["oasdi_tob"]["target"])
    hi_actual = float(tob_section["hi_tob"]["achieved"])
    hi_target = float(tob_section["hi_tob"]["target"])

    oasdi_actual_share = 100 * oasdi_actual / ss_actual
    oasdi_target_share = 100 * oasdi_target / ss_target
    combined_actual_share = 100 * (oasdi_actual + hi_actual) / ss_actual
    combined_target_share = 100 * (oasdi_target + hi_target) / ss_target

    return {
        "year": int(metadata["year"]),
        "source_path": str(path),
        "oasdi_actual_share_pct": oasdi_actual_share,
        "oasdi_target_share_pct": oasdi_target_share,
        "oasdi_gap_pct_pt": oasdi_actual_share - oasdi_target_share,
        "combined_actual_share_pct": combined_actual_share,
        "combined_target_share_pct": combined_target_share,
        "combined_gap_pct_pt": combined_actual_share - combined_target_share,
    }
+
+
def _format_markdown(records: list[dict]) -> str:
    """Render TOB share records as a markdown table, one row per record."""
    table_lines = [
        "| Year | OASDI actual | OASDI target | OASDI gap | Combined actual | "
        "Combined target | Combined gap |\n"
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |"
    ]
    for record in records:
        cells = (
            str(record["year"]),
            f"{record['oasdi_actual_share_pct']:.2f}%",
            f"{record['oasdi_target_share_pct']:.2f}%",
            f"{record['oasdi_gap_pct_pt']:+.2f} pp",
            f"{record['combined_actual_share_pct']:.2f}%",
            f"{record['combined_target_share_pct']:.2f}%",
            f"{record['combined_gap_pct_pt']:+.2f} pp",
        )
        table_lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(table_lines)
+
+
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments for the TOB-share comparison tool."""
    description = (
        "Summarize OASDI-only and combined TOB shares from long-run "
        "metadata sidecars."
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "paths",
        nargs="+",
        help="Metadata files or directories containing *.metadata.json sidecars.",
    )
    parser.add_argument(
        "--format",
        default="markdown",
        choices=("markdown", "json"),
        help="Output format.",
    )
    return parser.parse_args()
+
+
def main() -> int:
    """Entry point: load records, order by year, print in the chosen format."""
    args = parse_args()
    records = [_load_record(p) for p in _resolve_metadata_paths(args.paths)]
    records.sort(key=lambda record: record["year"])
    if args.format == "markdown":
        print(_format_markdown(records))
    else:
        print(json.dumps(records, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/diagnose_support_augmentation_translation.py b/policyengine_us_data/datasets/cps/long_term/diagnose_support_augmentation_translation.py
new file mode 100644
index 000000000..3230380f3
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/diagnose_support_augmentation_translation.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+try:
+ from .prototype_synthetic_2100_support import summarize_realized_clone_translation
+except ImportError: # pragma: no cover - script execution fallback
+ from prototype_synthetic_2100_support import summarize_realized_clone_translation
+
+
def _default_metadata_path(h5_path: Path) -> Path:
    """Conventional sidecar path: ``name.h5`` -> ``name.h5.metadata.json``."""
    sidecar_suffix = h5_path.suffix + ".metadata.json"
    return h5_path.with_suffix(sidecar_suffix)
+
+
def main() -> int:
    """Compare support-augmentation clone targets to the realized output H5.

    Loads the year metadata sidecar and the support-augmentation report,
    summarizes how closely the realized H5 matches the clone targets via
    ``summarize_realized_clone_translation``, and prints (optionally writes)
    the JSON diagnostic payload.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Compare support-augmentation clone targets to the realized output H5."
        )
    )
    parser.add_argument("h5_path", type=Path, help="Path to the realized year H5 file.")
    parser.add_argument(
        "--metadata",
        type=Path,
        default=None,
        # Fixed: this help string was garbled (literal broken mid-line); the
        # default really is "<h5_path>.metadata.json" per _default_metadata_path.
        help=(
            "Path to the year metadata sidecar. Defaults to "
            "<h5_path>.metadata.json."
        ),
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=None,
        help=(
            "Optional path to support_augmentation_report.json. Defaults to the "
            "report_file named in metadata, or "
            "<metadata dir>/support_augmentation_report.json."
        ),
    )
    parser.add_argument(
        "--year", type=int, required=True, help="Output year to inspect."
    )
    parser.add_argument(
        "--age-bucket-size",
        type=int,
        default=5,
        help="Age bucket size for the translation comparison summary.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Optional JSON path to write the diagnostic summary.",
    )
    args = parser.parse_args()

    metadata_path = args.metadata or _default_metadata_path(args.h5_path)
    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    augmentation_metadata = metadata.get("support_augmentation")
    if not augmentation_metadata:
        raise ValueError(f"No support_augmentation metadata found in {metadata_path}")

    # Resolve the augmentation report: explicit flag first, then the file
    # named in metadata, then the conventional sibling filename.
    report_path = args.report
    if report_path is None:
        report_file = augmentation_metadata.get("report_file")
        if report_file:
            report_path = metadata_path.parent / report_file
        else:
            report_path = metadata_path.parent / "support_augmentation_report.json"

    augmentation_report = json.loads(report_path.read_text(encoding="utf-8"))
    summary = summarize_realized_clone_translation(
        str(args.h5_path),
        period=args.year,
        augmentation_report=augmentation_report,
        age_bucket_size=args.age_bucket_size,
    )
    payload = {
        "h5_path": str(args.h5_path),
        "metadata_path": str(metadata_path),
        "report_path": str(report_path),
        "year": int(args.year),
        "age_bucket_size": int(args.age_bucket_size),
        "summary": summary,
    }
    rendered = json.dumps(payload, indent=2, sort_keys=True) + "\n"
    if args.output is not None:
        args.output.write_text(rendered, encoding="utf-8")
    print(rendered, end="")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/evaluate_support_augmentation.py b/policyengine_us_data/datasets/cps/long_term/evaluate_support_augmentation.py
new file mode 100644
index 000000000..a90c23382
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/evaluate_support_augmentation.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import numpy as np
+from policyengine_us import Microsimulation
+
+from calibration import (
+ _build_constraint_dataframe_and_controls,
+ assess_nonnegative_feasibility,
+)
+from calibration_profiles import approximate_window_for_year, get_profile
+from projection_utils import (
+ aggregate_age_targets,
+ aggregate_household_age_matrix,
+ build_age_bins,
+ build_household_age_matrix,
+)
+from ssa_data import (
+ get_long_term_target_source,
+ load_hi_tob_projections,
+ load_oasdi_tob_projections,
+ load_ssa_age_projections,
+ load_ssa_benefit_projections,
+ load_taxable_payroll_projections,
+ set_long_term_target_source,
+)
+from support_augmentation import build_augmented_dataset
+
+
# Default base dataset: the published enhanced CPS 2024 release on
# HuggingFace; overridable via --dataset.
DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"
# Calendar year of the base microdata; passed as base_year to
# build_augmented_dataset in main().
BASE_YEAR = 2024
+
+
def _evaluate_dataset(
    *,
    dataset: str | object,
    dataset_label: str,
    year: int,
    profile_name: str,
) -> dict[str, object]:
    """Assess nonnegative-weight feasibility of one dataset for one year.

    Builds the same constraint system the calibration would use (age
    targets, plus the profile's optional SS / payroll / TOB totals) and asks
    ``assess_nonnegative_feasibility`` how closely any nonnegative weight
    vector could hit those targets.

    Args:
        dataset: Dataset path/reference or in-memory dataset accepted by
            ``Microsimulation``.
        dataset_label: Label recorded in the returned summary.
        year: Projection year whose targets are evaluated.
        profile_name: Named calibration profile (see calibration_profiles).

    Returns:
        Summary dict with the feasibility result plus context (household
        count, age bucket size, constraint count, target source).
    """
    profile = get_profile(profile_name)
    sim = Microsimulation(dataset=dataset)
    # Single-year slice of the SSA age projections; rows index single ages,
    # the only column is this year.
    target_matrix = load_ssa_age_projections(start_year=year, end_year=year)
    n_ages = target_matrix.shape[0]
    X, _, _ = build_household_age_matrix(sim, n_ages=n_ages)

    # Late years may calibrate against coarser age buckets; mirror that
    # bucketing here so feasibility matches the real problem.
    approximate_window = approximate_window_for_year(profile, year)
    age_bucket_size = (
        approximate_window.age_bucket_size if approximate_window is not None else None
    )
    if age_bucket_size and age_bucket_size > 1:
        age_bins = build_age_bins(n_ages=n_ages, bucket_size=age_bucket_size)
        X_current = aggregate_household_age_matrix(X, age_bins)
        y_target = aggregate_age_targets(target_matrix, age_bins)[:, 0]
    else:
        X_current = X
        y_target = target_matrix[:, 0]
        age_bucket_size = 1

    household_series = sim.calculate("household_id", period=year, map_to="household")
    baseline_weights = household_series.weights.values

    # Optional constraint blocks, enabled per profile flag; values are
    # household-level aggregates, targets are scalar projections.
    ss_values = None
    ss_target = None
    if profile.use_ss:
        ss_values = sim.calculate(
            "social_security", period=year, map_to="household"
        ).values
        ss_target = load_ssa_benefit_projections(year)

    payroll_values = None
    payroll_target = None
    if profile.use_payroll:
        # Payroll = taxable SS wage earnings plus taxable self-employment.
        payroll_values = (
            sim.calculate(
                "taxable_earnings_for_social_security",
                period=year,
                map_to="household",
            ).values
            + sim.calculate(
                "social_security_taxable_self_employment_income",
                period=year,
                map_to="household",
            ).values
        )
        payroll_target = load_taxable_payroll_projections(year)

    oasdi_tob_values = None
    oasdi_tob_target = None
    hi_tob_values = None
    hi_tob_target = None
    if profile.use_tob:
        oasdi_tob_values = sim.calculate(
            "tob_revenue_oasdi",
            period=year,
            map_to="household",
        ).values
        hi_tob_values = sim.calculate(
            "tob_revenue_medicare_hi",
            period=year,
            map_to="household",
        ).values
        oasdi_tob_target = load_oasdi_tob_projections(year)
        hi_tob_target = load_hi_tob_projections(year)

    aux_df, controls = _build_constraint_dataframe_and_controls(
        X_current,
        y_target,
        ss_values=ss_values,
        ss_target=ss_target,
        payroll_values=payroll_values,
        payroll_target=payroll_target,
        oasdi_tob_values=oasdi_tob_values,
        oasdi_tob_target=oasdi_tob_target,
        hi_tob_values=hi_tob_values,
        hi_tob_target=hi_tob_target,
        n_ages=X_current.shape[1],
    )
    targets = np.array(list(controls.values()), dtype=float)
    feasibility = assess_nonnegative_feasibility(
        aux_df.to_numpy(dtype=float),
        targets,
    )

    return {
        "dataset": dataset_label,
        "year": year,
        "profile": profile.name,
        "target_source": get_long_term_target_source(),
        "household_count": int(len(baseline_weights)),
        "age_bucket_size": int(age_bucket_size),
        "constraint_count": int(len(targets)),
        "best_case_max_pct_error": feasibility["best_case_max_pct_error"],
        "feasibility_status": feasibility["status"],
        "feasibility_message": feasibility["message"],
    }
+
+
def main() -> None:
    """CLI: measure feasibility before/after support augmentation, print JSON."""
    parser = argparse.ArgumentParser(
        description=(
            "Compare late-year nonnegative feasibility before and after support augmentation."
        )
    )
    parser.add_argument("year", type=int, help="Projection year to evaluate.")
    parser.add_argument(
        "--profile",
        default="ss-payroll",
        help="Calibration profile to evaluate.",
    )
    parser.add_argument(
        "--target-source",
        default="trustees_2025_current_law",
        help="Named long-run target source package.",
    )
    parser.add_argument(
        "--dataset",
        default=DEFAULT_DATASET,
        help="Base dataset path or HF reference.",
    )
    parser.add_argument(
        "--support-augmentation",
        default="late-clone-v1",
        help="Support augmentation profile name.",
    )
    args = parser.parse_args()

    # The target source must be configured before either evaluation runs.
    set_long_term_target_source(args.target_source)

    base_result = _evaluate_dataset(
        dataset=args.dataset,
        dataset_label="base",
        year=args.year,
        profile_name=args.profile,
    )
    augmented_dataset, augmentation_report = build_augmented_dataset(
        base_dataset=args.dataset,
        base_year=BASE_YEAR,
        profile=args.support_augmentation,
    )
    augmented_result = _evaluate_dataset(
        dataset=augmented_dataset,
        dataset_label=args.support_augmentation,
        year=args.year,
        profile_name=args.profile,
    )

    base_error = base_result["best_case_max_pct_error"]
    augmented_error = augmented_result["best_case_max_pct_error"]
    error_delta = (
        None
        if base_error is None or augmented_error is None
        else augmented_error - base_error
    )

    report = {
        "year": args.year,
        "profile": args.profile,
        "target_source": args.target_source,
        "augmentation": augmentation_report,
        "results": {
            "base": base_result,
            "augmented": augmented_result,
            "delta_best_case_max_pct_error": error_delta,
            "delta_household_count": (
                augmented_result["household_count"] - base_result["household_count"]
            ),
        },
    }
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
diff --git a/policyengine_us_data/datasets/cps/long_term/profile_support_concentration.py b/policyengine_us_data/datasets/cps/long_term/profile_support_concentration.py
new file mode 100644
index 000000000..b79ded0ff
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/profile_support_concentration.py
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import h5py
+import numpy as np
+
+
# Person-level H5 variables summed to form total Social Security benefits.
SS_COMPONENTS = (
    "social_security_retirement",
    "social_security_disability",
    "social_security_survivors",
    "social_security_dependents",
)
# Person-level pre-labor-supply-response income variables summed as the
# payroll (work) proxy; a total <= 0 is treated as non-working downstream.
PAYROLL_COMPONENTS = (
    "employment_income_before_lsr",
    "self_employment_income_before_lsr",
)
+
+
def _read_year_array(store: h5py.File, name: str, year: int) -> np.ndarray:
    """Read variable *name* for *year* from a TIME_PERIOD_ARRAYS-style store.

    Year groups are keyed by the stringified year; ``[()]`` materializes the
    dataset as an array.
    """
    year_group = store[name]
    return year_group[str(year)][()]
+
+
def _build_household_lookup(household_ids: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return (sorted ids, argsort order) for binary-search id lookups.

    ``order[i]`` is the original index of the i-th smallest household id.
    """
    order = np.argsort(household_ids)
    return household_ids[order], order
+
+
def _household_index(
    sorted_household_ids: np.ndarray,
    order: np.ndarray,
    person_household_ids: np.ndarray,
) -> np.ndarray:
    """Map each person's household id to its row in the household arrays.

    Raises:
        ValueError: when any person id is absent from the household support.
    """
    positions = np.searchsorted(sorted_household_ids, person_household_ids)
    if (positions >= len(sorted_household_ids)).any():
        raise ValueError("Person household ids exceed household id support")
    if not np.array_equal(sorted_household_ids[positions], person_household_ids):
        raise ValueError("Person household ids do not match household-level ids")
    return order[positions]
+
+
def _load_year(path: Path, year: int) -> dict[str, np.ndarray]:
    """Load the household/person arrays needed for concentration profiling."""
    with h5py.File(path, "r") as store:
        household_ids = _read_year_array(store, "household_id", year).astype(np.int64)
        household_weights = _read_year_array(store, "household_weight", year).astype(
            float
        )
        person_household_ids = _read_year_array(
            store, "person_household_id", year
        ).astype(np.int64)
        ages = _read_year_array(store, "age", year).astype(float)
        # Sum payroll-proxy and Social Security components person by person.
        payroll = sum(
            (_read_year_array(store, name, year).astype(float)
             for name in PAYROLL_COMPONENTS),
            start=np.zeros_like(ages, dtype=float),
        )
        social_security = sum(
            (_read_year_array(store, name, year).astype(float)
             for name in SS_COMPONENTS),
            start=np.zeros_like(ages, dtype=float),
        )

    sorted_ids, order = _build_household_lookup(household_ids)
    person_household_index = _household_index(sorted_ids, order, person_household_ids)

    return {
        "household_ids": household_ids,
        "household_weights": household_weights,
        "person_household_ids": person_household_ids,
        "person_household_index": person_household_index,
        "ages": ages,
        "payroll": payroll,
        "social_security": social_security,
    }
+
+
def _effective_sample_size(weights: np.ndarray) -> float:
    """Kish effective sample size: (sum w)^2 / sum(w^2); 0.0 when degenerate."""
    weight_sum = float(weights.sum())
    sum_of_squares = float(np.dot(weights, weights))
    if weight_sum > 0 and sum_of_squares > 0:
        return weight_sum**2 / sum_of_squares
    return 0.0
+
+
def _top_households(data: dict[str, np.ndarray], top_n: int) -> list[dict[str, object]]:
    """Describe the *top_n* highest-weight households, largest first.

    Households with non-positive weight are skipped. Each record reports
    the member ages (sorted, as ints) and the household's payroll-proxy and
    Social Security totals.
    """
    weights = data["household_weights"]
    total_weight = weights.sum()
    # Note: slicing must stay [-top_n:][::-1] to match argsort's ascending
    # order (descending among the selected households).
    ranked = np.argsort(weights)[-top_n:][::-1]
    records: list[dict[str, object]] = []
    for household_idx in ranked:
        weight = weights[household_idx]
        if weight <= 0:
            continue
        members = data["person_household_index"] == household_idx
        records.append(
            {
                "household_id": int(data["household_ids"][household_idx]),
                "weight": float(weight),
                "weight_share_pct": float(weight / total_weight * 100),
                "ages": np.sort(data["ages"][members]).astype(int).tolist(),
                "payroll_proxy": float(data["payroll"][members].sum()),
                "social_security_total": float(data["social_security"][members].sum()),
            }
        )
    return records
+
+
def profile_support(path: Path, year: int, *, top_n: int) -> dict[str, object]:
    """Summarize weight concentration and non-working support for one year file."""
    data = _load_year(path, year)
    weights = data["household_weights"]
    total_weight = weights.sum()
    ascending_weights = np.sort(weights)
    person_weights = weights[data["person_household_index"]]
    total_person_weight = person_weights.sum()

    nonworking = data["payroll"] <= 0
    elderly = data["ages"] >= 85
    elderly_weight = person_weights[elderly].sum()
    if elderly_weight > 0:
        elderly_nonworking_pct = float(
            person_weights[elderly & nonworking].sum() / elderly_weight * 100
        )
    else:
        elderly_nonworking_pct = 0.0

    positive = weights > 0
    return {
        "path": str(path),
        "year": year,
        "positive_household_count": int(positive.sum()),
        "positive_household_pct": float(positive.mean() * 100),
        "effective_sample_size": _effective_sample_size(weights),
        "top_10_weight_share_pct": float(
            ascending_weights[-10:].sum() / total_weight * 100
        ),
        "top_100_weight_share_pct": float(
            ascending_weights[-100:].sum() / total_weight * 100
        ),
        "weighted_nonworking_share_pct": float(
            person_weights[nonworking].sum() / total_person_weight * 100
        ),
        "weighted_nonworking_share_85_plus_pct": elderly_nonworking_pct,
        "top_households": _top_households(data, top_n),
    }
+
+
def main() -> None:
    """CLI entry point: print the support-concentration report as JSON."""
    parser = argparse.ArgumentParser(
        description="Profile late-year support concentration in projected household datasets."
    )
    parser.add_argument(
        "dataset", type=Path, help="Projected year-specific H5 dataset."
    )
    parser.add_argument("year", type=int, help="Projection year stored in the dataset.")
    parser.add_argument(
        "--top-n", type=int, default=20, help="Number of top households to emit."
    )
    options = parser.parse_args()

    report = profile_support(options.dataset, options.year, top_n=options.top_n)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
diff --git a/policyengine_us_data/datasets/cps/long_term/projection_utils.py b/policyengine_us_data/datasets/cps/long_term/projection_utils.py
index 8aee4f3b7..1c0baf5bc 100644
--- a/policyengine_us_data/datasets/cps/long_term/projection_utils.py
+++ b/policyengine_us_data/datasets/cps/long_term/projection_utils.py
@@ -1,5 +1,6 @@
import os
import gc
+import sys
import numpy as np
import h5py
@@ -7,6 +8,34 @@
from policyengine_core.data.dataset import Dataset
def validate_projected_social_security_cap(
    parameter_accessor,
    year: int,
    *,
    reference_year: int = 2035,
) -> float:
    """
    Ensure the Social Security taxable earnings cap keeps growing beyond the
    last explicitly projected year.

    The long-run calibration and diagnostics use taxable payroll targets
    through 2100. If the payroll cap flattens after the reference year, the
    late-year taxable payroll problem becomes mechanically distorted.
    """

    def _cap_for(period: int) -> float:
        return float(parameter_accessor(period).gov.irs.payroll.social_security.cap)

    current_cap = _cap_for(year)
    reference_cap = _cap_for(reference_year)
    # Tolerance of one part in 1e12 absorbs float noise in the parameter
    # uprating; anything within it counts as "flat".
    cap_is_flat = current_cap <= reference_cap * (1 + 1e-12)
    if year > reference_year and cap_is_flat:
        raise RuntimeError(
            "Social Security payroll cap is flat after "
            f"{reference_year}: {current_cap:,.2f} in {year}. "
            "This usually means policyengine-us is missing the long-run NAWI/"
            "payroll-cap extension."
        )
    return current_cap
+
+
def build_household_age_matrix(sim, n_ages=86):
"""
Build household age composition matrix from simulation.
@@ -39,6 +68,50 @@ def build_household_age_matrix(sim, n_ages=86):
return X, household_ids_unique, hh_id_to_idx
def build_age_bins(n_ages=86, bucket_size=None):
    """
    Build age-bucket ranges over the single-year age target vector.

    Args:
        n_ages: Length of the single-year age vector; slot ``n_ages - 1`` is
            the open-ended top bucket (85+ for the default 86).
        bucket_size: Bucket width; ``None`` or ``<= 1`` keeps single-year bins.

    Returns:
        Half-open ``(start, end)`` index ranges covering ``[0, n_ages)``.
        The final bucket always preserves the open-ended 85+ slot.
    """
    # Degenerate vector: no slots, no bins. The single-year branch already
    # returned [] here; previously the bucketed branch emitted a spurious
    # (0, n_ages) bin instead.
    if n_ages <= 0:
        return []
    if bucket_size is None or bucket_size <= 1:
        return [(age_idx, age_idx + 1) for age_idx in range(n_ages)]

    bins = []
    upper_single_age = max(n_ages - 1, 0)
    for start in range(0, upper_single_age, bucket_size):
        # Clamp so a coarse bucket never swallows the open-ended top slot.
        end = min(start + bucket_size, upper_single_age)
        bins.append((start, end))
    bins.append((upper_single_age, n_ages))
    return bins
+
+
def aggregate_household_age_matrix(X, age_bins):
    """
    Aggregate a single-year household age matrix into coarser age buckets.

    Returns ``X`` itself (no copy) when every bin is single-year.
    """
    is_identity = len(age_bins) == X.shape[1] and all(
        stop - start == 1 for start, stop in age_bins
    )
    if is_identity:
        return X
    bucket_columns = [X[:, start:stop].sum(axis=1) for start, stop in age_bins]
    return np.column_stack(bucket_columns)
+
+
def aggregate_age_targets(targets, age_bins):
    """
    Aggregate age targets over the first axis.

    Accepts either a single target vector `(n_ages,)` or a matrix
    `(n_ages, n_years)`.
    """
    targets = np.asarray(targets, dtype=float)
    if targets.ndim == 1:
        bucket_sums = [targets[start:stop].sum() for start, stop in age_bins]
        return np.array(bucket_sums, dtype=float)

    # Matrix case: collapse each age-bin slice to one row per bucket.
    bucket_rows = [targets[start:stop, :].sum(axis=0) for start, stop in age_bins]
    return np.vstack(bucket_rows)
+
+
def get_pseudo_input_variables(sim):
"""
Identify variables that appear as inputs but aggregate calculated values.
@@ -65,7 +138,14 @@ def get_pseudo_input_variables(sim):
return pseudo_inputs
-def create_household_year_h5(year, household_weights, base_dataset_path, output_dir):
+def create_household_year_h5(
+ year,
+ household_weights,
+ base_dataset,
+ output_dir,
+ *,
+ reform=None,
+):
"""
Create a year-specific .h5 file with calibrated household weights.
@@ -75,15 +155,16 @@ def create_household_year_h5(year, household_weights, base_dataset_path, output_
Args:
year: The year for this dataset
household_weights: Calibrated household weights for this year
- base_dataset_path: Path to base dataset
+ base_dataset: Path to base dataset or in-memory Dataset instance
output_dir: Directory to save the .h5 file
+ reform: Optional reform to apply when materializing year-specific values
Returns:
Path to the created .h5 file
"""
output_path = os.path.join(output_dir, f"{year}.h5")
- sim = Microsimulation(dataset=base_dataset_path)
+ sim = Microsimulation(dataset=base_dataset, reform=reform)
base_period = int(sim.default_calculation_period)
df = sim.to_input_dataframe()
@@ -129,9 +210,20 @@ def create_household_year_h5(year, household_weights, base_dataset_path, output_
df[col_name_new] = uprated_values
df.drop(columns=[col], inplace=True)
else:
+ print(
+ f"Warning: uprating {var_name} for {year} returned "
+ f"{len(uprated_values)} rows instead of {len(df)}; "
+ "renaming the base-year column without recalculation.",
+ file=sys.stderr,
+ )
df.rename(columns={col: col_name_new}, inplace=True)
- except:
+ except Exception as error:
+ print(
+ f"Warning: failed to uprate {var_name} for {year}: {error}; "
+ "renaming the base-year column without recalculation.",
+ file=sys.stderr,
+ )
df.rename(columns={col: col_name_new}, inplace=True)
dataset = Dataset.from_dataframe(df, year)
@@ -154,7 +246,7 @@ def create_household_year_h5(year, household_weights, base_dataset_path, output_
if values.dtype == np.object_:
try:
values = values.astype("S")
- except:
+ except (TypeError, ValueError):
continue
data[variable][period] = values
diff --git a/policyengine_us_data/datasets/cps/long_term/prototype_synthetic_2100_support.py b/policyengine_us_data/datasets/cps/long_term/prototype_synthetic_2100_support.py
new file mode 100644
index 000000000..12ab5b793
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/prototype_synthetic_2100_support.py
@@ -0,0 +1,3193 @@
+from __future__ import annotations
+
+import argparse
+from dataclasses import asdict, dataclass
+from functools import lru_cache
+import json
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from policyengine_core.data.dataset import Dataset
+from policyengine_us import Microsimulation
+
+try:
+ from .calibration import (
+ assess_nonnegative_feasibility,
+ calibrate_entropy,
+ calibrate_entropy_bounded,
+ densify_lp_solution,
+ )
+ from .projection_utils import (
+ aggregate_age_targets,
+ build_age_bins,
+ validate_projected_social_security_cap,
+ )
+ from .ssa_data import (
+ get_long_term_target_source,
+ load_ssa_age_projections,
+ load_ssa_benefit_projections,
+ load_taxable_payroll_projections,
+ set_long_term_target_source,
+ )
+except ImportError: # pragma: no cover - script execution fallback
+ from calibration import (
+ assess_nonnegative_feasibility,
+ calibrate_entropy,
+ calibrate_entropy_bounded,
+ densify_lp_solution,
+ )
+ from projection_utils import (
+ aggregate_age_targets,
+ build_age_bins,
+ validate_projected_social_security_cap,
+ )
+ from ssa_data import (
+ get_long_term_target_source,
+ load_ssa_age_projections,
+ load_ssa_benefit_projections,
+ load_taxable_payroll_projections,
+ set_long_term_target_source,
+ )
+
+
# Default inputs for the prototype run.
DEFAULT_DATASET = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"
DEFAULT_YEAR = 2100
BASE_YEAR = 2024
# Entity name -> (entity id column, person-level pointer column) in the flat
# input dataframe.
ENTITY_ID_COLUMNS = {
    "household": ("household_id", "person_household_id"),
    "family": ("family_id", "person_family_id"),
    "tax_unit": ("tax_unit_id", "person_tax_unit_id"),
    "spm_unit": ("spm_unit_id", "person_spm_unit_id"),
    "marital_unit": ("marital_unit_id", "person_marital_unit_id"),
}
PERSON_ID_COLUMN = "person_id"
# Person-level variables summed to form Social Security totals.
SS_COMPONENTS = (
    "social_security_retirement",
    "social_security_disability",
    "social_security_survivors",
    "social_security_dependents",
)
# Person-level variables summed to form payroll (earnings) totals.
PAYROLL_COMPONENTS = (
    "employment_income_before_lsr",
    "self_employment_income_before_lsr",
)
# Internal scratch-column names; double-underscore prefix avoids clashing
# with real dataset columns.
PAYROLL_UPRATING_FACTOR_COLUMN = "__pe_payroll_uprating_factor"
SS_UPRATING_FACTOR_COLUMN = "__pe_ss_uprating_factor"
+
+
@dataclass(frozen=True)
class SyntheticTemplate:
    """Grid specification for one synthetic household archetype.

    Each template is expanded (see ``generate_synthetic_candidates``) by
    crossing its age options with income levels drawn from the named
    quantile pools and the per-field scale factors.
    """

    name: str
    # Candidate ages for the tax-unit head.
    head_ages: tuple[int, ...]
    # Spouse age = head age + offset (floored at 18); None means no spouse.
    spouse_age_offsets: tuple[int | None, ...]
    # Alternative dependent-age combinations; () means no dependents.
    dependent_age_sets: tuple[tuple[int, ...], ...]
    # Quantile-pool names (keys of build_quantile_pools output) feeding each
    # income field.
    ss_source: str
    payroll_source: str
    pension_source: str
    dividend_source: str
    # (head, spouse) shares of the Social Security / payroll totals.
    ss_split: tuple[float, float]
    payroll_split: tuple[float, float]
    # Multipliers applied on top of each pooled income level.
    ss_scale_factors: tuple[float, ...] = (1.0,)
    payroll_scale_factors: tuple[float, ...] = (1.0,)
    pension_scale_factors: tuple[float, ...] = (1.0,)
    dividend_scale_factors: tuple[float, ...] = (1.0,)
+
+
@dataclass(frozen=True)
class SyntheticCandidate:
    """One concrete synthetic tax unit produced from a ``SyntheticTemplate``."""

    archetype: str
    head_age: int
    spouse_age: int | None
    dependent_ages: tuple[int, ...]
    head_wages: float
    spouse_wages: float
    head_ss: float
    spouse_ss: float
    pension_income: float
    dividend_income: float

    @property
    def payroll_total(self) -> float:
        """Combined head + spouse wages (uncapped)."""
        return float(self.head_wages + self.spouse_wages)

    def taxable_payroll_total(self, payroll_cap: float) -> float:
        """Combined wages with each earner individually capped at *payroll_cap*."""
        return float(
            min(self.head_wages, payroll_cap) + min(self.spouse_wages, payroll_cap)
        )

    @property
    def ss_total(self) -> float:
        """Combined head + spouse Social Security benefits."""
        return float(self.head_ss + self.spouse_ss)

    def ages(self) -> list[int]:
        """All member ages: head first, then spouse (if any), then dependents."""
        values = [self.head_age]
        if self.spouse_age is not None:
            values.append(self.spouse_age)
        values.extend(self.dependent_ages)
        return values

    def filing_status(self) -> str:
        """Proxy filing status: "joint" when a spouse is present, else "single"."""
        return "joint" if self.spouse_age is not None else "single"

    def taxable_benefits_proxy(self) -> float:
        """Approximate the taxable portion of the unit's Social Security.

        Uses provisional income (other income plus half of benefits) against
        two thresholds per filing status; the constants mirror the statutory
        SS-taxation base/adjusted amounts (presumably IRC §86 / Pub. 915 —
        confirm against policyengine-us before relying on exact values).
        """
        benefits = self.ss_total
        if benefits <= 0:
            return 0.0
        provisional_income = (
            self.payroll_total
            + self.pension_income
            + self.dividend_income
            + 0.5 * benefits
        )
        if self.filing_status() == "joint":
            base = 32_000.0
            adjusted = 44_000.0
            lesser_cap = 6_000.0
        else:
            base = 25_000.0
            adjusted = 34_000.0
            lesser_cap = 4_500.0

        # Below the base threshold nothing is taxable; between the thresholds
        # up to 50% phases in; above, up to 85% with the 50%-tier carry capped.
        if provisional_income <= base:
            return 0.0
        if provisional_income <= adjusted:
            return min(0.5 * benefits, 0.5 * (provisional_income - base))
        return min(
            0.85 * benefits,
            0.85 * (provisional_income - adjusted) + min(0.5 * benefits, lesser_cap),
        )
+
+
# The full archetype grid: single/couple/family variants for older
# beneficiaries, older workers, mixed retiree/worker couples, prime-age
# workers, and late-career workers. Expanded by generate_synthetic_candidates.
TEMPLATES = (
    SyntheticTemplate(
        name="older_beneficiary_single",
        head_ages=(62, 67, 72, 77, 82, 85),
        spouse_age_offsets=(None,),
        dependent_age_sets=((),),
        ss_source="older_beneficiary",
        payroll_source="zero",
        pension_source="older_asset",
        dividend_source="older_asset",
        ss_split=(1.0, 0.0),
        payroll_split=(0.0, 0.0),
        ss_scale_factors=(0.75, 1.0, 1.25),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="older_beneficiary_couple",
        head_ages=(62, 67, 72, 77, 82, 85),
        spouse_age_offsets=(-2, -5, -8),
        dependent_age_sets=((),),
        ss_source="older_couple_beneficiary",
        payroll_source="zero",
        pension_source="older_asset",
        dividend_source="older_asset",
        ss_split=(0.55, 0.45),
        payroll_split=(0.0, 0.0),
        ss_scale_factors=(0.75, 1.0, 1.25),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="older_worker_single",
        head_ages=(62, 65, 67, 70, 75, 80),
        spouse_age_offsets=(None,),
        dependent_age_sets=((),),
        ss_source="older_worker",
        payroll_source="older_worker",
        pension_source="older_asset",
        dividend_source="older_asset",
        ss_split=(1.0, 0.0),
        payroll_split=(1.0, 0.0),
        ss_scale_factors=(0.5, 0.75, 1.0, 1.25),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="older_worker_couple",
        head_ages=(62, 65, 67, 70, 75, 80),
        spouse_age_offsets=(-2, -5),
        dependent_age_sets=((),),
        ss_source="older_worker",
        payroll_source="older_worker",
        pension_source="older_asset",
        dividend_source="older_asset",
        ss_split=(0.55, 0.45),
        payroll_split=(0.55, 0.45),
        ss_scale_factors=(0.5, 0.75, 1.0, 1.25),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="mixed_retiree_worker_couple",
        head_ages=(62, 67, 72, 77, 82, 85),
        spouse_age_offsets=(-10, -15, -20, -25, -35),
        dependent_age_sets=((),),
        ss_source="older_beneficiary",
        payroll_source="prime_worker",
        pension_source="older_asset",
        dividend_source="older_asset",
        ss_split=(1.0, 0.0),
        payroll_split=(0.0, 1.0),
        ss_scale_factors=(0.5, 0.75, 1.0, 1.25),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="prime_worker_single",
        head_ages=(20, 22, 25, 27, 30, 35, 40, 45, 50, 55, 60, 64),
        spouse_age_offsets=(None,),
        dependent_age_sets=((),),
        ss_source="zero",
        payroll_source="prime_worker",
        pension_source="prime_asset",
        dividend_source="prime_asset",
        ss_split=(0.0, 0.0),
        payroll_split=(1.0, 0.0),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="prime_worker_couple",
        head_ages=(25, 30, 35, 40, 45, 50, 55, 60),
        spouse_age_offsets=(-2, -5, -8),
        dependent_age_sets=((),),
        ss_source="zero",
        payroll_source="prime_worker",
        pension_source="prime_asset",
        dividend_source="prime_asset",
        ss_split=(0.0, 0.0),
        payroll_split=(0.6, 0.4),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="prime_worker_family",
        head_ages=(25, 30, 35, 40, 45, 50, 55),
        spouse_age_offsets=(-2,),
        dependent_age_sets=((0,), (3,), (7,), (12,), (16,), (4, 9), (11, 16)),
        ss_source="zero",
        payroll_source="prime_worker_family",
        pension_source="prime_asset",
        dividend_source="prime_asset",
        ss_split=(0.0, 0.0),
        payroll_split=(0.6, 0.4),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="older_plus_prime_worker_family",
        head_ages=(62, 67, 72, 77, 82, 85),
        spouse_age_offsets=(-15, -25, -35),
        dependent_age_sets=((0,), (7,), (15,), (4, 9), (11, 16)),
        ss_source="older_beneficiary",
        payroll_source="prime_worker_family",
        pension_source="older_asset",
        dividend_source="older_asset",
        ss_split=(1.0, 0.0),
        payroll_split=(0.0, 1.0),
        ss_scale_factors=(0.5, 0.75, 1.0, 1.25),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="late_worker_couple",
        head_ages=(58, 60, 62, 64, 66, 68),
        spouse_age_offsets=(-2, -5),
        dependent_age_sets=((),),
        ss_source="older_worker",
        payroll_source="prime_worker",
        pension_source="prime_asset",
        dividend_source="prime_asset",
        ss_split=(0.6, 0.4),
        payroll_split=(0.6, 0.4),
        ss_scale_factors=(0.25, 0.5, 0.75, 1.0),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
    SyntheticTemplate(
        name="late_worker_single",
        head_ages=(58, 60, 62, 64, 66, 68),
        spouse_age_offsets=(None,),
        dependent_age_sets=((),),
        ss_source="older_worker",
        payroll_source="prime_worker",
        pension_source="prime_asset",
        dividend_source="prime_asset",
        ss_split=(1.0, 0.0),
        payroll_split=(1.0, 0.0),
        ss_scale_factors=(0.25, 0.5, 0.75, 1.0),
        payroll_scale_factors=(0.5, 1.0, 1.5, 2.0),
        pension_scale_factors=(0.0, 1.0),
        dividend_scale_factors=(0.0, 1.0),
    ),
)
+
+
def parse_args() -> argparse.Namespace:
    """Build and evaluate the prototype's command-line interface."""
    cli = argparse.ArgumentParser(
        description=(
            "Prototype a fully synthetic late-year support using a minimal set of "
            "head/spouse/dependent features."
        )
    )
    cli.add_argument(
        "--year",
        type=int,
        default=DEFAULT_YEAR,
        help="Projection year to target.",
    )
    cli.add_argument(
        "--target-source",
        default=get_long_term_target_source(),
        help="Named long-run target source package.",
    )
    cli.add_argument(
        "--base-dataset",
        default=DEFAULT_DATASET,
        help="Base 2024 dataset used to derive comparison pools.",
    )
    cli.add_argument(
        "--output",
        type=Path,
        help="Optional JSON output path.",
    )
    cli.add_argument(
        "--epsilon-path",
        default="0.25,0.5,1.0",
        help=(
            "Comma-separated approximate error thresholds to probe with "
            "bounded entropy after the exact solve. Use an empty string to disable."
        ),
    )
    cli.add_argument(
        "--donor-probe-top-n",
        type=int,
        default=20,
        help="Number of top exact-fit synthetic candidates to probe against real donors.",
    )
    cli.add_argument(
        "--donor-probe-k",
        type=int,
        default=5,
        help="Number of nearest real donors to report for each probed synthetic candidate.",
    )
    return cli.parse_args()
+
+
def parse_epsilon_path(value: str) -> tuple[float, ...]:
    """Parse a comma-separated epsilon list; blank input or segments are ignored."""
    if value.strip() == "":
        return ()
    thresholds = []
    for piece in value.split(","):
        piece = piece.strip()
        if piece:
            thresholds.append(float(piece))
    return tuple(thresholds)
+
+
def _period_column(name: str, base_year: int) -> str:
    """Column name for *name* materialized at *base_year* (e.g. ``age__2024``)."""
    return "{}__{}".format(name, base_year)
+
+
def classify_archetype(
    *,
    head_age: float,
    spouse_age: float | None,
    dependent_count: int,
    ss_total: float,
    payroll_total: float,
) -> str:
    """Label a tax unit with the synthetic archetype family it belongs to.

    Mirrors the head-age / spouse-age / income-sign decision tree used by
    the synthetic templates.
    """
    has_ss = ss_total > 0
    has_payroll = payroll_total > 0

    if head_age >= 65:
        # Older heads: distinguish by dependents, spouse age, and income mix.
        if dependent_count > 0 and has_payroll:
            return "older_plus_prime_worker_family"
        if spouse_age is not None and spouse_age < 65 and has_payroll:
            return "mixed_retiree_worker_couple"
        if spouse_age is not None and spouse_age >= 65:
            if has_payroll and has_ss:
                return "older_worker_couple"
            return "older_beneficiary_couple" if has_ss else "older_couple_other"
        # Singles — and couples with a younger, non-working spouse — fall
        # through to the single-head labels (preserved behavior).
        if has_payroll and has_ss:
            return "older_worker_single"
        return "older_beneficiary_single" if has_ss else "older_single_other"

    if dependent_count > 0:
        return "prime_worker_family" if has_payroll else "prime_other_family"
    if spouse_age is not None:
        return "prime_worker_couple" if has_payroll else "prime_other_couple"
    return "prime_worker_single" if has_payroll else "prime_other_single"
+
+
def build_tax_unit_summary(
    dataset: str | Dataset,
    *,
    period: int,
) -> pd.DataFrame:
    """Summarize every tax unit in *dataset* at *period* into one row.

    Each row carries head/spouse/dependent ages, payroll (SS-taxable
    earnings plus taxable self-employment income), Social Security,
    pension and dividend income, plus weight proxies and an archetype
    label from ``classify_archetype``.
    """
    sim = Microsimulation(dataset=dataset)
    input_df = sim.to_input_dataframe()

    # One row per person; entity ids let us group people into tax units.
    person_df = pd.DataFrame(
        {
            "tax_unit_id": sim.calculate("person_tax_unit_id", period=period).values,
            "household_id": sim.calculate("person_household_id", period=period).values,
            "age": sim.calculate("age", period=period).values,
            "is_head": sim.calculate("is_tax_unit_head", period=period).values,
            "is_spouse": sim.calculate("is_tax_unit_spouse", period=period).values,
            "is_dependent": sim.calculate(
                "is_tax_unit_dependent", period=period
            ).values,
            "social_security": sim.calculate("social_security", period=period).values,
            # Payroll = earnings subject to SS tax + taxable self-employment.
            "payroll": (
                sim.calculate(
                    "taxable_earnings_for_social_security", period=period
                ).values
                + sim.calculate(
                    "social_security_taxable_self_employment_income", period=period
                ).values
            ),
            "dividend_income": sim.calculate(
                "qualified_dividend_income", period=period
            ).values,
            "pension_income": sim.calculate(
                "taxable_pension_income", period=period
            ).values,
            "person_weight": input_df[f"person_weight__{period}"].astype(float).values,
            "household_weight": input_df[f"household_weight__{period}"]
            .astype(float)
            .values,
        }
    )

    rows: list[dict[str, object]] = []
    for tax_unit_id, group in person_df.groupby("tax_unit_id", sort=False):
        heads = group[group["is_head"]]
        spouses = group[group["is_spouse"]]
        dependents = group[group["is_dependent"]]
        adults = group[group["age"] >= 18]
        # Fall back to the oldest adult (then oldest member) when no explicit
        # head flag is present.
        head_age = float(
            heads["age"].iloc[0]
            if not heads.empty
            else adults["age"].max()
            if not adults.empty
            else group["age"].max()
        )
        spouse_age = float(spouses["age"].iloc[0]) if not spouses.empty else None
        dependent_count = int(len(dependents))
        adult_count = int((group["age"] >= 18).sum())
        dependent_ages = tuple(sorted(int(age) for age in dependents["age"].tolist()))
        head_payroll = float(heads["payroll"].sum()) if not heads.empty else 0.0
        spouse_payroll = float(spouses["payroll"].sum()) if not spouses.empty else 0.0
        head_ss = float(heads["social_security"].sum()) if not heads.empty else 0.0
        spouse_ss = (
            float(spouses["social_security"].sum()) if not spouses.empty else 0.0
        )
        row = {
            "tax_unit_id": int(tax_unit_id),
            "household_id": int(group["household_id"].iloc[0]),
            "head_age": head_age,
            "spouse_age": spouse_age,
            "adult_count": adult_count,
            "dependent_count": dependent_count,
            "dependent_ages": dependent_ages,
            "head_payroll": head_payroll,
            "spouse_payroll": spouse_payroll,
            "head_ss": head_ss,
            "spouse_ss": spouse_ss,
            "payroll_total": float(group["payroll"].sum()),
            "ss_total": float(group["social_security"].sum()),
            "dividend_income": float(group["dividend_income"].sum()),
            "pension_income": float(group["pension_income"].sum()),
            "support_count_weight": 1.0,
            # max() over members proxies the unit's weight; members share a
            # household so these should agree — TODO confirm for split units.
            "person_weight_proxy": float(group["person_weight"].max()),
            "household_weight_proxy": float(group["household_weight"].max()),
        }
        row["archetype"] = classify_archetype(
            head_age=row["head_age"],
            spouse_age=row["spouse_age"],
            dependent_count=row["dependent_count"],
            ss_total=row["ss_total"],
            payroll_total=row["payroll_total"],
        )
        rows.append(row)

    return pd.DataFrame(rows)
+
+
def build_actual_tax_unit_summary(base_dataset: str) -> pd.DataFrame:
    """Summarize the base dataset's tax units at the base year (2024)."""
    return build_tax_unit_summary(base_dataset, period=BASE_YEAR)
+
+
def attach_person_uprating_factors(
    input_df: pd.DataFrame,
    sim: Microsimulation,
    *,
    base_year: int,
    target_year: int,
) -> pd.DataFrame:
    """Return a copy of *input_df* with payroll and SS uprating-factor columns.

    Each factor is the person's target-year component total divided by the
    base-year total; people with no positive base-year amount get NaN.
    """
    df = input_df.copy()

    def _base_total(components: tuple[str, ...]) -> np.ndarray:
        # Sum whichever base-year component columns actually exist.
        columns = [
            _period_column(component, base_year)
            for component in components
            if _period_column(component, base_year) in df.columns
        ]
        if not columns:
            return np.zeros(len(df), dtype=float)
        return df[columns].astype(float).sum(axis=1).to_numpy()

    def _target_total(components: tuple[str, ...]) -> np.ndarray:
        return sum(
            sim.calculate(component, period=target_year).values.astype(float)
            for component in components
        )

    base_payroll = _base_total(PAYROLL_COMPONENTS)
    base_ss = _base_total(SS_COMPONENTS)

    # The max() in the denominator guards the division; the np.where mask
    # already routes non-positive bases to NaN.
    df[PAYROLL_UPRATING_FACTOR_COLUMN] = np.where(
        base_payroll > 0,
        _target_total(PAYROLL_COMPONENTS) / np.maximum(base_payroll, 1e-12),
        np.nan,
    )
    df[SS_UPRATING_FACTOR_COLUMN] = np.where(
        base_ss > 0,
        _target_total(SS_COMPONENTS) / np.maximum(base_ss, 1e-12),
        np.nan,
    )
    return df
+
+
def load_base_aggregates(base_dataset: str) -> dict[str, float]:
    """Compute base-year (2024) weighted Social Security and payroll totals."""
    sim = Microsimulation(dataset=base_dataset)
    household_series = sim.calculate(
        "household_id", period=BASE_YEAR, map_to="household"
    )
    # Household weights come from the id series' attached weight vector.
    weights = household_series.weights.values.astype(float)
    ss = sim.calculate("social_security", period=BASE_YEAR, map_to="household").values
    # Payroll = SS-taxable earnings + taxable self-employment income, both
    # mapped to the household level.
    payroll = (
        sim.calculate(
            "taxable_earnings_for_social_security",
            period=BASE_YEAR,
            map_to="household",
        ).values
        + sim.calculate(
            "social_security_taxable_self_employment_income",
            period=BASE_YEAR,
            map_to="household",
        ).values
    )
    return {
        "weighted_ss_total": float(np.sum(ss * weights)),
        "weighted_payroll_total": float(np.sum(payroll * weights)),
    }
+
+
def quantile_levels(
    values: pd.Series,
    *,
    quantiles: tuple[float, ...],
    include_zero: bool = False,
    positive_only: bool = False,
) -> list[float]:
    """Return the requested quantiles of *values*, rounded and deduplicated.

    An empty (post-filter) series collapses to ``[0.0]``; ``include_zero``
    prepends an explicit zero level. Levels are rounded to cents and
    deduplicated in order.
    """
    series = values.astype(float)
    if positive_only:
        series = series[series > 0]

    if series.empty:
        raw_levels = [0.0]
    else:
        raw_levels = [float(series.quantile(level)) for level in quantiles]
    if include_zero:
        raw_levels.insert(0, 0.0)

    unique_levels: list[float] = []
    for level in raw_levels:
        rounded = round(level, 2)
        if rounded not in unique_levels:
            unique_levels.append(rounded)
    return [float(level) for level in unique_levels]
+
+
def _scale_levels(levels: list[float], scale: float) -> list[float]:
    """Scale each level by *scale*, rounding back to cents."""
    scaled = []
    for level in levels:
        scaled.append(round(level * scale, 2))
    return scaled
+
+
@lru_cache(maxsize=None)
def load_policyengine_social_security_cap(year: int) -> float:
    """Return the projected Social Security payroll cap for *year*.

    Cached: constructing a Microsimulation is expensive and the cap for a
    given year is constant within a run. The helper also validates that the
    cap keeps growing past the reference year (raises RuntimeError if flat).
    """
    sim = Microsimulation(dataset=DEFAULT_DATASET)
    return validate_projected_social_security_cap(
        sim.tax_benefit_system.parameters,
        year,
    )
+
+
def allocate_taxable_payroll_wages(
    total_taxable_payroll: float,
    payroll_split: tuple[float, float],
    payroll_cap: float,
    *,
    has_spouse: bool,
) -> tuple[float, float]:
    """Split a taxable-payroll total into capped (head, spouse) wage amounts.

    Each earner is individually limited to ``payroll_cap``; with two positive
    earners the head share follows ``payroll_split`` as closely as the caps
    allow.
    """
    total = max(float(total_taxable_payroll), 0.0)
    head_share, spouse_share = payroll_split
    earner_count = int(head_share > 0) + int(has_spouse and spouse_share > 0)

    # Single-earner case: everything (up to the cap) goes to whichever role
    # the split favors.
    if not has_spouse or earner_count <= 1:
        if has_spouse and spouse_share > head_share:
            return 0.0, min(total, payroll_cap)
        return min(total, payroll_cap), 0.0

    # Two earners: clamp the preferred head amount so both stay within cap.
    total = min(total, earner_count * payroll_cap)
    lower_bound = max(0.0, total - payroll_cap)
    upper_bound = min(payroll_cap, total)
    head_amount = min(max(total * float(head_share), lower_bound), upper_bound)
    return float(head_amount), float(total - head_amount)
+
+
def build_quantile_pools(
    actual_summary: pd.DataFrame,
    *,
    ss_scale: float,
    earnings_scale: float,
) -> dict[str, dict[str, list[float]]]:
    """Build named income-level pools from the real tax-unit summary.

    For each named subpopulation mask, returns quantile levels of its SS,
    payroll, pension, and dividend income, scaled by *ss_scale* /
    *earnings_scale*. Templates reference these pools by name.
    """
    # Subpopulation masks keyed by pool name; "zero" matches everyone and is
    # used where a template wants a zero level included.
    masks = {
        "older_beneficiary": (actual_summary["head_age"] >= 65)
        & (actual_summary["ss_total"] > 0),
        # fillna(-1) treats missing spouses as "under 65".
        "older_couple_beneficiary": (actual_summary["head_age"] >= 65)
        & actual_summary["spouse_age"].fillna(-1).ge(65)
        & (actual_summary["ss_total"] > 0),
        "older_worker": (actual_summary["head_age"] >= 65)
        & (actual_summary["ss_total"] > 0)
        & (actual_summary["payroll_total"] > 0),
        "prime_worker": (actual_summary["head_age"] < 65)
        & (actual_summary["payroll_total"] > 0),
        "prime_worker_family": (actual_summary["head_age"] < 65)
        & (actual_summary["dependent_count"] > 0)
        & (actual_summary["payroll_total"] > 0),
        "older_asset": (actual_summary["head_age"] >= 65)
        & (
            (actual_summary["pension_income"] > 0)
            | (actual_summary["dividend_income"] > 0)
        ),
        "prime_asset": (actual_summary["head_age"] < 65)
        & (
            (actual_summary["pension_income"] > 0)
            | (actual_summary["dividend_income"] > 0)
        ),
        "zero": actual_summary["head_age"].notna(),
    }

    pools: dict[str, dict[str, list[float]]] = {}
    for name, mask in masks.items():
        subset = actual_summary[mask]
        pools[name] = {
            "ss": _scale_levels(
                quantile_levels(
                    subset["ss_total"],
                    quantiles=(0.1, 0.25, 0.5, 0.75, 0.9),
                    include_zero=(name == "zero"),
                    positive_only=(name != "zero"),
                ),
                ss_scale,
            ),
            # Retiree-style pools also get an explicit zero payroll level.
            "payroll": _scale_levels(
                quantile_levels(
                    subset["payroll_total"],
                    quantiles=(0.1, 0.25, 0.5, 0.75, 0.9),
                    include_zero=(
                        name
                        in {
                            "zero",
                            "older_beneficiary",
                            "older_couple_beneficiary",
                            "older_asset",
                        }
                    ),
                    positive_only=(name not in {"zero"}),
                ),
                earnings_scale,
            ),
            "pension": _scale_levels(
                quantile_levels(
                    subset["pension_income"],
                    quantiles=(0.25, 0.5, 0.75, 0.9),
                    include_zero=True,
                    positive_only=True,
                ),
                earnings_scale,
            ),
            "dividend": _scale_levels(
                quantile_levels(
                    subset["dividend_income"],
                    quantiles=(0.25, 0.5, 0.75, 0.9),
                    include_zero=True,
                    positive_only=True,
                ),
                earnings_scale,
            ),
        }
    return pools
+
+
def generate_synthetic_candidates(
    pools: dict[str, dict[str, list[float]]],
    *,
    payroll_cap: float,
) -> list[SyntheticCandidate]:
    """Expand every template over its age/income grids into candidates.

    For each template, the cross product of head ages, spouse offsets,
    dependent sets, and (level x scale) grids for Social Security, payroll,
    pension, and dividend income is materialized; payroll is allocated
    across spouses under the taxable cap. Exact duplicates caused by
    repeated quantile levels are removed, keeping the last occurrence.

    The itertools.product flattening replaces ten levels of nested loops;
    the factor order matches the original nesting, so iteration order (and
    therefore dedup results) is unchanged.
    """
    from itertools import product  # local import keeps this edit self-contained

    candidates: list[SyntheticCandidate] = []
    for template in TEMPLATES:
        ss_levels = pools[template.ss_source]["ss"]
        payroll_levels = pools[template.payroll_source]["payroll"]
        pension_levels = pools[template.pension_source]["pension"]
        dividend_levels = pools[template.dividend_source]["dividend"]
        grid = product(
            template.head_ages,
            template.spouse_age_offsets,
            template.dependent_age_sets,
            ss_levels,
            template.ss_scale_factors,
            payroll_levels,
            template.payroll_scale_factors,
            pension_levels,
            template.pension_scale_factors,
            dividend_levels,
            template.dividend_scale_factors,
        )
        for (
            head_age,
            spouse_offset,
            dependent_ages,
            ss_level,
            ss_scale,
            payroll_level,
            payroll_scale,
            pension_level,
            pension_scale,
            dividend_level,
            dividend_scale,
        ) in grid:
            # Spouse age is offset from the head's, floored at adulthood.
            spouse_age = (
                None if spouse_offset is None else max(18, head_age + spouse_offset)
            )
            scaled_ss_total = ss_level * ss_scale
            head_wages, spouse_wages = allocate_taxable_payroll_wages(
                payroll_level * payroll_scale,
                template.payroll_split,
                payroll_cap,
                has_spouse=spouse_age is not None,
            )
            candidates.append(
                SyntheticCandidate(
                    archetype=template.name,
                    head_age=head_age,
                    spouse_age=spouse_age,
                    dependent_ages=tuple(dependent_ages),
                    head_wages=head_wages,
                    spouse_wages=spouse_wages,
                    head_ss=scaled_ss_total * template.ss_split[0],
                    spouse_ss=scaled_ss_total * template.ss_split[1],
                    pension_income=pension_level * pension_scale,
                    dividend_income=dividend_level * dividend_scale,
                )
            )

    # Deduplicate exact duplicates caused by repeated quantiles.
    deduped: dict[tuple[object, ...], SyntheticCandidate] = {}
    for candidate in candidates:
        key = (
            candidate.archetype,
            candidate.head_age,
            candidate.spouse_age,
            candidate.dependent_ages,
            round(candidate.head_wages, 2),
            round(candidate.spouse_wages, 2),
            round(candidate.head_ss, 2),
            round(candidate.spouse_ss, 2),
            round(candidate.pension_income, 2),
            round(candidate.dividend_income, 2),
        )
        deduped[key] = candidate
    return list(deduped.values())
+
+
def age_bucket_vector(ages: list[int], age_bins: list[tuple[int, int]]) -> np.ndarray:
    """Count *ages* into the half-open ``(start, end)`` buckets of *age_bins*.

    The final bin is treated as open-ended (85+ for the default bins from
    ``build_age_bins``): any age at or above its start lands there. Ages
    falling outside every bin are silently dropped.
    """
    vector = np.zeros(len(age_bins), dtype=float)
    # Guard: with no bins there is nothing to count (previously vector[-1]
    # raised IndexError on an empty array).
    if not age_bins:
        return vector
    # Derive the open-ended threshold from the bins instead of hard-coding
    # 85, so non-default layouts stay consistent with build_age_bins.
    top_bucket_start = age_bins[-1][0]
    for age in ages:
        if age >= top_bucket_start:
            vector[-1] += 1.0
            continue
        for idx, (start, end) in enumerate(age_bins):
            if start <= age < end:
                vector[idx] += 1.0
                break
    return vector
+
+
def build_role_composite_calibration_blueprint(
    augmentation_report: dict[str, object],
    *,
    year: int,
    age_bins: list[tuple[int, int]],
    hh_id_to_idx: dict[int, int],
    baseline_weights: np.ndarray,
    base_weight_scale: float = 0.5,
) -> dict[str, object] | None:
    """
    Build target-year calibration overrides for donor-composite clones.

    Donor-composite augmentation produces realized microdata rows that are close
    to the synthetic target support but not numerically identical. At the
    augmentation target year we can calibrate against the exact clone
    blueprints, using the synthetic solution as priors, while still applying
    the resulting household weights to the realized rows.
    """
    target_year = augmentation_report.get("target_year")
    clone_reports = augmentation_report.get("clone_household_reports")
    # Only applicable at the augmentation target year, and only when there
    # are clones to override.
    if target_year is None or int(target_year) != int(year):
        return None
    if not clone_reports:
        return None

    baseline_weights = np.asarray(baseline_weights, dtype=float)
    if baseline_weights.ndim != 1:
        raise ValueError("baseline_weights must be one-dimensional")

    # Scaled-down baseline priors for all households; clones get replaced
    # below with their blueprint weight shares. The 1e-12 floor keeps the
    # downstream entropy solver's log() well-defined.
    prior_weights = np.maximum(
        baseline_weights * float(base_weight_scale),
        1e-12,
    )
    clone_total_prior_weight = max(float(baseline_weights.sum()), 1.0)
    age_overrides: dict[int, np.ndarray] = {}
    ss_overrides: dict[int, float] = {}
    payroll_overrides: dict[int, float] = {}
    applied_clone_households = 0

    for clone_report in clone_reports:
        household_id = int(clone_report["clone_household_id"])
        idx = hh_id_to_idx.get(household_id)
        # Skip blueprints whose clone household is absent from this dataset.
        if idx is None:
            continue
        ages = [int(clone_report["target_head_age"])]
        spouse_age = clone_report.get("target_spouse_age")
        if spouse_age is not None:
            ages.append(int(spouse_age))
        ages.extend(int(age) for age in clone_report.get("target_dependent_ages", []))
        age_overrides[idx] = age_bucket_vector(ages, age_bins)
        ss_overrides[idx] = float(clone_report["target_ss_total"])
        payroll_overrides[idx] = float(clone_report["target_payroll_total"])
        # Prior = the clone's blueprint share (percent) of total baseline weight.
        prior_weights[idx] = (
            clone_total_prior_weight
            * float(clone_report["per_clone_weight_share_pct"])
            / 100.0
        )
        applied_clone_households += 1

    return {
        "baseline_weights": np.maximum(prior_weights, 1e-12),
        "age_overrides": age_overrides,
        "ss_overrides": ss_overrides,
        "payroll_overrides": payroll_overrides,
        "summary": {
            "mode": "target_year_role_composite_blueprint",
            "target_year": int(target_year),
            "clone_household_count": int(applied_clone_households),
            "base_weight_scale": float(base_weight_scale),
            "clone_total_prior_weight": float(clone_total_prior_weight),
        },
    }
+
+
+def _ages_from_summary_row(row: pd.Series) -> list[int]:
+ ages = [int(round(float(row["head_age"])))]
+ if pd.notna(row.get("spouse_age")):
+ ages.append(int(round(float(row["spouse_age"]))))
+ ages.extend(int(age) for age in row.get("dependent_ages", ()))
+ return ages
+
+
def _clone_report_record(
    *,
    clone_df: pd.DataFrame,
    base_year: int,
    target_candidate: SyntheticCandidate,
    candidate_idx: int,
    target_weight_share_pct: float,
    clone_weight_scale: float,
    combination_count: int,
    older_donor_row: pd.Series | None,
    worker_donor_row: pd.Series | None,
) -> dict[str, object]:
    """Build the audit record for one clone household.

    Records the clone's base-year identity, ages and payroll/SS totals
    alongside the target candidate's blueprint values, plus (when present)
    the donor tax units and match distances that produced the clone.
    """
    household_id_col = _period_column("household_id", base_year)
    tax_unit_id_col = _period_column("tax_unit_id", base_year)
    age_col = _period_column("age", base_year)
    # Only sum the payroll/SS component columns actually present in the frame.
    payroll_columns = [
        _period_column(component, base_year)
        for component in PAYROLL_COMPONENTS
        if _period_column(component, base_year) in clone_df.columns
    ]
    ss_columns = [
        _period_column(component, base_year)
        for component in SS_COMPONENTS
        if _period_column(component, base_year) in clone_df.columns
    ]
    ages = sorted(int(round(age)) for age in clone_df[age_col].astype(float).tolist())
    return {
        "candidate_idx": int(candidate_idx),
        "archetype": target_candidate.archetype,
        # Ids are taken from the first person row; all rows share the unit.
        "clone_household_id": int(clone_df[household_id_col].iloc[0]),
        "clone_tax_unit_id": int(clone_df[tax_unit_id_col].iloc[0]),
        "clone_person_count": int(len(clone_df)),
        "clone_ages": ages,
        "base_clone_payroll_total": float(clone_df[payroll_columns].sum().sum())
        if payroll_columns
        else 0.0,
        "base_clone_ss_total": float(clone_df[ss_columns].sum().sum())
        if ss_columns
        else 0.0,
        "target_weight_share_pct": float(target_weight_share_pct),
        # The target's weight share is split evenly across donor combinations.
        "per_clone_weight_share_pct": float(
            target_weight_share_pct / max(combination_count, 1)
        ),
        "clone_weight_scale": float(clone_weight_scale),
        "target_head_age": int(target_candidate.head_age),
        "target_spouse_age": (
            int(target_candidate.spouse_age)
            if target_candidate.spouse_age is not None
            else None
        ),
        "target_dependent_ages": list(target_candidate.dependent_ages),
        "target_head_wages": float(target_candidate.head_wages),
        "target_spouse_wages": float(target_candidate.spouse_wages),
        "target_head_ss": float(target_candidate.head_ss),
        "target_spouse_ss": float(target_candidate.spouse_ss),
        "target_payroll_total": float(target_candidate.payroll_total),
        "target_ss_total": float(target_candidate.ss_total),
        "older_donor_tax_unit_id": (
            int(older_donor_row["tax_unit_id"]) if older_donor_row is not None else None
        ),
        "worker_donor_tax_unit_id": (
            int(worker_donor_row["tax_unit_id"])
            if worker_donor_row is not None
            else None
        ),
        # Distances are only present when the donor came from a matcher.
        "older_donor_distance": (
            float(older_donor_row["distance"])
            if older_donor_row is not None and "distance" in older_donor_row
            else None
        ),
        "worker_donor_distance": (
            float(worker_donor_row["distance"])
            if worker_donor_row is not None and "distance" in worker_donor_row
            else None
        ),
    }
+
+
def summarize_realized_clone_translation(
    dataset: str | Dataset,
    *,
    period: int,
    augmentation_report: dict[str, object],
    age_bucket_size: int = 5,
) -> dict[str, object]:
    """Audit realized clone tax units against their blueprint targets.

    Looks up every clone household recorded in *augmentation_report* inside
    the realized *dataset* at *period*, then measures how far each realized
    unit drifted from its blueprint: age-bucket L1 distance plus SS and
    payroll percentage errors, aggregated per archetype and overall.

    Args:
        dataset: Dataset instance or identifier to summarize.
        period: Year at which to build the tax-unit summary.
        augmentation_report: Report containing ``clone_household_reports``.
        age_bucket_size: Bucket width for the age comparison.

    Returns:
        Dict with per-clone records, per-archetype aggregates, match counts,
        and aggregate/median error statistics.
    """
    clone_reports = augmentation_report.get("clone_household_reports", [])
    if not clone_reports:
        # No clones recorded: return an empty but well-formed summary.
        return {
            "clone_household_count": 0,
            "matched_clone_household_count": 0,
            "unmatched_clone_household_count": 0,
            "per_clone": [],
            "by_archetype": [],
        }

    realized_summary = build_tax_unit_summary(dataset, period=period)
    # NOTE(review): assumes tax_unit_id is unique in the realized summary;
    # .loc on a duplicated index would return a DataFrame, not a row — confirm.
    realized_by_tax_unit = realized_summary.set_index("tax_unit_id", drop=False)
    age_bins = build_age_bins(85, bucket_size=age_bucket_size)
    per_clone: list[dict[str, object]] = []

    for clone_report in clone_reports:
        # Blueprint ages: head, optional spouse, then dependents.
        target_ages = [int(clone_report["target_head_age"])]
        if clone_report.get("target_spouse_age") is not None:
            target_ages.append(int(clone_report["target_spouse_age"]))
        target_ages.extend(int(age) for age in clone_report["target_dependent_ages"])
        target_vector = age_bucket_vector(target_ages, age_bins)

        clone_tax_unit_id = int(clone_report["clone_tax_unit_id"])
        if clone_tax_unit_id not in realized_by_tax_unit.index:
            # Clone not found in the realized data: record as unmatched.
            per_clone.append(
                {
                    **clone_report,
                    "matched": False,
                    "realized_archetype": None,
                    "realized_ages": None,
                    "realized_ss_total": None,
                    "realized_payroll_total": None,
                    "age_bucket_l1": None,
                    "ss_pct_error": None,
                    "payroll_pct_error": None,
                }
            )
            continue

        realized_row = realized_by_tax_unit.loc[clone_tax_unit_id]
        realized_ages = _ages_from_summary_row(realized_row)
        realized_vector = age_bucket_vector(realized_ages, age_bins)
        realized_ss_total = float(realized_row["ss_total"])
        realized_payroll_total = float(realized_row["payroll_total"])
        target_ss_total = float(clone_report["target_ss_total"])
        target_payroll_total = float(clone_report["target_payroll_total"])
        per_clone.append(
            {
                **clone_report,
                "matched": True,
                "realized_archetype": realized_row["archetype"],
                "realized_ages": realized_ages,
                "realized_ss_total": realized_ss_total,
                "realized_payroll_total": realized_payroll_total,
                "realized_household_weight": float(
                    realized_row["household_weight_proxy"]
                ),
                "age_bucket_l1": float(np.abs(realized_vector - target_vector).sum()),
                # Zero targets report 0% error rather than dividing by ~0.
                "ss_pct_error": (
                    0.0
                    if abs(target_ss_total) < 1e-9
                    else (realized_ss_total - target_ss_total) / target_ss_total * 100
                ),
                "payroll_pct_error": (
                    0.0
                    if abs(target_payroll_total) < 1e-9
                    else (realized_payroll_total - target_payroll_total)
                    / target_payroll_total
                    * 100
                ),
            }
        )

    per_clone_df = pd.DataFrame(per_clone)
    matched_df = per_clone_df[per_clone_df["matched"]].copy()
    if matched_df.empty:
        # Every clone was unmatched: skip the aggregate statistics.
        return {
            "clone_household_count": int(len(per_clone_df)),
            "matched_clone_household_count": 0,
            "unmatched_clone_household_count": int(len(per_clone_df)),
            "per_clone": per_clone,
            "by_archetype": [],
        }

    by_archetype = (
        matched_df.groupby("archetype", sort=False)
        .agg(
            clone_household_count=("clone_tax_unit_id", "count"),
            avg_age_bucket_l1=("age_bucket_l1", "mean"),
            avg_ss_pct_error=("ss_pct_error", "mean"),
            avg_payroll_pct_error=("payroll_pct_error", "mean"),
        )
        .reset_index()
    )
    target_ss_total = float(matched_df["target_ss_total"].sum())
    target_payroll_total = float(matched_df["target_payroll_total"].sum())
    realized_ss_total = float(matched_df["realized_ss_total"].sum())
    realized_payroll_total = float(matched_df["realized_payroll_total"].sum())
    return {
        "clone_household_count": int(len(per_clone_df)),
        "matched_clone_household_count": int(len(matched_df)),
        "unmatched_clone_household_count": int(len(per_clone_df) - len(matched_df)),
        "target_ss_total": target_ss_total,
        "realized_ss_total": realized_ss_total,
        "aggregate_ss_pct_error": (
            0.0
            if abs(target_ss_total) < 1e-9
            else (realized_ss_total - target_ss_total) / target_ss_total * 100
        ),
        "target_payroll_total": target_payroll_total,
        "realized_payroll_total": realized_payroll_total,
        "aggregate_payroll_pct_error": (
            0.0
            if abs(target_payroll_total) < 1e-9
            else (realized_payroll_total - target_payroll_total)
            / target_payroll_total
            * 100
        ),
        "median_age_bucket_l1": float(matched_df["age_bucket_l1"].median()),
        "median_ss_pct_error": float(matched_df["ss_pct_error"].median()),
        "median_payroll_pct_error": float(matched_df["payroll_pct_error"].median()),
        # Ten worst over-estimates of each kind, for quick inspection.
        "top_ss_over": matched_df.sort_values("ss_pct_error", ascending=False)
        .head(10)
        .to_dict("records"),
        "top_payroll_over": matched_df.sort_values("payroll_pct_error", ascending=False)
        .head(10)
        .to_dict("records"),
        "by_archetype": by_archetype.to_dict("records"),
        "per_clone": per_clone,
    }
+
+
def build_synthetic_constraint_problem(
    candidates: list[SyntheticCandidate],
    *,
    year: int,
    baseline_weights: np.ndarray | None = None,
) -> dict[str, object]:
    """Assemble the age/SS/payroll calibration problem for *year*.

    Builds the age-bucket design matrix (one row per candidate, 5-year
    buckets), the candidates' SS and capped taxable-payroll value vectors,
    and the SSA projection targets they must hit.

    Args:
        candidates: Synthetic candidates to calibrate.
        year: Projection year for the payroll cap and SSA targets.
        baseline_weights: Optional prior weights; defaults to all-ones.

    Returns:
        Dict with keys ``age_bins``, ``aggregated_age_targets``, ``X``,
        ``ss_values``, ``ss_target``, ``payroll_values``, ``payroll_target``,
        ``payroll_cap`` and ``baseline_weights``.

    Raises:
        ValueError: If *baseline_weights* length differs from *candidates*.
    """
    payroll_cap = load_policyengine_social_security_cap(year)
    # Single-year age targets, aggregated into 5-year buckets.
    age_targets = load_ssa_age_projections(start_year=year, end_year=year)
    age_bins = build_age_bins(n_ages=age_targets.shape[0], bucket_size=5)
    aggregated_age_targets = aggregate_age_targets(age_targets, age_bins)[:, 0]
    X = np.vstack(
        [age_bucket_vector(candidate.ages(), age_bins) for candidate in candidates]
    )
    ss_values = np.array([candidate.ss_total for candidate in candidates], dtype=float)
    # Payroll is capped at the year's Social Security taxable maximum.
    payroll_values = np.array(
        [candidate.taxable_payroll_total(payroll_cap) for candidate in candidates],
        dtype=float,
    )
    if baseline_weights is None:
        baseline_weights = np.ones(len(candidates), dtype=float)
    else:
        baseline_weights = np.asarray(baseline_weights, dtype=float)
        if len(baseline_weights) != len(candidates):
            raise ValueError("baseline_weights must align with candidates")
    return {
        "age_bins": age_bins,
        "aggregated_age_targets": aggregated_age_targets,
        "X": X,
        "ss_values": ss_values,
        "ss_target": float(load_ssa_benefit_projections(year)),
        "payroll_values": payroll_values,
        "payroll_target": float(load_taxable_payroll_projections(year)),
        "payroll_cap": float(payroll_cap),
        "baseline_weights": baseline_weights,
    }
+
+
def build_constraint_matrix(
    problem: dict[str, object],
) -> tuple[np.ndarray, np.ndarray]:
    """Stack a problem dict into a constraint system ``(A, b)``.

    Columns of ``A`` are the age-bucket indicators followed by the SS and
    payroll value columns; ``b`` is the matching target vector.
    """
    A = np.column_stack(
        (problem["X"], problem["ss_values"], problem["payroll_values"])
    )
    scalar_targets = np.array(
        [problem["ss_target"], problem["payroll_target"]], dtype=float
    )
    b = np.concatenate([problem["aggregated_age_targets"], scalar_targets])
    return A, b
+
+
def build_scaled_actual_summary(
    actual_summary: pd.DataFrame,
    *,
    ss_scale: float,
    earnings_scale: float,
) -> pd.DataFrame:
    """Return a copy of *actual_summary* with uprated helper columns.

    Adds ``scaled_*`` payroll columns (times *earnings_scale*) and
    ``scaled_*`` SS columns (times *ss_scale*), plus ``spouse_present``
    and ``spouse_age_filled`` (missing spouse encoded as -1).
    """
    enriched = actual_summary.copy()
    for column in ("head_payroll", "spouse_payroll", "payroll_total"):
        enriched[f"scaled_{column}"] = enriched[column] * earnings_scale
    for column in ("head_ss", "spouse_ss", "ss_total"):
        enriched[f"scaled_{column}"] = enriched[column] * ss_scale
    enriched["spouse_present"] = enriched["spouse_age"].notna()
    enriched["spouse_age_filled"] = enriched["spouse_age"].fillna(-1)
    return enriched
+
+
def _target_head_payroll_share(candidate: SyntheticCandidate) -> float:
    """Head's share of the target's payroll total.

    Falls back to an even split for couples, or the whole amount for
    singles, when the ratio is not well defined.
    """
    fallback = 1.0 if candidate.spouse_age is None else 0.5
    return _safe_split(candidate.head_wages, candidate.payroll_total, fallback)
+
+
def _target_head_ss_share(candidate: SyntheticCandidate) -> float:
    """Head's share of the target's Social Security total.

    Falls back to an even split for couples, or the whole amount for
    singles, when the ratio is not well defined.
    """
    fallback = 1.0 if candidate.spouse_age is None else 0.5
    return _safe_split(candidate.head_ss, candidate.ss_total, fallback)
+
+
def _target_worker_age(candidate: SyntheticCandidate) -> float:
    """Age of the adult with the larger target wages (head wins ties)."""
    spouse_out_earns_head = (
        candidate.spouse_age is not None
        and candidate.spouse_wages > candidate.head_wages
    )
    if spouse_out_earns_head:
        return float(candidate.spouse_age)
    return float(candidate.head_age)
+
+
def match_older_role_donors(
    target_candidate: SyntheticCandidate,
    scaled_actual_summary: pd.DataFrame,
    *,
    donors_per_target: int,
) -> pd.DataFrame:
    """Rank real tax units as donors for the target's older/SS role.

    Donors must have positive Social Security income and a head aged 55+.
    The distance mixes head/spouse age gaps, the log-scale SS total gap,
    the head's SS-share gap, and a 0.15 penalty for donors whose archetype
    is NOT one of the older/retiree archetypes. Returns the
    *donors_per_target* nearest donors (possibly empty).
    """
    subset = scaled_actual_summary[
        (scaled_actual_summary["ss_total"] > 0)
        & (scaled_actual_summary["head_age"] >= 55)
    ].copy()
    if subset.empty:
        return subset
    # A missing or under-65 target spouse is treated as absent (-1), matching
    # the -1 encoding of `spouse_age_filled`.
    target_spouse_age = (
        -1
        if target_candidate.spouse_age is None or target_candidate.spouse_age < 65
        else target_candidate.spouse_age
    )
    target_head_ss_share = _target_head_ss_share(target_candidate)
    donor_head_ss_share = _safe_series_split(
        subset["scaled_head_ss"],
        subset["scaled_ss_total"],
        target_head_ss_share,
    )
    subset["distance"] = (
        (subset["head_age"] - target_candidate.head_age).abs() / 5.0
        + (subset["spouse_age_filled"] - target_spouse_age).abs() / 7.5
        + np.abs(
            np.log1p(subset["scaled_ss_total"]) - np.log1p(target_candidate.ss_total)
        )
        + 0.5 * np.abs(donor_head_ss_share - target_head_ss_share)
        # `.rsub(1)` flips the membership indicator: the 0.15 penalty applies
        # to donors OUTSIDE the older/retiree archetype set.
        + 0.15
        * subset["archetype"]
        .isin(
            {
                "older_beneficiary_single",
                "older_beneficiary_couple",
                "older_worker_single",
                "older_worker_couple",
                "mixed_retiree_worker_couple",
                "older_plus_prime_worker_family",
            }
        )
        .rsub(1)
        .astype(float)
    )
    return subset.nsmallest(donors_per_target, "distance").copy()
+
+
def match_worker_role_donors(
    target_candidate: SyntheticCandidate,
    scaled_actual_summary: pd.DataFrame,
    *,
    donors_per_target: int,
) -> pd.DataFrame:
    """Rank real tax units as donors for the target's worker role.

    Donors must have positive payroll. The pool is narrowed — only when the
    narrowing leaves at least one row — first to units with the target's
    dependent count, then to units with a spouse present (when the target
    has one). The distance mixes worker-age gap, dependent-count gap,
    log-scale payroll gap, head payroll-share gap, and a spouse-presence
    mismatch penalty. Returns the *donors_per_target* nearest donors.
    """
    subset = scaled_actual_summary[scaled_actual_summary["payroll_total"] > 0].copy()
    target_dependent_count = len(target_candidate.dependent_ages)
    target_spouse_present = target_candidate.spouse_age is not None
    if target_dependent_count > 0:
        # Prefer exact dependent-count matches if any exist.
        family_subset = subset[subset["dependent_count"] == target_dependent_count]
        if not family_subset.empty:
            subset = family_subset.copy()
    if target_spouse_present:
        # Prefer couples when the target has a spouse.
        spouse_subset = subset[subset["spouse_present"]]
        if not spouse_subset.empty:
            subset = spouse_subset.copy()
    target_worker_age = _target_worker_age(target_candidate)
    target_head_payroll_share = _target_head_payroll_share(target_candidate)
    donor_head_payroll_share = _safe_series_split(
        subset["scaled_head_payroll"],
        subset["scaled_payroll_total"],
        target_head_payroll_share,
    )
    subset["distance"] = (
        (subset["head_age"] - target_worker_age).abs() / 5.0
        + (subset["dependent_count"] - target_dependent_count).abs() * 0.75
        + np.abs(
            np.log1p(subset["scaled_payroll_total"])
            - np.log1p(target_candidate.payroll_total)
        )
        + 0.5 * np.abs(donor_head_payroll_share - target_head_payroll_share)
        + 0.25 * subset["spouse_present"].ne(target_spouse_present).astype(float)
    )
    return subset.nsmallest(donors_per_target, "distance").copy()
+
+
def build_role_donor_composite_candidate(
    target_candidate: SyntheticCandidate,
    *,
    older_donor_row: pd.Series | None,
    worker_donor_row: pd.Series | None,
    earnings_scale: float,
) -> SyntheticCandidate:
    """Compose a synthetic candidate from role-donor income splits.

    Keeps the target's demographics and payroll/SS totals, but borrows the
    head/spouse split of payroll (from the worker donor) and of SS (from
    the older donor) whenever the target actually splits that income across
    both adults; otherwise the target's own split is kept. Pension and
    dividend income are carried over from the older donor and — only when
    the target has no SS income — from the worker donor, scaled by
    *earnings_scale* (presumably to avoid double counting when an older
    donor is also present — TODO confirm).
    """
    target_head_payroll_share = _target_head_payroll_share(target_candidate)
    target_head_ss_share = _target_head_ss_share(target_candidate)
    if (
        worker_donor_row is not None
        and target_candidate.head_wages > 0
        and target_candidate.spouse_wages > 0
    ):
        head_payroll_share = _safe_split(
            float(worker_donor_row["scaled_head_payroll"]),
            float(worker_donor_row["scaled_payroll_total"]),
            target_head_payroll_share,
        )
    else:
        head_payroll_share = target_head_payroll_share
    if (
        older_donor_row is not None
        and target_candidate.head_ss > 0
        and target_candidate.spouse_ss > 0
    ):
        head_ss_share = _safe_split(
            float(older_donor_row["scaled_head_ss"]),
            float(older_donor_row["scaled_ss_total"]),
            target_head_ss_share,
        )
    else:
        head_ss_share = target_head_ss_share

    payroll_total = target_candidate.payroll_total
    ss_total = target_candidate.ss_total
    pension_income = 0.0
    dividend_income = 0.0
    if older_donor_row is not None:
        pension_income += float(older_donor_row["pension_income"]) * earnings_scale
        dividend_income += float(older_donor_row["dividend_income"]) * earnings_scale
    if worker_donor_row is not None and target_candidate.ss_total <= 0:
        pension_income += float(worker_donor_row["pension_income"]) * earnings_scale
        dividend_income += float(worker_donor_row["dividend_income"]) * earnings_scale

    return SyntheticCandidate(
        archetype=f"{target_candidate.archetype}_role_donor",
        head_age=target_candidate.head_age,
        spouse_age=target_candidate.spouse_age,
        dependent_ages=target_candidate.dependent_ages,
        # Totals are preserved; only the head/spouse split changes.
        head_wages=payroll_total * head_payroll_share,
        spouse_wages=payroll_total * (1.0 - head_payroll_share),
        head_ss=ss_total * head_ss_share,
        spouse_ss=ss_total * (1.0 - head_ss_share),
        pension_income=pension_income,
        dividend_income=dividend_income,
    )
+
+
def build_role_donor_composites(
    candidates: list[SyntheticCandidate],
    weights: np.ndarray,
    actual_summary: pd.DataFrame,
    *,
    ss_scale: float,
    earnings_scale: float,
    top_n_targets: int,
    older_donors_per_target: int,
    worker_donors_per_target: int,
    max_older_distance: float = 3.0,
    max_worker_distance: float = 3.0,
) -> tuple[list[SyntheticCandidate], np.ndarray, dict[str, object]]:
    """Expand top synthetic targets into donor-backed composite candidates.

    For each of the *top_n_targets* heaviest positive-weight targets,
    matches older-role donors (when the target has SS income) and
    worker-role donors (when it has payroll), forms the cross product of
    usable matches, and emits one composite candidate per combination with
    the target's weight split evenly across them. Targets without any
    donor inside the distance caps are skipped and reported.

    Returns:
        ``(composite_candidates, prior_weights, diagnostics)`` where
        diagnostics records the matcher settings, skipped targets,
        per-composite match records, and a prior-solution summary.
    """
    exact_df = summarize_exact_candidates(candidates, weights)
    target_df = exact_df[exact_df["synthetic_weight"] > 0].head(top_n_targets).copy()
    scaled_actual = build_scaled_actual_summary(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )

    composite_candidates: list[SyntheticCandidate] = []
    composite_weights: list[float] = []
    composite_records: list[dict[str, object]] = []
    skipped_targets: list[dict[str, object]] = []

    total_weight = max(float(weights.sum()), 1.0)

    for _, target_row in target_df.iterrows():
        target_candidate = candidates[int(target_row["candidate_idx"])]
        # [None] means "this role is not needed for the target".
        older_donors = [None]
        if target_candidate.ss_total > 0:
            matched = match_older_role_donors(
                target_candidate,
                scaled_actual,
                donors_per_target=older_donors_per_target,
            )
            # Drop matches beyond the cap; skip the target if none remain.
            usable = matched[matched["distance"] <= max_older_distance].copy()
            if usable.empty:
                skipped_targets.append(
                    {
                        "candidate_idx": int(target_row["candidate_idx"]),
                        "archetype": target_candidate.archetype,
                        "reason": "no_older_donor",
                        "best_distance": float(matched["distance"].min())
                        if not matched.empty
                        else None,
                    }
                )
                continue
            older_donors = [row for _, row in usable.iterrows()]

        worker_donors = [None]
        if target_candidate.payroll_total > 0:
            matched = match_worker_role_donors(
                target_candidate,
                scaled_actual,
                donors_per_target=worker_donors_per_target,
            )
            usable = matched[matched["distance"] <= max_worker_distance].copy()
            if usable.empty:
                skipped_targets.append(
                    {
                        "candidate_idx": int(target_row["candidate_idx"]),
                        "archetype": target_candidate.archetype,
                        "reason": "no_worker_donor",
                        "best_distance": float(matched["distance"].min())
                        if not matched.empty
                        else None,
                    }
                )
                continue
            worker_donors = [row for _, row in usable.iterrows()]

        # Split the target's synthetic weight evenly across combinations.
        target_weight = float(target_row["synthetic_weight"])
        combination_count = max(len(older_donors) * len(worker_donors), 1)
        per_candidate_weight = target_weight / combination_count
        for older_donor in older_donors:
            for worker_donor in worker_donors:
                composite_candidates.append(
                    build_role_donor_composite_candidate(
                        target_candidate,
                        older_donor_row=older_donor,
                        worker_donor_row=worker_donor,
                        earnings_scale=earnings_scale,
                    )
                )
                composite_weights.append(per_candidate_weight)
                composite_records.append(
                    {
                        "composite_idx": int(len(composite_candidates) - 1),
                        "candidate_idx": int(target_row["candidate_idx"]),
                        "archetype": target_candidate.archetype,
                        "older_tax_unit_id": (
                            None
                            if older_donor is None
                            else int(older_donor["tax_unit_id"])
                        ),
                        "worker_tax_unit_id": (
                            None
                            if worker_donor is None
                            else int(worker_donor["tax_unit_id"])
                        ),
                        "older_distance": (
                            None
                            if older_donor is None
                            else float(older_donor["distance"])
                        ),
                        "worker_distance": (
                            None
                            if worker_donor is None
                            else float(worker_donor["distance"])
                        ),
                        "assigned_weight_share_pct": float(
                            per_candidate_weight / total_weight * 100
                        ),
                    }
                )

    prior_weights = np.asarray(composite_weights, dtype=float)
    probe_summary = summarize_solution(
        composite_candidates,
        prior_weights,
        actual_summary,
    )
    return (
        composite_candidates,
        prior_weights,
        {
            "top_n_targets": int(top_n_targets),
            "older_donors_per_target": int(older_donors_per_target),
            "worker_donors_per_target": int(worker_donors_per_target),
            "max_older_distance": float(max_older_distance),
            "max_worker_distance": float(max_worker_distance),
            "skipped_targets": skipped_targets,
            "composite_records": composite_records,
            "prior_summary": probe_summary,
        },
    )
+
+
def summarize_exact_candidates(
    candidates: list[SyntheticCandidate],
    weights: np.ndarray,
) -> pd.DataFrame:
    """Tabulate candidates with their weights, sorted heaviest-first.

    Adds derived columns (dependent count, payroll/SS totals) and each
    candidate's percentage share of the total synthetic weight.
    NOTE(review): assumes at least one candidate — confirm callers.
    """
    records = [
        {
            "candidate_idx": idx,
            **asdict(candidate),
            "dependent_count": len(candidate.dependent_ages),
            "payroll_total": candidate.payroll_total,
            "ss_total": candidate.ss_total,
            "synthetic_weight": float(weight),
        }
        for idx, (candidate, weight) in enumerate(zip(candidates, weights))
    ]
    table = pd.DataFrame(records).sort_values(
        "synthetic_weight",
        ascending=False,
    )
    denominator = max(float(table["synthetic_weight"].sum()), 1.0)
    table["weight_share_pct"] = table["synthetic_weight"] / denominator * 100
    return table
+
+
def match_real_donors_for_target(
    target_row: pd.Series,
    scaled_actual_summary: pd.DataFrame,
    *,
    donors_per_target: int,
) -> pd.DataFrame:
    """Find the nearest real donor tax units for one synthetic target.

    Filters the scaled actual summary to rows with matching spouse
    presence, adult count and dependent count (relaxing to at-least-as-many
    adults, then to the full table, when a filter empties the pool), scores
    each donor with a mixed structural/financial distance, and returns the
    *donors_per_target* closest rows annotated with target metadata and the
    age shifts needed to transform each donor into the target.

    Bug fix: the spouse-age shift used ``np.where`` over
    ``target_spouse_age - spouse_age_filled``; with no target spouse that
    column holds ``None`` (object dtype) and the subtraction raised
    ``TypeError`` before ``np.where`` could mask it, because ``np.where``
    evaluates both branches eagerly. The shift is now only computed when a
    target spouse exists, and is NaN otherwise.
    """
    target_spouse_present = pd.notna(target_row["spouse_age"])
    # Absent spouse is encoded as -1 to mirror `spouse_age_filled`.
    target_spouse_age = -1 if not target_spouse_present else target_row["spouse_age"]
    target_adult_count = 2 if target_spouse_present else 1
    subset = scaled_actual_summary[
        scaled_actual_summary["spouse_present"].eq(target_spouse_present)
        & scaled_actual_summary["adult_count"].eq(target_adult_count)
        & scaled_actual_summary["dependent_count"].eq(
            int(target_row["dependent_count"])
        )
    ].copy()
    if subset.empty:
        # Relax to any unit with at least as many adults.
        subset = scaled_actual_summary[
            scaled_actual_summary["adult_count"].ge(target_adult_count)
        ].copy()
    if subset.empty:
        subset = scaled_actual_summary.copy()
    # Distance: head/spouse age gaps, dependent-count gap, log-scale payroll
    # and SS gaps, plus a small penalty for an archetype mismatch.
    subset["distance"] = (
        (subset["head_age"] - target_row["head_age"]).abs() / 5.0
        + (subset["spouse_age_filled"] - target_spouse_age).abs() / 5.0
        + (subset["dependent_count"] - target_row["dependent_count"]).abs() * 0.75
        + np.abs(
            np.log1p(subset["scaled_payroll_total"])
            - np.log1p(float(target_row["payroll_total"]))
        )
        + np.abs(
            np.log1p(subset["scaled_ss_total"])
            - np.log1p(float(target_row["ss_total"]))
        )
        + 0.25 * subset["archetype"].ne(target_row["archetype"]).astype(float)
    )
    nearest = subset.nsmallest(donors_per_target, "distance").copy()
    nearest["target_candidate_idx"] = int(target_row["candidate_idx"])
    nearest["target_archetype"] = target_row["archetype"]
    nearest["target_weight_share_pct"] = float(target_row["weight_share_pct"])
    nearest["target_head_age"] = int(target_row["head_age"])
    nearest["target_spouse_age"] = (
        None if not target_spouse_present else int(target_row["spouse_age"])
    )
    nearest["target_dependent_count"] = int(target_row["dependent_count"])
    nearest["target_payroll_total"] = float(target_row["payroll_total"])
    nearest["target_ss_total"] = float(target_row["ss_total"])
    nearest["required_head_age_shift"] = (
        nearest["target_head_age"] - nearest["head_age"]
    )
    if target_spouse_present:
        nearest["required_spouse_age_shift"] = (
            nearest["target_spouse_age"] - nearest["spouse_age_filled"]
        )
    else:
        # No target spouse: no meaningful shift.
        nearest["required_spouse_age_shift"] = np.nan
    return nearest
+
+
def summarize_donor_probe(
    candidates: list[SyntheticCandidate],
    weights: np.ndarray,
    actual_summary: pd.DataFrame,
    *,
    ss_scale: float,
    earnings_scale: float,
    top_n_targets: int,
    donors_per_target: int,
) -> dict[str, object]:
    """Probe how well real tax units can stand in for synthetic targets.

    Matches each of the *top_n_targets* heaviest positive-weight synthetic
    candidates to its *donors_per_target* nearest real donors (after
    scaling the actual summary), then summarizes best-match distances and
    flags targets whose best donor is farther than 3.0.
    """
    exact_df = summarize_exact_candidates(candidates, weights)
    target_df = exact_df[exact_df["synthetic_weight"] > 0].head(top_n_targets).copy()
    scaled_actual = build_scaled_actual_summary(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )
    donor_matches = []
    for _, target_row in target_df.iterrows():
        donor_matches.append(
            match_real_donors_for_target(
                target_row,
                scaled_actual,
                donors_per_target=donors_per_target,
            )
        )
    # NOTE(review): pd.concat raises on an empty list — assumes at least one
    # target with positive weight; confirm callers guarantee this.
    donor_df = pd.concat(donor_matches, ignore_index=True)
    # Keep only the single closest donor per target for the distance stats.
    nearest_only = (
        donor_df.sort_values(
            ["target_candidate_idx", "distance"],
            ascending=[True, True],
        )
        .groupby("target_candidate_idx", as_index=False)
        .first()
    )
    distance_summary = {
        "median_best_distance": float(nearest_only["distance"].median()),
        "targets_with_best_distance_le_1": int((nearest_only["distance"] <= 1.0).sum()),
        "targets_with_best_distance_le_2": int((nearest_only["distance"] <= 2.0).sum()),
        "targets_with_best_distance_gt_3": int((nearest_only["distance"] > 3.0).sum()),
    }
    outlier_targets = nearest_only[nearest_only["distance"] > 3.0].copy()
    return {
        "top_n_targets": int(top_n_targets),
        "donors_per_target": int(donors_per_target),
        "distance_summary": distance_summary,
        "nearest_targets": nearest_only[
            [
                "target_candidate_idx",
                "target_archetype",
                "target_weight_share_pct",
                "target_head_age",
                "target_spouse_age",
                "target_dependent_count",
                "target_payroll_total",
                "target_ss_total",
                "tax_unit_id",
                "archetype",
                "head_age",
                "spouse_age",
                "dependent_count",
                "scaled_payroll_total",
                "scaled_ss_total",
                "required_head_age_shift",
                "required_spouse_age_shift",
                "distance",
            ]
        ].to_dict("records"),
        "outlier_targets": outlier_targets[
            [
                "target_candidate_idx",
                "target_archetype",
                "target_weight_share_pct",
                "target_head_age",
                "target_spouse_age",
                "target_dependent_count",
                "target_payroll_total",
                "target_ss_total",
                "distance",
            ]
        ].to_dict("records"),
    }
+
+
+def _next_entity_id(values: pd.Series) -> int:
+ non_null = values.dropna()
+ if non_null.empty:
+ return 1
+ return int(non_null.max()) + 1
+
+
+def _cast_mapped_ids(series: pd.Series, mapped: pd.Series) -> pd.Series:
+ dtype = series.dtype
+ if pd.api.types.is_integer_dtype(dtype):
+ return mapped.astype(dtype)
+ if pd.api.types.is_float_dtype(dtype):
+ return mapped.astype(dtype)
+ return mapped
+
+
+def _scale_person_components(
+ row: pd.Series,
+ columns: tuple[str, ...],
+ target_total: float,
+) -> pd.Series:
+ available = [column for column in columns if column in row.index]
+ if not available:
+ return row
+ target_total = float(target_total)
+ if target_total <= 0:
+ for column in available:
+ row[column] = 0.0
+ return row
+ current_total = float(sum(float(row[column]) for column in available))
+ if current_total > 0:
+ scale = target_total / current_total
+ for column in available:
+ row[column] = float(row[column]) * scale
+ return row
+ row[available[0]] = target_total
+ for column in available[1:]:
+ row[column] = 0.0
+ return row
+
+
+def _target_base_total_for_row(
+ row: pd.Series,
+ *,
+ target_total: float,
+ factor_column: str,
+ fallback_factor: float,
+) -> float:
+ target_total = float(target_total)
+ if target_total <= 0:
+ return 0.0
+ factor = row.get(factor_column, np.nan)
+ if pd.isna(factor) or float(factor) <= 0:
+ factor = fallback_factor
+ return target_total / max(float(factor), 1e-12)
+
+
def _clone_tax_unit_rows_to_target(
    donor_rows: pd.DataFrame,
    *,
    base_year: int,
    target_candidate: SyntheticCandidate,
    ss_scale: float,
    earnings_scale: float,
    id_counters: dict[str, int],
    clone_weight_scale: float,
    clone_weight_divisor: int,
) -> tuple[pd.DataFrame, dict[str, int]] | tuple[None, dict[str, int]]:
    """Clone a donor tax unit's person rows into a new blueprint household.

    Assigns fresh entity ids, rescales household/person weights by
    ``clone_weight_scale / clone_weight_divisor``, overwrites ages with the
    target blueprint, and rescales each adult's payroll/SS components so
    their base-year totals de-uprate to the target amounts. Dependents'
    payroll/SS and all QBI wages are zeroed.

    Returns:
        ``(cloned_rows, id_counters)`` on success, or ``(None, id_counters)``
        when the donor's adult/dependent structure cannot host the target.
    """
    age_col = _period_column("age", base_year)
    household_weight_col = _period_column("household_weight", base_year)
    person_weight_col = _period_column("person_weight", base_year)
    person_id_col = _period_column(PERSON_ID_COLUMN, base_year)

    # Adults/dependents split at 18; both groups sorted oldest-first so the
    # head is the oldest adult.
    adults = donor_rows[donor_rows[age_col] >= 18].sort_values(age_col, ascending=False)
    dependents = donor_rows[donor_rows[age_col] < 18].sort_values(
        age_col, ascending=False
    )
    target_has_spouse = target_candidate.spouse_age is not None
    target_adult_count = 2 if target_has_spouse else 1
    # The donor must supply enough adults and exactly the target's dependents.
    if len(adults) < target_adult_count or len(dependents) != len(
        target_candidate.dependent_ages
    ):
        return None, id_counters

    cloned = donor_rows.copy()
    # Allocate fresh ids for every entity so the clone cannot collide with
    # the donor; counters are advanced in place and returned.
    household_id = id_counters["household"]
    id_counters["household"] += 1
    for entity_name, columns in ENTITY_ID_COLUMNS.items():
        entity_id = id_counters[entity_name]
        id_counters[entity_name] += 1
        for raw_column in columns:
            column = _period_column(raw_column, base_year)
            if column in cloned.columns:
                cloned[column] = (
                    entity_id if entity_name != "household" else household_id
                )
    cloned[_period_column("household_id", base_year)] = household_id
    cloned[_period_column("person_household_id", base_year)] = household_id

    person_ids = range(id_counters["person"], id_counters["person"] + len(cloned))
    id_counters["person"] += len(cloned)
    cloned[person_id_col] = _cast_mapped_ids(
        cloned[person_id_col],
        pd.Series(list(person_ids), index=cloned.index),
    )

    # Spread the clone's weight across the divisor (e.g. donor combinations).
    if household_weight_col in cloned.columns:
        cloned[household_weight_col] = (
            cloned[household_weight_col].astype(float)
            * clone_weight_scale
            / max(clone_weight_divisor, 1)
        )
    if person_weight_col in cloned.columns:
        cloned[person_weight_col] = (
            cloned[person_weight_col].astype(float)
            * clone_weight_scale
            / max(clone_weight_divisor, 1)
        )

    adult_indices = adults.index.tolist()
    head_idx = adult_indices[0]
    spouse_idx = adult_indices[1] if target_has_spouse else None
    dependent_indices = dependents.index.tolist()

    # Overwrite ages with the blueprint's head/spouse/dependent ages.
    cloned.loc[head_idx, age_col] = float(target_candidate.head_age)
    if spouse_idx is not None:
        cloned.loc[spouse_idx, age_col] = float(target_candidate.spouse_age)
    for dep_idx, dep_age in zip(dependent_indices, target_candidate.dependent_ages):
        cloned.loc[dep_idx, age_col] = float(dep_age)

    payroll_columns = tuple(
        _period_column(component, base_year) for component in PAYROLL_COMPONENTS
    )
    ss_columns = tuple(
        _period_column(component, base_year) for component in SS_COMPONENTS
    )
    qbi_col = _period_column("w2_wages_from_qualified_business", base_year)

    # Convert target-year amounts to base-year totals via per-row uprating
    # factors (falling back to the global scales) before scaling components.
    target_head_payroll = _target_base_total_for_row(
        cloned.loc[head_idx],
        target_total=float(target_candidate.head_wages),
        factor_column=PAYROLL_UPRATING_FACTOR_COLUMN,
        fallback_factor=earnings_scale,
    )
    target_spouse_payroll = _target_base_total_for_row(
        cloned.loc[spouse_idx] if spouse_idx is not None else pd.Series(dtype=float),
        target_total=float(target_candidate.spouse_wages),
        factor_column=PAYROLL_UPRATING_FACTOR_COLUMN,
        fallback_factor=earnings_scale,
    )
    target_head_ss = _target_base_total_for_row(
        cloned.loc[head_idx],
        target_total=float(target_candidate.head_ss),
        factor_column=SS_UPRATING_FACTOR_COLUMN,
        fallback_factor=ss_scale,
    )
    target_spouse_ss = _target_base_total_for_row(
        cloned.loc[spouse_idx] if spouse_idx is not None else pd.Series(dtype=float),
        target_total=float(target_candidate.spouse_ss),
        factor_column=SS_UPRATING_FACTOR_COLUMN,
        fallback_factor=ss_scale,
    )

    cloned.loc[head_idx] = _scale_person_components(
        cloned.loc[head_idx].copy(),
        payroll_columns,
        target_head_payroll,
    )
    cloned.loc[head_idx] = _scale_person_components(
        cloned.loc[head_idx].copy(),
        ss_columns,
        target_head_ss,
    )
    if spouse_idx is not None:
        cloned.loc[spouse_idx] = _scale_person_components(
            cloned.loc[spouse_idx].copy(),
            payroll_columns,
            target_spouse_payroll,
        )
        cloned.loc[spouse_idx] = _scale_person_components(
            cloned.loc[spouse_idx].copy(),
            ss_columns,
            target_spouse_ss,
        )

    # Dependents carry no payroll/SS in the blueprint.
    for dep_idx in dependent_indices:
        cloned.loc[dep_idx] = _scale_person_components(
            cloned.loc[dep_idx].copy(),
            payroll_columns,
            0.0,
        )
        cloned.loc[dep_idx] = _scale_person_components(
            cloned.loc[dep_idx].copy(),
            ss_columns,
            0.0,
        )
        if qbi_col in cloned.columns:
            cloned.loc[dep_idx, qbi_col] = 0.0

    # QBI W-2 wages are zeroed for adults too — the blueprint does not
    # model them, so leaving the donor's values would leak income.
    if qbi_col in cloned.columns and head_idx in cloned.index:
        cloned.loc[head_idx, qbi_col] = 0.0
        if spouse_idx is not None:
            cloned.loc[spouse_idx, qbi_col] = 0.0

    return cloned, id_counters
+
+
def _compose_role_donor_rows_to_target(
    older_donor_rows: pd.DataFrame | None,
    worker_donor_rows: pd.DataFrame | None,
    *,
    base_year: int,
    target_candidate: SyntheticCandidate,
    ss_scale: float,
    earnings_scale: float,
    id_counters: dict[str, int],
    clone_weight_scale: float,
    clone_weight_divisor: int,
) -> tuple[pd.DataFrame, dict[str, int]] | tuple[None, dict[str, int]]:
    """Compose a synthetic household by splicing person rows from two donors.

    Picks a head (and, if the target has one, a spouse and dependents) from an
    "older" donor tax unit and/or a "worker" donor tax unit, stamps fresh
    entity ids, scales weights, and retargets ages, payroll components, and
    Social Security components to match ``target_candidate``.

    Args:
        older_donor_rows: Person rows of the older donor tax unit (may be None).
        worker_donor_rows: Person rows of the worker donor tax unit (may be None).
        base_year: Year suffix used to resolve period-specific column names.
        target_candidate: Synthetic household the clone should resemble.
        ss_scale / earnings_scale: Aggregate uprating fallbacks when no
            per-person uprating factor is available.
        id_counters: Mutable next-id counters per entity; advanced in place.
        clone_weight_scale: Multiplier applied to donor weights.
        clone_weight_divisor: Divisor splitting weight across sibling clones.

    Returns:
        ``(clone_df, id_counters)`` on success, or ``(None, id_counters)`` when
        no usable head/spouse/dependent rows can be assembled.
    """
    age_col = _period_column("age", base_year)
    household_weight_col = _period_column("household_weight", base_year)
    person_weight_col = _period_column("person_weight", base_year)
    person_id_col = _period_column(PERSON_ID_COLUMN, base_year)

    def _adult_rows(df: pd.DataFrame | None) -> pd.DataFrame:
        # Adults (18+), sorted oldest first.
        if df is None:
            return pd.DataFrame(columns=[] if df is None else df.columns)
        return df[df[age_col] >= 18].sort_values(age_col, ascending=False)

    def _dependent_rows(df: pd.DataFrame | None) -> pd.DataFrame:
        # Minors (<18), sorted oldest first.
        if df is None:
            return pd.DataFrame(columns=[] if df is None else df.columns)
        return df[df[age_col] < 18].sort_values(age_col, ascending=False)

    older_adults = _adult_rows(older_donor_rows)
    worker_adults = _adult_rows(worker_donor_rows)
    worker_dependents = _dependent_rows(worker_donor_rows)

    # Median per-person uprating factor from each donor pool; NaN when the
    # pool is empty or the factor column is absent.
    worker_payroll_factor = (
        float(
            np.nanmedian(
                worker_adults[PAYROLL_UPRATING_FACTOR_COLUMN].astype(float).to_numpy()
            )
        )
        if not worker_adults.empty
        and PAYROLL_UPRATING_FACTOR_COLUMN in worker_adults.columns
        else np.nan
    )
    older_ss_factor = (
        float(
            np.nanmedian(
                older_adults[SS_UPRATING_FACTOR_COLUMN].astype(float).to_numpy()
            )
        )
        if not older_adults.empty and SS_UPRATING_FACTOR_COLUMN in older_adults.columns
        else np.nan
    )
    # Fall back to the aggregate scales when the donor-derived factor is
    # unusable (NaN or non-positive).
    payroll_reference_factor = (
        worker_payroll_factor
        if np.isfinite(worker_payroll_factor) and worker_payroll_factor > 0
        else earnings_scale
    )
    ss_reference_factor = (
        older_ss_factor
        if np.isfinite(older_ss_factor) and older_ss_factor > 0
        else ss_scale
    )

    selected_rows: list[pd.Series] = []
    # Heads with Social Security income or age 65+ are drawn from the older
    # donor when possible; otherwise from the worker donor.
    head_target_older = target_candidate.head_ss > 0 or target_candidate.head_age >= 65
    head_source_rows = (
        older_adults if head_target_older and not older_adults.empty else worker_adults
    )
    if head_source_rows.empty:
        return None, id_counters
    head_row = head_source_rows.iloc[0].copy()
    selected_rows.append(head_row)

    spouse_row = None
    if target_candidate.spouse_age is not None:
        if target_candidate.spouse_age >= 65 and len(older_adults) >= 2:
            spouse_row = older_adults.iloc[1].copy()
        elif not worker_adults.empty:
            # Avoid re-using the head's row as the spouse when the head was
            # taken from the worker pool.
            worker_candidates = (
                worker_adults.iloc[1:]
                if worker_adults.index[0] == head_row.name
                else worker_adults
            )
            if worker_candidates.empty:
                worker_candidates = worker_adults
            # Closest-age worker becomes the spouse.
            spouse_idx = (
                (worker_candidates[age_col] - target_candidate.spouse_age)
                .abs()
                .idxmin()
            )
            spouse_row = worker_candidates.loc[spouse_idx].copy()
        elif len(older_adults) >= 2:
            spouse_row = older_adults.iloc[1].copy()
        if spouse_row is None:
            fallback_spouse_pool = (
                worker_adults if not worker_adults.empty else older_adults
            )
            if fallback_spouse_pool.empty:
                return None, id_counters
            spouse_row = fallback_spouse_pool.iloc[0].copy()
        selected_rows.append(spouse_row)

    if len(target_candidate.dependent_ages) > 0:
        dependent_rows = [row.copy() for _, row in worker_dependents.iterrows()]
        if not dependent_rows:
            # No minors in the worker donor: use the youngest available person
            # from either donor as a template.
            fallback_source = None
            if worker_donor_rows is not None and not worker_donor_rows.empty:
                fallback_source = (
                    worker_donor_rows.sort_values(age_col, ascending=True)
                    .iloc[0]
                    .copy()
                )
            elif older_donor_rows is not None and not older_donor_rows.empty:
                fallback_source = (
                    older_donor_rows.sort_values(age_col, ascending=True).iloc[0].copy()
                )
            if fallback_source is None:
                return None, id_counters
            dependent_rows = [fallback_source.copy()]
        # Duplicate the last dependent template until the target count is met.
        while len(dependent_rows) < len(target_candidate.dependent_ages):
            dependent_rows.append(dependent_rows[-1].copy())
        selected_rows.extend(dependent_rows[: len(target_candidate.dependent_ages)])

    # Reset duplicate donor indices so later row-specific retargeting only touches
    # the intended clone row.
    cloned = pd.DataFrame(selected_rows).reset_index(drop=True).copy()
    household_id = id_counters["household"]
    id_counters["household"] += 1
    # Assign fresh entity ids (tax unit, SPM unit, family, ...) per entity type.
    for entity_name, columns in ENTITY_ID_COLUMNS.items():
        entity_id = id_counters[entity_name]
        id_counters[entity_name] += 1
        for raw_column in columns:
            column = _period_column(raw_column, base_year)
            if column in cloned.columns:
                cloned[column] = (
                    entity_id if entity_name != "household" else household_id
                )
    cloned[_period_column("household_id", base_year)] = household_id
    cloned[_period_column("person_household_id", base_year)] = household_id

    person_ids = range(id_counters["person"], id_counters["person"] + len(cloned))
    id_counters["person"] += len(cloned)
    cloned[person_id_col] = _cast_mapped_ids(
        cloned[person_id_col],
        pd.Series(list(person_ids), index=cloned.index),
    )

    # Split the clone's weight across sibling clones of the same target.
    if household_weight_col in cloned.columns:
        cloned[household_weight_col] = (
            cloned[household_weight_col].astype(float)
            * clone_weight_scale
            / max(clone_weight_divisor, 1)
        )
    if person_weight_col in cloned.columns:
        cloned[person_weight_col] = (
            cloned[person_weight_col].astype(float)
            * clone_weight_scale
            / max(clone_weight_divisor, 1)
        )

    # Positional layout of the clone: head, optional spouse, then dependents.
    head_idx = cloned.index[0]
    spouse_idx = cloned.index[1] if target_candidate.spouse_age is not None else None
    dependent_indices = (
        cloned.index[2 : 2 + len(target_candidate.dependent_ages)]
        if target_candidate.spouse_age is not None
        else cloned.index[1 : 1 + len(target_candidate.dependent_ages)]
    )

    cloned.loc[head_idx, age_col] = float(target_candidate.head_age)
    if spouse_idx is not None:
        cloned.loc[spouse_idx, age_col] = float(target_candidate.spouse_age)
    for dep_idx, dep_age in zip(dependent_indices, target_candidate.dependent_ages):
        cloned.loc[dep_idx, age_col] = float(dep_age)

    payroll_columns = tuple(
        _period_column(component, base_year) for component in PAYROLL_COMPONENTS
    )
    ss_columns = tuple(
        _period_column(component, base_year) for component in SS_COMPONENTS
    )
    qbi_col = _period_column("w2_wages_from_qualified_business", base_year)

    # Deflate target-year amounts back to base-year values so the stored rows
    # uprate to the target when the simulation applies its factors.
    target_head_payroll = float(target_candidate.head_wages) / max(
        payroll_reference_factor,
        1e-12,
    )
    target_spouse_payroll = float(target_candidate.spouse_wages) / max(
        payroll_reference_factor,
        1e-12,
    )
    target_head_ss = float(target_candidate.head_ss) / max(ss_reference_factor, 1e-12)
    target_spouse_ss = float(target_candidate.spouse_ss) / max(
        ss_reference_factor,
        1e-12,
    )

    cloned.loc[head_idx] = _scale_person_components(
        cloned.loc[head_idx].copy(),
        payroll_columns,
        target_head_payroll,
    )
    cloned.loc[head_idx] = _scale_person_components(
        cloned.loc[head_idx].copy(),
        ss_columns,
        target_head_ss,
    )
    if spouse_idx is not None:
        cloned.loc[spouse_idx] = _scale_person_components(
            cloned.loc[spouse_idx].copy(),
            payroll_columns,
            target_spouse_payroll,
        )
        cloned.loc[spouse_idx] = _scale_person_components(
            cloned.loc[spouse_idx].copy(),
            ss_columns,
            target_spouse_ss,
        )
    # Dependents carry no payroll/SS income in the synthetic target.
    for dep_idx in dependent_indices:
        cloned.loc[dep_idx] = _scale_person_components(
            cloned.loc[dep_idx].copy(),
            payroll_columns,
            0.0,
        )
        cloned.loc[dep_idx] = _scale_person_components(
            cloned.loc[dep_idx].copy(),
            ss_columns,
            0.0,
        )
        if qbi_col in cloned.columns:
            cloned.loc[dep_idx, qbi_col] = 0.0

    # Zero QBI wages for adults too, so clones carry only targeted income.
    if qbi_col in cloned.columns and head_idx in cloned.index:
        cloned.loc[head_idx, qbi_col] = 0.0
        if spouse_idx is not None:
            cloned.loc[spouse_idx, qbi_col] = 0.0

    return cloned, id_counters
+
+
def build_donor_backed_augmented_input_dataframe(
    *,
    base_dataset: str,
    base_year: int,
    target_year: int,
    top_n_targets: int = 20,
    donors_per_target: int = 5,
    max_distance_for_clone: float = 3.0,
    clone_weight_scale: float = 0.1,
) -> tuple[pd.DataFrame, dict[str, object]]:
    """Augment ``base_dataset``'s input dataframe with donor-backed clones.

    Solves for the top synthetic targets, matches each to nearby real donor
    tax units, clones those donors' person rows (retargeted to the synthetic
    household), and appends the clones to the base input dataframe.

    Args:
        base_dataset: Dataset identifier passed to ``Microsimulation``.
        base_year: Year suffix of the input dataframe's columns.
        target_year: Year whose aggregate targets drive the synthetic solve.
        top_n_targets: Number of highest-weight synthetic targets to clone.
        donors_per_target: Candidate donors matched per target.
        max_distance_for_clone: Donors farther than this are not cloned.
        clone_weight_scale: Weight multiplier applied to each clone.

    Returns:
        ``(augmented_df, report)`` where ``report`` summarizes the build.
    """
    sim = Microsimulation(dataset=base_dataset)
    input_df = attach_person_uprating_factors(
        sim.to_input_dataframe(),
        sim,
        base_year=base_year,
        target_year=target_year,
    )
    actual_summary = build_actual_tax_unit_summary(base_dataset)
    base_aggregates = load_base_aggregates(base_dataset)
    # Aggregate uprating scales from base-year totals to target-year projections.
    ss_scale = load_ssa_benefit_projections(target_year) / max(
        base_aggregates["weighted_ss_total"],
        1.0,
    )
    earnings_scale = load_taxable_payroll_projections(target_year) / max(
        base_aggregates["weighted_payroll_total"],
        1.0,
    )
    pools = build_quantile_pools(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )
    candidates = generate_synthetic_candidates(
        pools,
        payroll_cap=load_policyengine_social_security_cap(target_year),
    )
    exact_weights, solve_info = solve_synthetic_support(candidates, year=target_year)
    exact_df = summarize_exact_candidates(candidates, exact_weights)
    # Only positive-weight targets are worth backing with clones.
    target_df = exact_df[exact_df["synthetic_weight"] > 0].head(top_n_targets).copy()
    scaled_actual = build_scaled_actual_summary(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )

    tax_unit_id_col = _period_column("person_tax_unit_id", base_year)
    # Seed next-id counters past the ids already present in the base data.
    id_counters = {
        entity_name: _next_entity_id(input_df[_period_column(columns[0], base_year)])
        for entity_name, columns in ENTITY_ID_COLUMNS.items()
    }
    id_counters["person"] = _next_entity_id(
        input_df[_period_column(PERSON_ID_COLUMN, base_year)]
    )

    clone_frames = []
    target_reports = []
    skipped_targets = []

    for _, target_row in target_df.iterrows():
        target_candidate = candidates[int(target_row["candidate_idx"])]
        donor_matches = match_real_donors_for_target(
            target_row,
            scaled_actual,
            donors_per_target=donors_per_target,
        )
        usable = donor_matches[
            donor_matches["distance"] <= max_distance_for_clone
        ].copy()
        if usable.empty:
            # No donor close enough: record the miss and move on.
            skipped_targets.append(
                {
                    "candidate_idx": int(target_row["candidate_idx"]),
                    "archetype": target_candidate.archetype,
                    "weight_share_pct": float(target_row["weight_share_pct"]),
                    "best_distance": float(donor_matches["distance"].min()),
                }
            )
            continue

        successful_clone_count = 0
        for _, donor_row in usable.iterrows():
            donor_rows = input_df[
                input_df[tax_unit_id_col] == int(donor_row["tax_unit_id"])
            ].copy()
            clone_df, id_counters = _clone_tax_unit_rows_to_target(
                donor_rows,
                base_year=base_year,
                target_candidate=target_candidate,
                ss_scale=ss_scale,
                earnings_scale=earnings_scale,
                id_counters=id_counters,
                clone_weight_scale=clone_weight_scale,
                # Split the clone weight evenly across this target's donors.
                clone_weight_divisor=len(usable),
            )
            if clone_df is None:
                continue
            clone_frames.append(clone_df)
            successful_clone_count += 1
        target_reports.append(
            {
                "candidate_idx": int(target_row["candidate_idx"]),
                "archetype": target_candidate.archetype,
                "weight_share_pct": float(target_row["weight_share_pct"]),
                "requested_donor_count": int(len(usable)),
                "successful_clone_count": int(successful_clone_count),
            }
        )

    augmented_df = (
        pd.concat([input_df, *clone_frames], ignore_index=True)
        if clone_frames
        else input_df.copy()
    )
    # Drop the helper factor columns before handing the frame back.
    helper_columns = [PAYROLL_UPRATING_FACTOR_COLUMN, SS_UPRATING_FACTOR_COLUMN]
    augmented_df.drop(
        columns=[column for column in helper_columns if column in augmented_df.columns],
        inplace=True,
        errors="ignore",
    )
    report = {
        "base_dataset": base_dataset,
        "base_year": int(base_year),
        "target_year": int(target_year),
        "target_source": get_long_term_target_source(),
        "solve_info": solve_info,
        "top_n_targets": int(top_n_targets),
        "donors_per_target": int(donors_per_target),
        "max_distance_for_clone": float(max_distance_for_clone),
        "clone_weight_scale": float(clone_weight_scale),
        "base_household_count": int(
            input_df[_period_column("household_id", base_year)].nunique()
        ),
        "augmented_household_count": int(
            augmented_df[_period_column("household_id", base_year)].nunique()
        ),
        "base_person_count": int(len(input_df)),
        "augmented_person_count": int(len(augmented_df)),
        "target_reports": target_reports,
        "skipped_targets": skipped_targets,
    }
    return augmented_df, report
+
+
def build_role_composite_augmented_input_dataframe(
    *,
    base_dataset: str,
    base_year: int,
    target_year: int,
    top_n_targets: int = 20,
    donors_per_target: int = 5,
    max_older_distance: float = 3.0,
    max_worker_distance: float = 3.0,
    clone_weight_scale: float = 0.1,
) -> tuple[pd.DataFrame, dict[str, object]]:
    """Augment the base input dataframe with role-composite clone households.

    Like the donor-backed builder, but each clone is spliced from two donors —
    an "older" tax unit and a "worker" tax unit — using the composites produced
    by ``build_role_donor_composites``, then re-solved for positive support.

    Args:
        base_dataset: Dataset identifier passed to ``Microsimulation``.
        base_year: Year suffix of the input dataframe's columns.
        target_year: Year whose aggregate targets drive the synthetic solve.
        top_n_targets: Number of highest-weight targets to composite.
        donors_per_target: Older/worker donors matched per target.
        max_older_distance / max_worker_distance: Match-distance cutoffs.
        clone_weight_scale: Weight multiplier applied to each clone.

    Returns:
        ``(augmented_df, report)`` where ``report`` includes per-clone and
        per-target diagnostics.
    """
    sim = Microsimulation(dataset=base_dataset)
    input_df = attach_person_uprating_factors(
        sim.to_input_dataframe(),
        sim,
        base_year=base_year,
        target_year=target_year,
    )
    actual_summary = build_actual_tax_unit_summary(base_dataset)
    base_aggregates = load_base_aggregates(base_dataset)
    # Aggregate uprating scales from base-year totals to target-year projections.
    ss_scale = load_ssa_benefit_projections(target_year) / max(
        base_aggregates["weighted_ss_total"],
        1.0,
    )
    earnings_scale = load_taxable_payroll_projections(target_year) / max(
        base_aggregates["weighted_payroll_total"],
        1.0,
    )
    pools = build_quantile_pools(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )
    candidates = generate_synthetic_candidates(
        pools,
        payroll_cap=load_policyengine_social_security_cap(target_year),
    )
    exact_weights, solve_info = solve_synthetic_support(candidates, year=target_year)
    scaled_actual = build_scaled_actual_summary(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )
    (
        role_composite_candidates,
        role_composite_prior,
        role_composite_probe,
    ) = build_role_donor_composites(
        candidates,
        exact_weights,
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
        top_n_targets=top_n_targets,
        older_donors_per_target=donors_per_target,
        worker_donors_per_target=donors_per_target,
        max_older_distance=max_older_distance,
        max_worker_distance=max_worker_distance,
    )
    # Re-solve on the composite candidate set, warm-started from the prior.
    role_composite_weights, role_composite_solve_info = solve_synthetic_support(
        role_composite_candidates,
        year=target_year,
        baseline_weights=role_composite_prior,
    )
    role_composite_df = summarize_exact_candidates(
        role_composite_candidates,
        role_composite_weights,
    )
    # Only composites that retain positive weight become clone households.
    selected_composite_df = role_composite_df[
        role_composite_df["synthetic_weight"] > 0
    ].copy()
    composite_records_by_idx = {
        int(record["composite_idx"]): record
        for record in role_composite_probe["composite_records"]
    }

    tax_unit_id_col = _period_column("person_tax_unit_id", base_year)
    # Seed next-id counters past the ids already present in the base data.
    id_counters = {
        entity_name: _next_entity_id(input_df[_period_column(columns[0], base_year)])
        for entity_name, columns in ENTITY_ID_COLUMNS.items()
    }
    id_counters["person"] = _next_entity_id(
        input_df[_period_column(PERSON_ID_COLUMN, base_year)]
    )

    clone_frames = []
    clone_household_reports = []
    target_reports = []
    skipped_targets = []

    for _, target_row in selected_composite_df.iterrows():
        composite_idx = int(target_row["candidate_idx"])
        target_candidate = role_composite_candidates[composite_idx]
        composite_record = composite_records_by_idx.get(composite_idx)
        if composite_record is None:
            skipped_targets.append(
                {
                    "candidate_idx": composite_idx,
                    "archetype": target_candidate.archetype,
                    "weight_share_pct": float(target_row["weight_share_pct"]),
                    "reason": "missing_composite_record",
                }
            )
            continue

        # Resolve each donor role to its summary row and person rows.
        older_tax_unit_id = composite_record.get("older_tax_unit_id")
        worker_tax_unit_id = composite_record.get("worker_tax_unit_id")
        older_row = None
        worker_row = None
        older_rows = None
        worker_rows = None
        if older_tax_unit_id is not None:
            older_row = scaled_actual[
                scaled_actual["tax_unit_id"].eq(int(older_tax_unit_id))
            ].iloc[0]
            older_rows = input_df[
                input_df[tax_unit_id_col] == int(older_tax_unit_id)
            ].copy()
        if worker_tax_unit_id is not None:
            worker_row = scaled_actual[
                scaled_actual["tax_unit_id"].eq(int(worker_tax_unit_id))
            ].iloc[0]
            worker_rows = input_df[
                input_df[tax_unit_id_col] == int(worker_tax_unit_id)
            ].copy()

        clone_df, id_counters = _compose_role_donor_rows_to_target(
            older_rows,
            worker_rows,
            base_year=base_year,
            target_candidate=target_candidate,
            ss_scale=ss_scale,
            earnings_scale=earnings_scale,
            id_counters=id_counters,
            clone_weight_scale=clone_weight_scale,
            # One clone per composite: the weight is not split.
            clone_weight_divisor=1,
        )
        if clone_df is None:
            skipped_targets.append(
                {
                    "candidate_idx": composite_idx,
                    "archetype": target_candidate.archetype,
                    "weight_share_pct": float(target_row["weight_share_pct"]),
                    "reason": "clone_build_failed",
                }
            )
            continue
        clone_frames.append(clone_df)
        clone_household_reports.append(
            _clone_report_record(
                clone_df=clone_df,
                base_year=base_year,
                target_candidate=target_candidate,
                candidate_idx=composite_idx,
                target_weight_share_pct=float(target_row["weight_share_pct"]),
                clone_weight_scale=clone_weight_scale,
                combination_count=1,
                older_donor_row=older_row,
                worker_donor_row=worker_row,
            )
        )
        target_reports.append(
            {
                "candidate_idx": composite_idx,
                "archetype": target_candidate.archetype,
                "weight_share_pct": float(target_row["weight_share_pct"]),
                "older_match_count": int(older_tax_unit_id is not None),
                "worker_match_count": int(worker_tax_unit_id is not None),
                "successful_clone_count": 1,
            }
        )

    augmented_df = (
        pd.concat([input_df, *clone_frames], ignore_index=True)
        if clone_frames
        else input_df.copy()
    )
    # Drop the helper factor columns before handing the frame back.
    helper_columns = [PAYROLL_UPRATING_FACTOR_COLUMN, SS_UPRATING_FACTOR_COLUMN]
    augmented_df.drop(
        columns=[column for column in helper_columns if column in augmented_df.columns],
        inplace=True,
        errors="ignore",
    )
    report = {
        "base_dataset": base_dataset,
        "base_year": int(base_year),
        "target_year": int(target_year),
        "target_source": get_long_term_target_source(),
        "solve_info": solve_info,
        "role_composite_solve_info": role_composite_solve_info,
        "selection_strategy": "role_composite_positive_support",
        "top_n_targets": int(top_n_targets),
        "donors_per_target": int(donors_per_target),
        "max_older_distance": float(max_older_distance),
        "max_worker_distance": float(max_worker_distance),
        "clone_weight_scale": float(clone_weight_scale),
        "base_household_count": int(
            input_df[_period_column("household_id", base_year)].nunique()
        ),
        "augmented_household_count": int(
            augmented_df[_period_column("household_id", base_year)].nunique()
        ),
        "base_person_count": int(len(input_df)),
        "augmented_person_count": int(len(augmented_df)),
        "role_composite_candidate_count": int(len(role_composite_candidates)),
        "selected_role_composite_count": int(len(selected_composite_df)),
        "clone_household_count": int(len(clone_household_reports)),
        "clone_household_reports": clone_household_reports,
        "target_reports": target_reports,
        "skipped_targets": skipped_targets,
    }
    return augmented_df, report
+
+
def build_role_composite_augmented_dataset(
    *,
    base_dataset: str,
    base_year: int,
    target_year: int,
    top_n_targets: int = 20,
    donors_per_target: int = 5,
    max_older_distance: float = 3.0,
    max_worker_distance: float = 3.0,
    clone_weight_scale: float = 0.1,
) -> tuple[Dataset, dict[str, object]]:
    """Build the role-composite augmented frame and wrap it as a ``Dataset``.

    Thin convenience wrapper: all arguments are forwarded unchanged to
    ``build_role_composite_augmented_input_dataframe``.
    """
    frame, build_report = build_role_composite_augmented_input_dataframe(
        base_dataset=base_dataset,
        base_year=base_year,
        target_year=target_year,
        top_n_targets=top_n_targets,
        donors_per_target=donors_per_target,
        max_older_distance=max_older_distance,
        max_worker_distance=max_worker_distance,
        clone_weight_scale=clone_weight_scale,
    )
    dataset = Dataset.from_dataframe(frame, base_year)
    return dataset, build_report
+
+
def build_donor_backed_augmented_dataset(
    *,
    base_dataset: str,
    base_year: int,
    target_year: int,
    top_n_targets: int = 20,
    donors_per_target: int = 5,
    max_distance_for_clone: float = 3.0,
    clone_weight_scale: float = 0.1,
) -> tuple[Dataset, dict[str, object]]:
    """Build the donor-backed augmented frame and wrap it as a ``Dataset``.

    Thin convenience wrapper: all arguments are forwarded unchanged to
    ``build_donor_backed_augmented_input_dataframe``.
    """
    frame, build_report = build_donor_backed_augmented_input_dataframe(
        base_dataset=base_dataset,
        base_year=base_year,
        target_year=target_year,
        top_n_targets=top_n_targets,
        donors_per_target=donors_per_target,
        max_distance_for_clone=max_distance_for_clone,
        clone_weight_scale=clone_weight_scale,
    )
    dataset = Dataset.from_dataframe(frame, base_year)
    return dataset, build_report
+
+
+def _safe_split(numerator: float, denominator: float, fallback: float) -> float:
+ if denominator <= 0:
+ return fallback
+ return float(numerator / denominator)
+
+
+def _safe_series_split(
+ numerator: pd.Series,
+ denominator: pd.Series,
+ fallback: float,
+) -> pd.Series:
+ numerator = numerator.astype(float)
+ denominator = denominator.astype(float)
+ result = pd.Series(fallback, index=numerator.index, dtype=float)
+ positive = denominator > 0
+ result.loc[positive] = numerator.loc[positive] / denominator.loc[positive]
+ return result
+
+
def build_donor_backed_clones(
    candidates: list[SyntheticCandidate],
    weights: np.ndarray,
    actual_summary: pd.DataFrame,
    *,
    ss_scale: float,
    earnings_scale: float,
    top_n_targets: int,
    donors_per_target: int,
    max_distance_for_clone: float = 3.0,
) -> tuple[list[SyntheticCandidate], np.ndarray, dict[str, object]]:
    """Replace top synthetic targets with donor-backed candidate clones.

    For each of the ``top_n_targets`` highest-weight candidates, matches real
    donor tax units; close-enough donors each contribute a clone candidate
    that keeps the target's demographics and totals but borrows the donor's
    head/spouse income split and auxiliary incomes. Targets with no usable
    donor are kept verbatim and reported as outliers.

    Args:
        candidates: Full synthetic candidate list.
        weights: Solved weight per candidate (parallel to ``candidates``).
        actual_summary: Per-tax-unit summary of the real microdata.
        ss_scale / earnings_scale: Aggregate uprating scales.
        top_n_targets: Number of leading positive-weight targets to process.
        donors_per_target: Candidate donors matched per target.
        max_distance_for_clone: Donors farther than this are rejected.

    Returns:
        ``(clone_candidates, clone_weights, info)`` where ``info`` contains
        outlier targets, a capped clone-record sample, and a clone summary.
    """
    exact_df = summarize_exact_candidates(candidates, weights)
    target_df = exact_df[exact_df["synthetic_weight"] > 0].head(top_n_targets).copy()
    scaled_actual = build_scaled_actual_summary(
        actual_summary,
        ss_scale=ss_scale,
        earnings_scale=earnings_scale,
    )

    donor_backed_candidates: list[SyntheticCandidate] = []
    donor_backed_weights: list[float] = []
    clone_records: list[dict[str, object]] = []
    outlier_targets: list[dict[str, object]] = []

    for _, target_row in target_df.iterrows():
        target_candidate = candidates[int(target_row["candidate_idx"])]
        donor_matches = match_real_donors_for_target(
            target_row,
            scaled_actual,
            donors_per_target=donors_per_target,
        )
        usable = donor_matches[
            donor_matches["distance"] <= max_distance_for_clone
        ].copy()
        if usable.empty:
            # No donor close enough: keep the original synthetic candidate.
            donor_backed_candidates.append(target_candidate)
            donor_backed_weights.append(float(target_row["synthetic_weight"]))
            outlier_targets.append(
                {
                    "candidate_idx": int(target_row["candidate_idx"]),
                    "archetype": target_row["archetype"],
                    "weight_share_pct": float(target_row["weight_share_pct"]),
                    "best_distance": float(donor_matches["distance"].min()),
                }
            )
            continue

        # Split the target's weight evenly across its usable donors.
        per_clone_weight = float(target_row["synthetic_weight"]) / len(usable)
        target_payroll_total = float(target_row["payroll_total"])
        target_ss_total = float(target_row["ss_total"])
        # Head share defaults: 100% for singles, 50% for couples.
        target_head_payroll_share = _safe_split(
            target_candidate.head_wages,
            target_candidate.payroll_total,
            1.0 if target_candidate.spouse_age is None else 0.5,
        )
        target_head_ss_share = _safe_split(
            target_candidate.head_ss,
            target_candidate.ss_total,
            1.0 if target_candidate.spouse_age is None else 0.5,
        )

        for _, donor_row in usable.iterrows():
            # Borrow the donor's head/spouse split where it has income;
            # otherwise fall back to the target's own split.
            donor_head_payroll_share = _safe_split(
                float(donor_row["scaled_head_payroll"]),
                float(donor_row["scaled_payroll_total"]),
                target_head_payroll_share,
            )
            donor_head_ss_share = _safe_split(
                float(donor_row["scaled_head_ss"]),
                float(donor_row["scaled_ss_total"]),
                target_head_ss_share,
            )
            donor_backed_candidates.append(
                SyntheticCandidate(
                    archetype=target_candidate.archetype,
                    head_age=target_candidate.head_age,
                    spouse_age=target_candidate.spouse_age,
                    dependent_ages=target_candidate.dependent_ages,
                    head_wages=target_payroll_total * donor_head_payroll_share,
                    spouse_wages=target_payroll_total
                    * (1.0 - donor_head_payroll_share),
                    head_ss=target_ss_total * donor_head_ss_share,
                    spouse_ss=target_ss_total * (1.0 - donor_head_ss_share),
                    pension_income=float(donor_row["pension_income"]) * earnings_scale,
                    dividend_income=float(donor_row["dividend_income"])
                    * earnings_scale,
                )
            )
            donor_backed_weights.append(per_clone_weight)
            clone_records.append(
                {
                    "candidate_idx": int(target_row["candidate_idx"]),
                    "archetype": target_row["archetype"],
                    "tax_unit_id": int(donor_row["tax_unit_id"]),
                    "distance": float(donor_row["distance"]),
                    "assigned_weight_share_pct": float(
                        per_clone_weight / max(float(weights.sum()), 1.0) * 100
                    ),
                }
            )

    clone_summary = summarize_solution(
        donor_backed_candidates,
        np.asarray(donor_backed_weights, dtype=float),
        actual_summary,
    )
    return (
        donor_backed_candidates,
        np.asarray(donor_backed_weights, dtype=float),
        {
            "top_n_targets": int(top_n_targets),
            "donors_per_target": int(donors_per_target),
            "max_distance_for_clone": float(max_distance_for_clone),
            "outlier_targets": outlier_targets,
            # Cap the record sample to keep the report payload small.
            "clone_records": clone_records[:100],
            "clone_summary": clone_summary,
        },
    )
+
+
def solve_synthetic_support(
    candidates: list[SyntheticCandidate],
    *,
    year: int,
    max_constraint_error_pct: float = 0.0,
    warm_weights: np.ndarray | None = None,
    baseline_weights: np.ndarray | None = None,
) -> tuple[np.ndarray, dict[str, object]]:
    """Solve for candidate weights matching ``year``'s calibration targets.

    Two regimes, each with an LP fallback when the entropy solve raises:

    * ``max_constraint_error_pct > 0``: bounded-entropy calibration that may
      miss targets by up to the given percentage; on failure, blends/densifies
      an LP solution.
    * ``max_constraint_error_pct == 0`` (default): exact entropy calibration;
      on failure, falls back to an LP minimax feasibility solve.

    Args:
        candidates: Synthetic candidates defining the constraint columns.
        year: Target year for age/SS/payroll constraints.
        max_constraint_error_pct: Allowed relative constraint slack (percent).
        warm_weights: Optional warm-start weights for the bounded solver.
        baseline_weights: Optional prior weights for the constraint problem.

    Returns:
        ``(weights, info)`` where ``info`` records the method used, iteration
        count, and achievable error diagnostics.

    Raises:
        RuntimeError: When both the entropy solve and its LP fallback fail.
    """
    problem = build_synthetic_constraint_problem(
        candidates,
        year=year,
        baseline_weights=baseline_weights,
    )
    X = problem["X"]
    aggregated_age_targets = problem["aggregated_age_targets"]
    ss_values = problem["ss_values"]
    payroll_values = problem["payroll_values"]
    # Rebind to the problem's (possibly defaulted/normalized) baseline.
    baseline_weights = problem["baseline_weights"]
    ss_target = problem["ss_target"]
    payroll_target = problem["payroll_target"]

    if max_constraint_error_pct > 0:
        try:
            weights, iterations, info = calibrate_entropy_bounded(
                X,
                aggregated_age_targets,
                baseline_weights,
                ss_values=ss_values,
                ss_target=ss_target,
                payroll_values=payroll_values,
                payroll_target=payroll_target,
                n_ages=X.shape[1],
                max_constraint_error_pct=max_constraint_error_pct,
                max_iters=500,
                tol=1e-9,
                # NOTE(review): the warm start is wrapped in a one-element
                # list — presumably the solver takes a list of warm starts;
                # confirm against calibrate_entropy_bounded's signature.
                warm_weights=([warm_weights] if warm_weights is not None else None),
            )
            return np.asarray(weights, dtype=float), {
                "method": "bounded_entropy",
                "iterations": int(iterations),
                "best_case_max_pct_error": float(info["best_case_max_pct_error"]),
                "requested_max_constraint_error_pct": float(max_constraint_error_pct),
                "age_bucket_size": 5,
                "status": int(info.get("status", 0)),
                "message": info.get("message"),
            }
        except RuntimeError as error:
            # Bounded entropy failed: find LP weights (warm start or a fresh
            # feasibility solve) and densify them toward the baseline.
            constraint_matrix, targets = build_constraint_matrix(problem)
            lp_weights = None
            if warm_weights is not None:
                lp_weights = np.asarray(warm_weights, dtype=float)
            else:
                feasibility = assess_nonnegative_feasibility(
                    constraint_matrix,
                    targets,
                    return_weights=True,
                )
                if feasibility["success"] and feasibility.get("weights") is not None:
                    lp_weights = np.asarray(feasibility["weights"], dtype=float)
            if lp_weights is None:
                raise RuntimeError(
                    f"Approximate synthetic support solve failed for {year}: {error}"
                ) from error
            dense_weights, dense_info = densify_lp_solution(
                constraint_matrix,
                targets,
                baseline_weights,
                lp_weights,
                max_constraint_error_pct,
            )
            return np.asarray(dense_weights, dtype=float), {
                "method": (
                    "lp_blend"
                    if dense_info["densification_effective"]
                    else "lp_minimax"
                ),
                "iterations": 1,
                "best_case_max_pct_error": float(dense_info["best_case_max_pct_error"]),
                "requested_max_constraint_error_pct": float(max_constraint_error_pct),
                "age_bucket_size": 5,
                "entropy_error": str(error),
                "lp_blend_lambda": float(dense_info["blend_lambda"]),
            }

    # Exact regime: targets must be met exactly.
    try:
        weights, iterations = calibrate_entropy(
            X,
            aggregated_age_targets,
            baseline_weights,
            ss_values=ss_values,
            ss_target=ss_target,
            payroll_values=payroll_values,
            payroll_target=payroll_target,
            n_ages=X.shape[1],
            max_iters=500,
            tol=1e-9,
        )
        return weights, {
            "method": "entropy",
            "iterations": int(iterations),
            "best_case_max_pct_error": 0.0,
            "age_bucket_size": 5,
        }
    except RuntimeError as error:
        # Exact entropy failed: accept an LP minimax solution if one exists.
        constraint_matrix, targets = build_constraint_matrix(problem)
        feasibility = assess_nonnegative_feasibility(
            constraint_matrix,
            targets,
            return_weights=True,
        )
        if not feasibility["success"] or feasibility.get("weights") is None:
            raise RuntimeError(
                f"Synthetic support could not match {year} targets: {error}"
            ) from error
        return np.asarray(feasibility["weights"], dtype=float), {
            "method": "lp_minimax",
            "iterations": 1,
            "best_case_max_pct_error": feasibility["best_case_max_pct_error"],
            "age_bucket_size": 5,
            "entropy_error": str(error),
        }
+
+
def summarize_solution_diff(
    candidates: list[SyntheticCandidate],
    base_weights: np.ndarray,
    alt_weights: np.ndarray,
) -> dict[str, object]:
    """Compare two weight vectors over the same candidate list.

    Reports the 20 candidates whose weight share grew most, the archetypes of
    "entrant" candidates (zero base weight, meaningful alternative share), and
    the entrant count. Shares are percentages of each vector's own total.
    """
    base = np.asarray(base_weights, dtype=float)
    alt = np.asarray(alt_weights, dtype=float)
    # Guard the denominators so empty/zero weight vectors do not divide by 0.
    base_denom = max(float(base.sum()), 1.0)
    alt_denom = max(float(alt.sum()), 1.0)

    records: list[dict[str, object]] = []
    for candidate, w_base, w_alt in zip(candidates, base, alt):
        share_base = float(w_base / base_denom * 100)
        share_alt = float(w_alt / alt_denom * 100)
        records.append(
            {
                "archetype": candidate.archetype,
                "head_age": candidate.head_age,
                "spouse_age": candidate.spouse_age,
                "dependent_count": len(candidate.dependent_ages),
                "payroll_total": candidate.payroll_total,
                "ss_total": candidate.ss_total,
                "base_weight_share_pct": share_base,
                "alt_weight_share_pct": share_alt,
                "weight_share_gain_pct_points": share_alt - share_base,
                # Entrant: picked up weight despite (effectively) zero before.
                "newly_entering": w_alt > 0 and w_base <= 1e-12,
            }
        )

    frame = pd.DataFrame(records)
    entrant_mask = frame["newly_entering"] & frame["alt_weight_share_pct"].gt(0.01)
    entrants = frame[entrant_mask].copy()
    entrant_archetypes = (
        entrants.groupby("archetype", as_index=False)
        .agg(
            entrant_weight_share_pct=("alt_weight_share_pct", "sum"),
            entrant_candidate_count=("archetype", "count"),
        )
        .sort_values("entrant_weight_share_pct", ascending=False)
    )
    top_gainers = frame.sort_values(
        "weight_share_gain_pct_points",
        ascending=False,
    ).head(20)
    return {
        "top_weight_gainers": top_gainers.to_dict("records"),
        "entrant_archetypes": entrant_archetypes.head(12).to_dict("records"),
        "entrant_candidate_count": int(len(entrants)),
    }
+
+
+def summarize_solution(
+ candidates: list[SyntheticCandidate],
+ weights: np.ndarray,
+ actual_summary: pd.DataFrame,
+) -> dict[str, object]:
+ weight_sum = float(weights.sum())
+ candidate_df = pd.DataFrame(
+ [
+ {
+ **asdict(candidate),
+ "spouse_age": candidate.spouse_age,
+ "dependent_count": len(candidate.dependent_ages),
+ "payroll_total": candidate.payroll_total,
+ "ss_total": candidate.ss_total,
+ "taxable_benefits_proxy": candidate.taxable_benefits_proxy(),
+ "synthetic_weight": float(weight),
+ }
+ for candidate, weight in zip(candidates, weights)
+ ]
+ )
+ candidate_df["weight_share_pct"] = (
+ candidate_df["synthetic_weight"] / weight_sum * 100 if weight_sum > 0 else 0.0
+ )
+ candidate_df = candidate_df.sort_values("synthetic_weight", ascending=False)
+ positive_weights = candidate_df.loc[
+ candidate_df["synthetic_weight"] > 0,
+ "synthetic_weight",
+ ].to_numpy(dtype=float)
+ if positive_weights.size > 0:
+ effective_sample_size = float(
+ (positive_weights.sum() ** 2) / np.sum(positive_weights**2)
+ )
+ top_10_weight_share_pct = float(
+ positive_weights[:10].sum() / positive_weights.sum() * 100
+ )
+ top_20_weight_share_pct = float(
+ positive_weights[:20].sum() / positive_weights.sum() * 100
+ )
+ else:
+ effective_sample_size = 0.0
+ top_10_weight_share_pct = 0.0
+ top_20_weight_share_pct = 0.0
+
+ def _weighted_mean(group: pd.DataFrame, column: str) -> float:
+ total = float(group["synthetic_weight"].sum())
+ if total <= 0:
+ return 0.0
+ return float(np.average(group[column], weights=group["synthetic_weight"]))
+
+ synthetic_rows = []
+ for archetype, group in candidate_df.groupby("archetype", sort=False):
+ synthetic_rows.append(
+ {
+ "archetype": archetype,
+ "synthetic_weight": float(group["synthetic_weight"].sum()),
+ "candidate_count": int(len(group)),
+ "avg_head_age": _weighted_mean(group, "head_age"),
+ "avg_payroll_total": _weighted_mean(group, "payroll_total"),
+ "avg_ss_total": _weighted_mean(group, "ss_total"),
+ "avg_pension_income": _weighted_mean(group, "pension_income"),
+ "avg_dividend_income": _weighted_mean(group, "dividend_income"),
+ }
+ )
+ synthetic_archetypes = pd.DataFrame(synthetic_rows).sort_values(
+ "synthetic_weight",
+ ascending=False,
+ )
+ synthetic_archetypes["synthetic_weight_share_pct"] = (
+ synthetic_archetypes["synthetic_weight"] / weight_sum * 100
+ if weight_sum > 0
+ else 0.0
+ )
+
+ actual_archetypes = (
+ actual_summary.groupby("archetype", as_index=False)
+ .agg(
+ actual_support_count=("archetype", "count"),
+ avg_head_age=("head_age", "mean"),
+ avg_payroll_total=("payroll_total", "mean"),
+ avg_ss_total=("ss_total", "mean"),
+ avg_pension_income=("pension_income", "mean"),
+ avg_dividend_income=("dividend_income", "mean"),
+ )
+ .sort_values("actual_support_count", ascending=False)
+ )
+ actual_total = float(actual_archetypes["actual_support_count"].sum())
+ actual_archetypes["actual_support_share_pct"] = (
+ actual_archetypes["actual_support_count"] / actual_total * 100
+ if actual_total > 0
+ else 0.0
+ )
+
+ comparison = pd.merge(
+ synthetic_archetypes[
+ ["archetype", "synthetic_weight_share_pct", "candidate_count"]
+ ],
+ actual_archetypes[
+ ["archetype", "actual_support_share_pct", "actual_support_count"]
+ ],
+ on="archetype",
+ how="outer",
+ ).fillna(0.0)
+ comparison["share_gap_pct_points"] = (
+ comparison["synthetic_weight_share_pct"]
+ - comparison["actual_support_share_pct"]
+ )
+ comparison = comparison.sort_values(
+ "synthetic_weight_share_pct",
+ ascending=False,
+ )
+
+ weighted_metrics = {
+ "synthetic_payroll_positive_85_plus_household_share_pct": float(
+ candidate_df.loc[
+ (candidate_df["head_age"] >= 85) & (candidate_df["payroll_total"] > 0),
+ "synthetic_weight",
+ ].sum()
+ / weight_sum
+ * 100
+ )
+ if weight_sum > 0
+ else 0.0,
+ "synthetic_mixed_retiree_worker_share_pct": float(
+ candidate_df.loc[
+ candidate_df["archetype"].eq("mixed_retiree_worker_couple"),
+ "synthetic_weight",
+ ].sum()
+ / weight_sum
+ * 100
+ )
+ if weight_sum > 0
+ else 0.0,
+ "synthetic_units_with_positive_pension_or_dividend_share_pct": float(
+ candidate_df.loc[
+ (candidate_df["pension_income"] > 0)
+ | (candidate_df["dividend_income"] > 0),
+ "synthetic_weight",
+ ].sum()
+ / weight_sum
+ * 100
+ )
+ if weight_sum > 0
+ else 0.0,
+ "synthetic_avg_taxable_benefits_proxy_share_pct": float(
+ (
+ (
+ candidate_df["taxable_benefits_proxy"]
+ * candidate_df["synthetic_weight"]
+ ).sum()
+ / max(
+ (candidate_df["ss_total"] * candidate_df["synthetic_weight"]).sum(),
+ 1.0,
+ )
+ )
+ * 100
+ ),
+ }
+
+ return {
+ "synthetic_candidate_count": int(len(candidate_df)),
+ "positive_weight_candidate_count": int(
+ (candidate_df["synthetic_weight"] > 0).sum()
+ ),
+ "effective_sample_size": effective_sample_size,
+ "top_10_weight_share_pct": top_10_weight_share_pct,
+ "top_20_weight_share_pct": top_20_weight_share_pct,
+ "top_candidates": candidate_df.head(20).to_dict("records"),
+ "synthetic_archetypes": synthetic_archetypes.to_dict("records"),
+ "actual_support_archetypes": actual_archetypes.to_dict("records"),
+ "archetype_gap_table": comparison.to_dict("records"),
+ "weighted_metrics": weighted_metrics,
+ }
+
+
+def main() -> int:
+ args = parse_args()
+ set_long_term_target_source(args.target_source)
+
+ actual_summary = build_actual_tax_unit_summary(args.base_dataset)
+ base_aggregates = load_base_aggregates(args.base_dataset)
+ ss_scale = load_ssa_benefit_projections(args.year) / max(
+ base_aggregates["weighted_ss_total"],
+ 1.0,
+ )
+ earnings_scale = load_taxable_payroll_projections(args.year) / max(
+ base_aggregates["weighted_payroll_total"],
+ 1.0,
+ )
+ pools = build_quantile_pools(
+ actual_summary,
+ ss_scale=ss_scale,
+ earnings_scale=earnings_scale,
+ )
+ candidates = generate_synthetic_candidates(
+ pools,
+ payroll_cap=load_policyengine_social_security_cap(args.year),
+ )
+ weights, solve_info = solve_synthetic_support(candidates, year=args.year)
+ solution_summary = summarize_solution(candidates, weights, actual_summary)
+ donor_probe = summarize_donor_probe(
+ candidates,
+ weights,
+ actual_summary,
+ ss_scale=ss_scale,
+ earnings_scale=earnings_scale,
+ top_n_targets=args.donor_probe_top_n,
+ donors_per_target=args.donor_probe_k,
+ )
+ _, _, donor_backed_clone_probe = build_donor_backed_clones(
+ candidates,
+ weights,
+ actual_summary,
+ ss_scale=ss_scale,
+ earnings_scale=earnings_scale,
+ top_n_targets=args.donor_probe_top_n,
+ donors_per_target=args.donor_probe_k,
+ )
+ (
+ role_composite_candidates,
+ role_composite_prior,
+ role_donor_composite_probe,
+ ) = build_role_donor_composites(
+ candidates,
+ weights,
+ actual_summary,
+ ss_scale=ss_scale,
+ earnings_scale=earnings_scale,
+ top_n_targets=args.donor_probe_top_n,
+ older_donors_per_target=args.donor_probe_k,
+ worker_donors_per_target=args.donor_probe_k,
+ )
+ role_donor_composite_result: dict[str, object] = {
+ "candidate_count": int(len(role_composite_candidates)),
+ "prior_summary": role_donor_composite_probe["prior_summary"],
+ "skipped_targets": role_donor_composite_probe["skipped_targets"],
+ "composite_records": role_donor_composite_probe["composite_records"],
+ }
+ if role_composite_candidates:
+ role_weights, role_solve_info = solve_synthetic_support(
+ role_composite_candidates,
+ year=args.year,
+ baseline_weights=role_composite_prior,
+ )
+ role_donor_composite_result["solve_info"] = role_solve_info
+ role_donor_composite_result["solution_summary"] = summarize_solution(
+ role_composite_candidates,
+ role_weights,
+ actual_summary,
+ )
+ else:
+ role_donor_composite_result["solve_info"] = {"status": "no_candidates"}
+ epsilon_path_results = []
+ for epsilon in parse_epsilon_path(args.epsilon_path):
+ epsilon_weights, epsilon_solve_info = solve_synthetic_support(
+ candidates,
+ year=args.year,
+ max_constraint_error_pct=epsilon,
+ warm_weights=weights,
+ )
+ epsilon_path_results.append(
+ {
+ "epsilon_pct": float(epsilon),
+ "solve_info": epsilon_solve_info,
+ "solution_summary": summarize_solution(
+ candidates,
+ epsilon_weights,
+ actual_summary,
+ ),
+ "vs_exact": summarize_solution_diff(
+ candidates,
+ weights,
+ epsilon_weights,
+ ),
+ }
+ )
+
+ report = {
+ "year": args.year,
+ "target_source": args.target_source,
+ "base_dataset": args.base_dataset,
+ "solve_info": solve_info,
+ "targets": {
+ "ss_total": float(load_ssa_benefit_projections(args.year)),
+ "taxable_payroll": float(load_taxable_payroll_projections(args.year)),
+ },
+ "macro_scales": {
+ "ss_scale": float(ss_scale),
+ "earnings_scale": float(earnings_scale),
+ **base_aggregates,
+ },
+ "actual_support_tax_unit_count": int(len(actual_summary)),
+ "synthetic_solution": solution_summary,
+ "donor_probe": donor_probe,
+ "donor_backed_clone_probe": donor_backed_clone_probe,
+ "role_donor_composite_probe": role_donor_composite_result,
+ "epsilon_path": epsilon_path_results,
+ }
+
+ payload = json.dumps(report, indent=2)
+ if args.output is None:
+ print(payload)
+ else:
+ args.output.parent.mkdir(parents=True, exist_ok=True)
+ args.output.write_text(payload + "\n", encoding="utf-8")
+ print(args.output)
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/rebuild_calibration_manifest.py b/policyengine_us_data/datasets/cps/long_term/rebuild_calibration_manifest.py
new file mode 100644
index 000000000..bcae97125
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/rebuild_calibration_manifest.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import argparse
+
+from calibration_artifacts import rebuild_dataset_manifest_with_target_source
+from ssa_data import describe_long_term_target_source
+
+
+def parse_args() -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ description="Rebuild calibration_manifest.json from existing metadata sidecars.",
+ )
+ parser.add_argument(
+ "output_dir",
+ help="Directory containing YYYY.h5 and YYYY.h5.metadata.json files.",
+ )
+ parser.add_argument(
+ "--target-source",
+ help="Optional target source name to stamp into each sidecar while rebuilding the manifest.",
+ )
+ return parser.parse_args()
+
+
+def main() -> int:
+ args = parse_args()
+ target_source = (
+ describe_long_term_target_source(args.target_source)
+ if args.target_source
+ else None
+ )
+ manifest_path = rebuild_dataset_manifest_with_target_source(
+ args.output_dir,
+ target_source=target_source,
+ )
+ print(f"Rebuilt {manifest_path}")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/run_household_projection.py b/policyengine_us_data/datasets/cps/long_term/run_household_projection.py
index 1413efe4b..5fd8008de 100644
--- a/policyengine_us_data/datasets/cps/long_term/run_household_projection.py
+++ b/policyengine_us_data/datasets/cps/long_term/run_household_projection.py
@@ -3,20 +3,35 @@
Usage:
+ python run_household_projection.py [START_YEAR] [END_YEAR] [--profile PROFILE] [--target-source SOURCE] [--tax-assumption ASSUMPTION] [--output-dir DIR] [--save-h5] [--allow-validation-failures]
+ python run_household_projection.py [START_YEAR] [END_YEAR] [--profile PROFILE] [--target-source SOURCE] [--support-augmentation-profile donor-backed-synthetic-v1] [--support-augmentation-target-year YEAR]
+ python run_household_projection.py [START_YEAR] [END_YEAR] [--profile PROFILE] [--target-source SOURCE] [--support-augmentation-profile donor-backed-composite-v1] [--support-augmentation-target-year YEAR] [--support-augmentation-align-to-run-year] [--support-augmentation-blueprint-base-weight-scale SCALE]
python run_household_projection.py [START_YEAR] [END_YEAR] [--greg] [--use-ss] [--use-payroll] [--use-h6-reform] [--use-tob] [--save-h5]
START_YEAR: Optional starting year (default: 2025)
END_YEAR: Optional ending year (default: 2035)
+ --profile: Named calibration contract (recommended)
+ --target-source: Named long-term target source package
+ --tax-assumption: Long-run federal tax assumption (`trustees-core-thresholds-v1` by default)
+ --output-dir: Output directory for generated H5 files and metadata (default: ./projected_datasets)
+ --allow-validation-failures: Record validation issues in metadata and continue instead of aborting the run
+ --support-augmentation-profile: Experimental late-year support expansion profile (donor-backed-synthetic-v1 or donor-backed-composite-v1)
+ --support-augmentation-target-year: Year whose extreme support is used to build the supplement (defaults to END_YEAR)
+ --support-augmentation-align-to-run-year: Rebuild the augmentation support for each run year instead of reusing one target-year support
+ --support-augmentation-blueprint-base-weight-scale: Prior scaling applied to original households when target-year donor-composite blueprint calibration is active
--greg: Use GREG calibration instead of IPF (optional)
--use-ss: Include Social Security benefit totals as calibration target (requires --greg)
--use-payroll: Include taxable payroll totals as calibration target (requires --greg)
--use-h6-reform: Include H6 reform income impact ratio as calibration target (requires --greg)
- --use-tob: Include TOB (Taxation of Benefits) revenue as calibration target (requires --greg)
+ --use-tob: Include TOB (Taxation of Benefits) revenue as a hard calibration target (requires --greg)
--save-h5: Save year-specific .h5 files with calibrated weights to ./projected_datasets/
Examples:
- python run_household_projection.py 2045 2045 --greg --use-ss # single year
- python run_household_projection.py 2025 2100 --greg --use-ss --use-payroll --use-tob --save-h5
+ python run_household_projection.py 2045 2045 --profile ss --target-source trustees_2025_current_law --save-h5
+ python run_household_projection.py 2025 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --save-h5
+ python run_household_projection.py 2075 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-synthetic-v1 --support-augmentation-target-year 2100 --allow-validation-failures
+ python run_household_projection.py 2100 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --support-augmentation-blueprint-base-weight-scale 0.5 --allow-validation-failures --save-h5
+ python run_household_projection.py 2075 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-align-to-run-year --support-augmentation-blueprint-base-weight-scale 0.5 --allow-validation-failures
"""
import sys
@@ -29,14 +44,43 @@
from policyengine_us import Microsimulation
from ssa_data import (
+ describe_long_term_target_source,
+ get_long_term_target_source,
load_ssa_age_projections,
load_ssa_benefit_projections,
load_taxable_payroll_projections,
+ set_long_term_target_source,
+)
+from calibration import build_calibration_audit, calibrate_weights
+from calibration_artifacts import (
+ update_dataset_manifest,
+ write_support_augmentation_report,
+ write_year_metadata,
+)
+from calibration_profiles import (
+ approximate_window_for_year,
+ build_profile_from_flags,
+ classify_calibration_quality,
+ get_profile,
+ validate_calibration_audit,
)
-from calibration import calibrate_weights
from projection_utils import (
+ aggregate_age_targets,
+ aggregate_household_age_matrix,
+ build_age_bins,
build_household_age_matrix,
create_household_year_h5,
+ validate_projected_social_security_cap,
+)
+from tax_assumptions import (
+ TRUSTEES_CORE_THRESHOLD_ASSUMPTION,
+ create_wage_indexed_core_thresholds_reform,
+ get_long_run_tax_assumption_metadata,
+)
+from prototype_synthetic_2100_support import (
+ build_role_composite_calibration_blueprint,
+ build_donor_backed_augmented_dataset,
+ build_role_composite_augmented_dataset,
)
@@ -226,6 +270,136 @@ def create_h6_reform():
BASE_DATASET_PATH = DATASET_OPTIONS[SELECTED_DATASET]["path"]
BASE_YEAR = DATASET_OPTIONS[SELECTED_DATASET]["base_year"]
+SUPPORTED_AUGMENTATION_PROFILES = {
+ "donor-backed-synthetic-v1",
+ "donor-backed-composite-v1",
+}
+SUPPORTED_TAX_ASSUMPTIONS = {
+ "current-law-literal",
+ TRUSTEES_CORE_THRESHOLD_ASSUMPTION["name"],
+}
+
+
+PROFILE_NAME = None
+if "--profile" in sys.argv:
+ profile_index = sys.argv.index("--profile")
+ if profile_index + 1 >= len(sys.argv):
+ raise ValueError("--profile requires a profile name")
+ PROFILE_NAME = sys.argv[profile_index + 1]
+ del sys.argv[profile_index : profile_index + 2]
+
+TARGET_SOURCE = None
+if "--target-source" in sys.argv:
+ source_index = sys.argv.index("--target-source")
+ if source_index + 1 >= len(sys.argv):
+ raise ValueError("--target-source requires a source name")
+ TARGET_SOURCE = sys.argv[source_index + 1]
+ del sys.argv[source_index : source_index + 2]
+
+OUTPUT_DIR = "./projected_datasets"
+if "--output-dir" in sys.argv:
+ output_dir_index = sys.argv.index("--output-dir")
+ if output_dir_index + 1 >= len(sys.argv):
+ raise ValueError("--output-dir requires a directory path")
+ OUTPUT_DIR = sys.argv[output_dir_index + 1]
+ del sys.argv[output_dir_index : output_dir_index + 2]
+
+SUPPORT_AUGMENTATION_PROFILE = None
+if "--support-augmentation-profile" in sys.argv:
+ augmentation_index = sys.argv.index("--support-augmentation-profile")
+ if augmentation_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-profile requires a profile name")
+ SUPPORT_AUGMENTATION_PROFILE = sys.argv[augmentation_index + 1]
+ del sys.argv[augmentation_index : augmentation_index + 2]
+
+SUPPORT_AUGMENTATION_TARGET_YEAR = None
+if "--support-augmentation-target-year" in sys.argv:
+ target_year_index = sys.argv.index("--support-augmentation-target-year")
+ if target_year_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-target-year requires a year")
+ SUPPORT_AUGMENTATION_TARGET_YEAR = int(sys.argv[target_year_index + 1])
+ del sys.argv[target_year_index : target_year_index + 2]
+
+SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR = (
+ "--support-augmentation-align-to-run-year" in sys.argv
+)
+if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR:
+ sys.argv.remove("--support-augmentation-align-to-run-year")
+
+SUPPORT_AUGMENTATION_START_YEAR = 2075
+if "--support-augmentation-start-year" in sys.argv:
+ start_year_index = sys.argv.index("--support-augmentation-start-year")
+ if start_year_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-start-year requires a year")
+ SUPPORT_AUGMENTATION_START_YEAR = int(sys.argv[start_year_index + 1])
+ del sys.argv[start_year_index : start_year_index + 2]
+
+SUPPORT_AUGMENTATION_TOP_N_TARGETS = 20
+if "--support-augmentation-top-n-targets" in sys.argv:
+ top_n_index = sys.argv.index("--support-augmentation-top-n-targets")
+ if top_n_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-top-n-targets requires an integer")
+ SUPPORT_AUGMENTATION_TOP_N_TARGETS = int(sys.argv[top_n_index + 1])
+ del sys.argv[top_n_index : top_n_index + 2]
+
+SUPPORT_AUGMENTATION_DONORS_PER_TARGET = 5
+if "--support-augmentation-donors-per-target" in sys.argv:
+ donor_index = sys.argv.index("--support-augmentation-donors-per-target")
+ if donor_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-donors-per-target requires an integer")
+ SUPPORT_AUGMENTATION_DONORS_PER_TARGET = int(sys.argv[donor_index + 1])
+ del sys.argv[donor_index : donor_index + 2]
+
+SUPPORT_AUGMENTATION_MAX_DISTANCE = 3.0
+if "--support-augmentation-max-distance" in sys.argv:
+ distance_index = sys.argv.index("--support-augmentation-max-distance")
+ if distance_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-max-distance requires a float")
+ SUPPORT_AUGMENTATION_MAX_DISTANCE = float(sys.argv[distance_index + 1])
+ del sys.argv[distance_index : distance_index + 2]
+
+SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE = 0.1
+if "--support-augmentation-clone-weight-scale" in sys.argv:
+ weight_scale_index = sys.argv.index("--support-augmentation-clone-weight-scale")
+ if weight_scale_index + 1 >= len(sys.argv):
+ raise ValueError("--support-augmentation-clone-weight-scale requires a float")
+ SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE = float(sys.argv[weight_scale_index + 1])
+ del sys.argv[weight_scale_index : weight_scale_index + 2]
+
+SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE = 0.5
+if "--support-augmentation-blueprint-base-weight-scale" in sys.argv:
+ blueprint_scale_index = sys.argv.index(
+ "--support-augmentation-blueprint-base-weight-scale"
+ )
+ if blueprint_scale_index + 1 >= len(sys.argv):
+ raise ValueError(
+ "--support-augmentation-blueprint-base-weight-scale requires a float"
+ )
+ SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE = float(
+ sys.argv[blueprint_scale_index + 1]
+ )
+ del sys.argv[blueprint_scale_index : blueprint_scale_index + 2]
+
+TAX_ASSUMPTION = TRUSTEES_CORE_THRESHOLD_ASSUMPTION["name"]
+if "--tax-assumption" in sys.argv:
+ tax_assumption_index = sys.argv.index("--tax-assumption")
+ if tax_assumption_index + 1 >= len(sys.argv):
+ raise ValueError("--tax-assumption requires a value")
+ TAX_ASSUMPTION = sys.argv[tax_assumption_index + 1]
+ del sys.argv[tax_assumption_index : tax_assumption_index + 2]
+if TAX_ASSUMPTION not in SUPPORTED_TAX_ASSUMPTIONS:
+ raise ValueError(
+ "Unsupported --tax-assumption: "
+ f"{TAX_ASSUMPTION}. Valid values: {sorted(SUPPORTED_TAX_ASSUMPTIONS)}"
+ )
+
+ALLOW_VALIDATION_FAILURES = "--allow-validation-failures" in sys.argv
+if ALLOW_VALIDATION_FAILURES:
+ sys.argv.remove("--allow-validation-failures")
+ALLOW_VALIDATION_FAILURES = ALLOW_VALIDATION_FAILURES or (
+ os.environ.get("PEUD_ALLOW_INVALID_ARTIFACTS", "").lower() in {"1", "true", "yes"}
+)
+
USE_GREG = "--greg" in sys.argv
if USE_GREG:
@@ -251,7 +425,6 @@ def create_h6_reform():
if not USE_GREG:
print("Warning: --use-h6-reform requires --greg, enabling GREG automatically")
USE_GREG = True
- from ssa_data import load_h6_income_rate_change
USE_TOB = "--use-tob" in sys.argv
if USE_TOB:
@@ -259,7 +432,6 @@ def create_h6_reform():
if not USE_GREG:
print("Warning: --use-tob requires --greg, enabling GREG automatically")
USE_GREG = True
- from ssa_data import load_oasdi_tob_projections, load_hi_tob_projections
SAVE_H5 = "--save-h5" in sys.argv
if SAVE_H5:
@@ -268,6 +440,92 @@ def create_h6_reform():
START_YEAR = int(sys.argv[1]) if len(sys.argv) > 1 else 2025
END_YEAR = int(sys.argv[2]) if len(sys.argv) > 2 else 2035
+if SUPPORT_AUGMENTATION_TARGET_YEAR is None:
+ SUPPORT_AUGMENTATION_TARGET_YEAR = END_YEAR
+
+if SUPPORT_AUGMENTATION_PROFILE is not None:
+ if SUPPORT_AUGMENTATION_PROFILE not in SUPPORTED_AUGMENTATION_PROFILES:
+ raise ValueError(
+ f"Unsupported support augmentation profile: {SUPPORT_AUGMENTATION_PROFILE}"
+ )
+ if START_YEAR < SUPPORT_AUGMENTATION_START_YEAR:
+ raise ValueError(
+ "Support augmentation is only supported for late-year runs. "
+ f"Received START_YEAR={START_YEAR}, requires >= "
+ f"{SUPPORT_AUGMENTATION_START_YEAR}."
+ )
+
+legacy_flags_used = any([USE_GREG, USE_SS, USE_PAYROLL, USE_H6_REFORM, USE_TOB])
+if PROFILE_NAME and legacy_flags_used:
+ raise ValueError("Use either --profile or legacy calibration flags, not both.")
+
+if PROFILE_NAME:
+ PROFILE = get_profile(PROFILE_NAME)
+else:
+ PROFILE = build_profile_from_flags(
+ use_greg=USE_GREG,
+ use_ss=USE_SS,
+ use_payroll=USE_PAYROLL,
+ use_h6_reform=USE_H6_REFORM,
+ use_tob=USE_TOB,
+ )
+
+if TARGET_SOURCE:
+ set_long_term_target_source(TARGET_SOURCE)
+TARGET_SOURCE = get_long_term_target_source()
+TARGET_SOURCE_METADATA = describe_long_term_target_source(TARGET_SOURCE)
+
+
+def _compose_reforms(*reforms):
+ reforms = tuple(reform for reform in reforms if reform is not None)
+ if not reforms:
+ return None
+ if len(reforms) == 1:
+ return reforms[0]
+ return reforms
+
+
+if TAX_ASSUMPTION == "current-law-literal":
+ ACTIVE_LONG_RUN_TAX_REFORM = None
+ LONG_RUN_TAX_ASSUMPTION_METADATA = {
+ "name": "current-law-literal",
+ "description": (
+ "Use the baseline PolicyEngine federal tax parameter uprating "
+ "without long-run Trustees-style wage-index overrides."
+ ),
+ "source": "PolicyEngine baseline",
+ "start_year": None,
+ "end_year": int(END_YEAR),
+ }
+else:
+ ACTIVE_LONG_RUN_TAX_REFORM = create_wage_indexed_core_thresholds_reform(
+ start_year=TRUSTEES_CORE_THRESHOLD_ASSUMPTION["start_year"],
+ end_year=END_YEAR,
+ )
+ LONG_RUN_TAX_ASSUMPTION_METADATA = get_long_run_tax_assumption_metadata(
+ TAX_ASSUMPTION,
+ end_year=END_YEAR,
+ )
+
+BASE_DATASET = BASE_DATASET_PATH
+SUPPORT_AUGMENTATION_METADATA = None
+SUPPORT_AUGMENTATION_REPORT = None
+MANIFEST_SUPPORT_AUGMENTATION_METADATA = None
+
+CALIBRATION_METHOD = PROFILE.calibration_method
+USE_GREG = CALIBRATION_METHOD == "greg"
+USE_SS = PROFILE.use_ss
+USE_PAYROLL = PROFILE.use_payroll
+USE_H6_REFORM = PROFILE.use_h6_reform
+USE_TOB = PROFILE.use_tob
+BENCHMARK_TOB = PROFILE.benchmark_tob
+
+if USE_H6_REFORM:
+ from ssa_data import load_h6_income_rate_change
+
+if USE_TOB or BENCHMARK_TOB:
+ from ssa_data import load_hi_tob_projections, load_oasdi_tob_projections
+
if USE_GREG:
try:
from samplics.weighting import SampleWeight
@@ -280,8 +538,6 @@ def create_h6_reform():
else:
calibrator = None
-OUTPUT_DIR = "./projected_datasets"
-
print("=" * 70)
print(f"HOUSEHOLD-LEVEL INCOME TAX PROJECTION: {START_YEAR}-{END_YEAR}")
print("=" * 70)
@@ -289,7 +545,21 @@ def create_h6_reform():
print(f" Base year: {BASE_YEAR} (CPS microdata)")
print(f" Projection: {START_YEAR}-{END_YEAR}")
print(f" Calculation level: HOUSEHOLD ONLY (simplified)")
-print(f" Calibration method: {'GREG' if USE_GREG else 'IPF'}")
+print(f" Calibration profile: {PROFILE.name}")
+print(f" Profile description: {PROFILE.description}")
+print(f" Target source: {TARGET_SOURCE}")
+print(f" Long-run tax assumption: {LONG_RUN_TAX_ASSUMPTION_METADATA['name']}")
+print(f" Calibration method: {CALIBRATION_METHOD.upper()}")
+if SUPPORT_AUGMENTATION_PROFILE:
+ print(f" Support augmentation: {SUPPORT_AUGMENTATION_PROFILE}")
+ if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR:
+ print(" Support augmentation target year: each run year")
+ else:
+ print(f" Support augmentation target year: {SUPPORT_AUGMENTATION_TARGET_YEAR}")
+ print(
+ " Support augmentation blueprint base-weight scale: "
+ f"{SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE}"
+ )
if USE_SS:
print(f" Including Social Security benefits constraint: Yes")
if USE_PAYROLL:
@@ -298,6 +568,8 @@ def create_h6_reform():
print(f" Including H6 reform income impact constraint: Yes")
if USE_TOB:
print(f" Including TOB revenue constraint: Yes")
+elif BENCHMARK_TOB:
+ print(f" Benchmarking TOB after calibration: Yes")
if SAVE_H5:
print(f" Saving year-specific .h5 files: Yes (to {OUTPUT_DIR}/)")
os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -307,6 +579,119 @@ def create_h6_reform():
est_time = (END_YEAR - START_YEAR + 1) * (3 if SAVE_H5 else 2)
print(f" Estimated time: ~{est_time:.0f} minutes")
+
+def _build_support_augmentation(
+ target_year: int,
+ *,
+ report_filename: str | None = None,
+):
+ if SUPPORT_AUGMENTATION_PROFILE is None:
+ return BASE_DATASET_PATH, None, None, None
+
+ if SUPPORT_AUGMENTATION_PROFILE == "donor-backed-synthetic-v1":
+ augmented_dataset, augmentation_report = build_donor_backed_augmented_dataset(
+ base_dataset=BASE_DATASET_PATH,
+ base_year=BASE_YEAR,
+ target_year=target_year,
+ top_n_targets=SUPPORT_AUGMENTATION_TOP_N_TARGETS,
+ donors_per_target=SUPPORT_AUGMENTATION_DONORS_PER_TARGET,
+ max_distance_for_clone=SUPPORT_AUGMENTATION_MAX_DISTANCE,
+ clone_weight_scale=SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE,
+ )
+ else:
+ augmented_dataset, augmentation_report = build_role_composite_augmented_dataset(
+ base_dataset=BASE_DATASET_PATH,
+ base_year=BASE_YEAR,
+ target_year=target_year,
+ top_n_targets=SUPPORT_AUGMENTATION_TOP_N_TARGETS,
+ donors_per_target=SUPPORT_AUGMENTATION_DONORS_PER_TARGET,
+ max_older_distance=SUPPORT_AUGMENTATION_MAX_DISTANCE,
+ max_worker_distance=SUPPORT_AUGMENTATION_MAX_DISTANCE,
+ clone_weight_scale=SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE,
+ )
+
+ report_path = write_support_augmentation_report(
+ OUTPUT_DIR,
+ augmentation_report,
+ filename=(
+ report_filename
+ if report_filename is not None
+ else "support_augmentation_report.json"
+ ),
+ )
+ year_metadata = {
+ "name": SUPPORT_AUGMENTATION_PROFILE,
+ "activation_start_year": SUPPORT_AUGMENTATION_START_YEAR,
+ "target_year": int(target_year),
+ "target_year_strategy": (
+ "run_year" if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR else "fixed"
+ ),
+ "top_n_targets": SUPPORT_AUGMENTATION_TOP_N_TARGETS,
+ "donors_per_target": SUPPORT_AUGMENTATION_DONORS_PER_TARGET,
+ "max_distance_for_clone": SUPPORT_AUGMENTATION_MAX_DISTANCE,
+ "clone_weight_scale": SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE,
+ "blueprint_base_weight_scale": (
+ SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE
+ ),
+ "report_file": report_path.name,
+ "report_summary": {
+ "base_household_count": augmentation_report["base_household_count"],
+ "augmented_household_count": augmentation_report[
+ "augmented_household_count"
+ ],
+ "base_person_count": augmentation_report["base_person_count"],
+ "augmented_person_count": augmentation_report["augmented_person_count"],
+ "clone_household_count": augmentation_report.get(
+ "clone_household_count", 0
+ ),
+ "successful_target_count": sum(
+ report["successful_clone_count"] > 0
+ for report in augmentation_report["target_reports"]
+ ),
+ "skipped_target_count": len(augmentation_report["skipped_targets"]),
+ },
+ }
+ manifest_metadata = {
+ "name": SUPPORT_AUGMENTATION_PROFILE,
+ "activation_start_year": SUPPORT_AUGMENTATION_START_YEAR,
+ "target_year": (
+ int(target_year) if not SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR else None
+ ),
+ "target_year_strategy": (
+ "run_year" if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR else "fixed"
+ ),
+ "top_n_targets": SUPPORT_AUGMENTATION_TOP_N_TARGETS,
+ "donors_per_target": SUPPORT_AUGMENTATION_DONORS_PER_TARGET,
+ "max_distance_for_clone": SUPPORT_AUGMENTATION_MAX_DISTANCE,
+ "clone_weight_scale": SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE,
+ "blueprint_base_weight_scale": (
+ SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE
+ ),
+ "report_file": (
+ None if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR else report_path.name
+ ),
+ }
+ return augmented_dataset, augmentation_report, year_metadata, manifest_metadata
+
+
+def _print_support_augmentation_summary(augmentation_report: dict) -> None:
+ print(
+ " Base households -> augmented households: "
+ f"{augmentation_report['base_household_count']:,} -> "
+ f"{augmentation_report['augmented_household_count']:,}"
+ )
+ print(
+ " Base people -> augmented people: "
+ f"{augmentation_report['base_person_count']:,} -> "
+ f"{augmentation_report['augmented_person_count']:,}"
+ )
+ print(
+ " Successful target clones: "
+ f"{sum(report['successful_clone_count'] > 0 for report in augmentation_report['target_reports'])}"
+ )
+ print(f" Skipped synthetic targets: {len(augmentation_report['skipped_targets'])}")
+
+
# =========================================================================
# STEP 1: LOAD SSA DEMOGRAPHIC PROJECTIONS
# =========================================================================
@@ -335,6 +720,58 @@ def create_h6_reform():
pop = target_matrix[:, idx].sum()
print(f" {y}: {pop / 1e6:6.1f}M")
+augmentation_cache: dict[int, tuple[object, dict, dict, dict]] = {}
+X = None
+hh_id_to_idx = None
+n_households = None
+household_ids_unique = None
+aggregated_age_cache: dict[int, tuple[np.ndarray, np.ndarray]] = {}
+
+if SUPPORT_AUGMENTATION_PROFILE in {
+ "donor-backed-synthetic-v1",
+ "donor-backed-composite-v1",
+}:
+ print("\n" + "=" * 70)
+ print("STEP 1B: BUILD DONOR-BACKED LATE-YEAR SUPPORT")
+ print("=" * 70)
+ if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR:
+ print(
+ " Dynamic mode: donor-backed support will be rebuilt separately for "
+ "each run year."
+ )
+ else:
+ (
+ BASE_DATASET,
+ SUPPORT_AUGMENTATION_REPORT,
+ SUPPORT_AUGMENTATION_METADATA,
+ MANIFEST_SUPPORT_AUGMENTATION_METADATA,
+ ) = _build_support_augmentation(
+ SUPPORT_AUGMENTATION_TARGET_YEAR,
+ )
+ augmentation_cache[SUPPORT_AUGMENTATION_TARGET_YEAR] = (
+ BASE_DATASET,
+ SUPPORT_AUGMENTATION_REPORT,
+ SUPPORT_AUGMENTATION_METADATA,
+ MANIFEST_SUPPORT_AUGMENTATION_METADATA,
+ )
+ _print_support_augmentation_summary(SUPPORT_AUGMENTATION_REPORT)
+
+if SUPPORT_AUGMENTATION_PROFILE is not None and SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR:
+ MANIFEST_SUPPORT_AUGMENTATION_METADATA = {
+ "name": SUPPORT_AUGMENTATION_PROFILE,
+ "activation_start_year": SUPPORT_AUGMENTATION_START_YEAR,
+ "target_year": None,
+ "target_year_strategy": "run_year",
+ "top_n_targets": SUPPORT_AUGMENTATION_TOP_N_TARGETS,
+ "donors_per_target": SUPPORT_AUGMENTATION_DONORS_PER_TARGET,
+ "max_distance_for_clone": SUPPORT_AUGMENTATION_MAX_DISTANCE,
+ "clone_weight_scale": SUPPORT_AUGMENTATION_CLONE_WEIGHT_SCALE,
+ "blueprint_base_weight_scale": (
+ SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE
+ ),
+ "report_file": None,
+ }
+
# =========================================================================
# STEP 2: BUILD HOUSEHOLD AGE MATRIX
# =========================================================================
@@ -342,7 +779,12 @@ def create_h6_reform():
print("STEP 2: BUILDING HOUSEHOLD AGE COMPOSITION")
print("=" * 70)
-sim = Microsimulation(dataset=BASE_DATASET_PATH)
+if SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR:
+ print(
+ "\nDynamic augmentation enabled; base support will be used before the activation year and rebuilt per-year after that."
+ )
+
+sim = Microsimulation(dataset=BASE_DATASET)
X, household_ids_unique, hh_id_to_idx = build_household_age_matrix(sim, n_ages)
n_households = len(household_ids_unique)
@@ -368,8 +810,6 @@ def create_h6_reform():
total_income_tax = np.zeros(n_years)
total_income_tax_baseline = np.zeros(n_years)
total_population = np.zeros(n_years)
-weights_matrix = np.zeros((n_households, n_years))
-baseline_weights_matrix = np.zeros((n_households, n_years))
process = psutil.Process()
print(f"\nInitial memory usage: {process.memory_info().rss / 1024**3:.2f} GB")
@@ -379,8 +819,43 @@ def create_h6_reform():
for year_idx in range(n_years):
year = START_YEAR + year_idx
-
- sim = Microsimulation(dataset=BASE_DATASET_PATH)
+ current_dataset = BASE_DATASET
+ current_support_augmentation_report = SUPPORT_AUGMENTATION_REPORT
+ current_support_augmentation_metadata = SUPPORT_AUGMENTATION_METADATA
+ current_X = X
+ current_hh_id_to_idx = hh_id_to_idx
+
+ if (
+ SUPPORT_AUGMENTATION_PROFILE is not None
+ and SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR
+ and year >= SUPPORT_AUGMENTATION_START_YEAR
+ ):
+ cached = augmentation_cache.get(year)
+ if cached is None:
+ cached = _build_support_augmentation(
+ year,
+ report_filename=f"support_augmentation_report_{year}.json",
+ )
+ augmentation_cache[year] = cached
+ (
+ current_dataset,
+ current_support_augmentation_report,
+ current_support_augmentation_metadata,
+ _,
+ ) = cached
+ if year in display_years:
+ print(f" [DEBUG {year}] Rebuilt support augmentation for run year {year}")
+ _print_support_augmentation_summary(current_support_augmentation_report)
+ sim = Microsimulation(
+ dataset=current_dataset,
+ reform=ACTIVE_LONG_RUN_TAX_REFORM,
+ )
+ current_X, _, current_hh_id_to_idx = build_household_age_matrix(sim, n_ages)
+ else:
+ sim = Microsimulation(
+ dataset=current_dataset,
+ reform=ACTIVE_LONG_RUN_TAX_REFORM,
+ )
income_tax_hh = sim.calculate("income_tax", period=year, map_to="household")
income_tax_baseline_total = income_tax_hh.sum()
@@ -390,7 +865,11 @@ def create_h6_reform():
baseline_weights = household_microseries.weights.values
household_ids_hh = household_microseries.values
- assert len(household_ids_hh) == n_households
+ if not (
+ SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR
+ and current_support_augmentation_report is not None
+ ):
+ assert len(household_ids_hh) == n_households
ss_values = None
ss_target = None
@@ -407,6 +886,10 @@ def create_h6_reform():
payroll_values = None
payroll_target = None
if USE_PAYROLL:
+ payroll_cap = validate_projected_social_security_cap(
+ sim.tax_benefit_system.parameters,
+ year,
+ )
# SSA taxable payroll = W-2 wages capped at wage base + SE income within remaining cap room
taxable_wages_hh = sim.calculate(
"taxable_earnings_for_social_security",
@@ -422,6 +905,7 @@ def create_h6_reform():
payroll_target = load_taxable_payroll_projections(year)
if year in display_years:
payroll_baseline = np.sum(payroll_values * baseline_weights)
+ print(f" [DEBUG {year}] Payroll cap: ${payroll_cap:,.0f}")
print(
f" [DEBUG {year}] Payroll baseline: ${payroll_baseline / 1e9:.1f}B, target: ${payroll_target / 1e9:.1f}B"
)
@@ -440,7 +924,10 @@ def create_h6_reform():
else:
# Create and apply H6 reform
h6_reform = create_h6_reform()
- reform_sim = Microsimulation(dataset=BASE_DATASET_PATH, reform=h6_reform)
+ reform_sim = Microsimulation(
+ dataset=current_dataset,
+ reform=_compose_reforms(ACTIVE_LONG_RUN_TAX_REFORM, h6_reform),
+ )
# Calculate reform income tax
income_tax_reform_hh = reform_sim.calculate(
@@ -473,7 +960,7 @@ def create_h6_reform():
oasdi_tob_target = None
hi_tob_values = None
hi_tob_target = None
- if USE_TOB:
+ if USE_TOB or BENCHMARK_TOB:
oasdi_tob_hh = sim.calculate(
"tob_revenue_oasdi", period=year, map_to="household"
)
@@ -496,38 +983,181 @@ def create_h6_reform():
f" [DEBUG {year}] HI TOB baseline: ${hi_baseline / 1e9:.1f}B, target: ${hi_tob_target / 1e9:.1f}B"
)
- y_target = target_matrix[:, year_idx]
+ approximate_window = approximate_window_for_year(PROFILE, year)
+ age_bucket_size = (
+ approximate_window.age_bucket_size if approximate_window is not None else None
+ )
+ if age_bucket_size and age_bucket_size > 1:
+ if (
+ SUPPORT_AUGMENTATION_ALIGN_TO_RUN_YEAR
+ and current_support_augmentation_report is not None
+ ):
+ age_bins = build_age_bins(n_ages=n_ages, bucket_size=age_bucket_size)
+ X_current = aggregate_household_age_matrix(current_X, age_bins)
+ aggregated_target_matrix = aggregate_age_targets(target_matrix, age_bins)
+ else:
+ if age_bucket_size not in aggregated_age_cache:
+ age_bins = build_age_bins(n_ages=n_ages, bucket_size=age_bucket_size)
+ aggregated_age_cache[age_bucket_size] = (
+ aggregate_household_age_matrix(current_X, age_bins),
+ aggregate_age_targets(target_matrix, age_bins),
+ )
+ X_current, aggregated_target_matrix = aggregated_age_cache[age_bucket_size]
+ y_target = aggregated_target_matrix[:, year_idx]
+ else:
+ X_current = current_X
+ y_target = target_matrix[:, year_idx]
+ age_bucket_size = 1
+
+ X_actual_current = X_current
+ ss_values_actual = None if ss_values is None else np.asarray(ss_values, dtype=float)
+ payroll_values_actual = (
+ None if payroll_values is None else np.asarray(payroll_values, dtype=float)
+ )
+ calibration_baseline_weights = baseline_weights
+ X_calibration = X_current.copy()
+ ss_values_calibration = (
+ None if ss_values_actual is None else ss_values_actual.copy()
+ )
+ payroll_values_calibration = (
+ None if payroll_values_actual is None else payroll_values_actual.copy()
+ )
+ blueprint_summary = None
+ if (
+ SUPPORT_AUGMENTATION_PROFILE == "donor-backed-composite-v1"
+ and current_support_augmentation_report is not None
+ and year >= SUPPORT_AUGMENTATION_START_YEAR
+ ):
+ calibration_blueprint = build_role_composite_calibration_blueprint(
+ current_support_augmentation_report,
+ year=year,
+ age_bins=build_age_bins(n_ages=n_ages, bucket_size=age_bucket_size),
+ hh_id_to_idx=current_hh_id_to_idx,
+ baseline_weights=baseline_weights,
+ base_weight_scale=SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE,
+ )
+ if calibration_blueprint is not None:
+ calibration_baseline_weights = calibration_blueprint["baseline_weights"]
+ for idx, age_vector in calibration_blueprint["age_overrides"].items():
+ X_calibration[idx] = age_vector
+ if ss_values_calibration is not None:
+ for idx, target_value in calibration_blueprint["ss_overrides"].items():
+ ss_values_calibration[idx] = target_value
+ if payroll_values_calibration is not None:
+ for idx, target_value in calibration_blueprint[
+ "payroll_overrides"
+ ].items():
+ payroll_values_calibration[idx] = target_value
+ blueprint_summary = calibration_blueprint["summary"]
+ if year in display_years:
+ print(
+ f" [DEBUG {year}] Using support blueprint for "
+ f"{blueprint_summary['clone_household_count']} clone households "
+ f"(base-weight scale {blueprint_summary['base_weight_scale']:.3f})"
+ )
- w_new, iterations = calibrate_weights(
- X=X,
+ w_new, iterations, calibration_event = calibrate_weights(
+ X=X_calibration,
y_target=y_target,
- baseline_weights=baseline_weights,
- method="greg" if USE_GREG else "ipf",
+ baseline_weights=calibration_baseline_weights,
+ method=CALIBRATION_METHOD,
calibrator=calibrator,
- ss_values=ss_values,
+ ss_values=ss_values_calibration,
ss_target=ss_target,
- payroll_values=payroll_values,
+ payroll_values=payroll_values_calibration,
payroll_target=payroll_target,
h6_income_values=h6_income_values,
h6_revenue_target=h6_revenue_target,
- oasdi_tob_values=oasdi_tob_values,
- oasdi_tob_target=oasdi_tob_target,
- hi_tob_values=hi_tob_values,
- hi_tob_target=hi_tob_target,
- n_ages=n_ages,
+ oasdi_tob_values=oasdi_tob_values if USE_TOB else None,
+ oasdi_tob_target=oasdi_tob_target if USE_TOB else None,
+ hi_tob_values=hi_tob_values if USE_TOB else None,
+ hi_tob_target=hi_tob_target if USE_TOB else None,
+ n_ages=X_current.shape[1],
max_iters=100,
tol=1e-6,
verbose=False,
+ allow_fallback_to_ipf=PROFILE.allow_greg_fallback,
+ allow_approximate_entropy=approximate_window is not None,
+ approximate_max_error_pct=(
+ approximate_window.max_constraint_error_pct
+ if approximate_window is not None
+ else None
+ ),
)
- if year in display_years and USE_GREG:
- neg_mask = w_new < 0
- n_neg = neg_mask.sum()
+ calibration_audit = build_calibration_audit(
+ X=X_actual_current,
+ y_target=y_target,
+ weights=w_new,
+ baseline_weights=calibration_baseline_weights,
+ calibration_event=calibration_event,
+ ss_values=ss_values_actual,
+ ss_target=ss_target,
+ payroll_values=payroll_values_actual,
+ payroll_target=payroll_target,
+ h6_income_values=h6_income_values,
+ h6_revenue_target=h6_revenue_target,
+ oasdi_tob_values=oasdi_tob_values if USE_TOB else None,
+ oasdi_tob_target=oasdi_tob_target if USE_TOB else None,
+ hi_tob_values=hi_tob_values if USE_TOB else None,
+ hi_tob_target=hi_tob_target if USE_TOB else None,
+ )
+ if blueprint_summary is not None:
+ calibration_audit["support_blueprint"] = blueprint_summary
+ if BENCHMARK_TOB and oasdi_tob_values is not None and hi_tob_values is not None:
+ calibration_audit["benchmarks"] = {
+ "oasdi_tob": {
+ "target": float(oasdi_tob_target),
+ "achieved": float(np.sum(oasdi_tob_values * w_new)),
+ },
+ "hi_tob": {
+ "target": float(hi_tob_target),
+ "achieved": float(np.sum(hi_tob_values * w_new)),
+ },
+ }
+ for benchmark in calibration_audit["benchmarks"].values():
+ benchmark["error"] = benchmark["achieved"] - benchmark["target"]
+ benchmark["pct_error"] = (
+ 0.0
+ if benchmark["target"] == 0
+ else (benchmark["error"] / benchmark["target"] * 100)
+ )
+ benchmark["source"] = TARGET_SOURCE
+ calibration_audit["calibration_quality"] = classify_calibration_quality(
+ calibration_audit,
+ PROFILE,
+ year=year,
+ )
+ calibration_audit["age_bucket_size"] = age_bucket_size
+ calibration_audit["age_bucket_count"] = int(X_current.shape[1])
+
+ validation_issues = validate_calibration_audit(
+ calibration_audit,
+ PROFILE,
+ year=year,
+ )
+ calibration_audit["validation_issues"] = validation_issues
+ calibration_audit["validation_passed"] = not bool(validation_issues)
+ if validation_issues:
+ issue_text = "; ".join(validation_issues)
+ if not ALLOW_VALIDATION_FAILURES:
+ raise RuntimeError(
+ f"Calibration validation failed for {year}: {issue_text}"
+ )
+ print(
+ f" [WARN {year}] Validation issues recorded but not fatal: {issue_text}",
+ file=sys.stderr,
+ )
+
+ if year in display_years and CALIBRATION_METHOD in {"greg", "entropy"}:
+ n_neg = calibration_audit["negative_weight_count"]
if n_neg > 0:
- pct_neg = 100 * n_neg / len(w_new)
- max_neg = np.abs(w_new[neg_mask]).max()
+ pct_neg = calibration_audit["negative_weight_pct"]
+ hh_pct_neg = calibration_audit.get("negative_weight_household_pct", 0.0)
+ max_neg = calibration_audit["largest_negative_weight"]
print(
- f" [DEBUG {year}] Negative weights: {n_neg} ({pct_neg:.2f}%), "
+ f" [DEBUG {year}] Negative weights: {n_neg} households "
+ f"({hh_pct_neg:.2f}% of households, {pct_neg:.2f}% of weight mass), "
f"largest: {max_neg:,.0f}"
)
else:
@@ -535,45 +1165,89 @@ def create_h6_reform():
if year in display_years and (USE_SS or USE_PAYROLL or USE_H6_REFORM or USE_TOB):
if USE_SS:
- ss_achieved = np.sum(ss_values * w_new)
+ ss_stats = calibration_audit["constraints"]["ss_total"]
print(
- f" [DEBUG {year}] SS achieved: ${ss_achieved / 1e9:.1f}B (error: ${abs(ss_achieved - ss_target) / 1e6:.1f}M, {(ss_achieved - ss_target) / ss_target * 100:.3f}%)"
+ f" [DEBUG {year}] SS achieved: ${ss_stats['achieved'] / 1e9:.1f}B "
+ f"(error: ${abs(ss_stats['error']) / 1e6:.1f}M, "
+ f"{ss_stats['pct_error']:.3f}%)"
)
if USE_PAYROLL:
- payroll_achieved = np.sum(payroll_values * w_new)
+ payroll_stats = calibration_audit["constraints"]["payroll_total"]
print(
- f" [DEBUG {year}] Payroll achieved: ${payroll_achieved / 1e9:.1f}B (error: ${abs(payroll_achieved - payroll_target) / 1e6:.1f}M, {(payroll_achieved - payroll_target) / payroll_target * 100:.3f}%)"
+ f" [DEBUG {year}] Payroll achieved: ${payroll_stats['achieved'] / 1e9:.1f}B "
+ f"(error: ${abs(payroll_stats['error']) / 1e6:.1f}M, "
+ f"{payroll_stats['pct_error']:.3f}%)"
)
if USE_H6_REFORM and h6_revenue_target is not None:
- h6_revenue_achieved = np.sum(h6_income_values * w_new)
- error_pct = (
- (h6_revenue_achieved - h6_revenue_target) / abs(h6_revenue_target) * 100
- if h6_revenue_target != 0
- else 0
- )
+ h6_stats = calibration_audit["constraints"]["h6_revenue"]
print(
- f" [DEBUG {year}] H6 achieved revenue: ${h6_revenue_achieved / 1e9:.3f}B (error: ${abs(h6_revenue_achieved - h6_revenue_target) / 1e6:.1f}M, {error_pct:.3f}%)"
+ f" [DEBUG {year}] H6 achieved revenue: ${h6_stats['achieved'] / 1e9:.3f}B "
+ f"(error: ${abs(h6_stats['error']) / 1e6:.1f}M, "
+ f"{h6_stats['pct_error']:.3f}%)"
)
if USE_TOB:
- oasdi_achieved = np.sum(oasdi_tob_values * w_new)
- hi_achieved = np.sum(hi_tob_values * w_new)
+ oasdi_stats = calibration_audit["constraints"]["oasdi_tob"]
+ hi_stats = calibration_audit["constraints"]["hi_tob"]
print(
- f" [DEBUG {year}] OASDI TOB achieved: ${oasdi_achieved / 1e9:.1f}B (error: ${abs(oasdi_achieved - oasdi_tob_target) / 1e6:.1f}M, {(oasdi_achieved - oasdi_tob_target) / oasdi_tob_target * 100:.3f}%)"
+ f" [DEBUG {year}] OASDI TOB achieved: ${oasdi_stats['achieved'] / 1e9:.1f}B "
+ f"(error: ${abs(oasdi_stats['error']) / 1e6:.1f}M, "
+ f"{oasdi_stats['pct_error']:.3f}%)"
)
print(
- f" [DEBUG {year}] HI TOB achieved: ${hi_achieved / 1e9:.1f}B (error: ${abs(hi_achieved - hi_tob_target) / 1e6:.1f}M, {(hi_achieved - hi_tob_target) / hi_tob_target * 100:.3f}%)"
+ f" [DEBUG {year}] HI TOB achieved: ${hi_stats['achieved'] / 1e9:.1f}B "
+ f"(error: ${abs(hi_stats['error']) / 1e6:.1f}M, "
+ f"{hi_stats['pct_error']:.3f}%)"
)
+ if year in display_years and BENCHMARK_TOB:
+ oasdi_stats = calibration_audit["benchmarks"]["oasdi_tob"]
+ hi_stats = calibration_audit["benchmarks"]["hi_tob"]
+ print(
+ f" [DEBUG {year}] OASDI TOB benchmark: ${oasdi_stats['achieved'] / 1e9:.1f}B "
+ f"(gap: ${abs(oasdi_stats['error']) / 1e6:.1f}M, "
+ f"{oasdi_stats['pct_error']:.3f}%)"
+ )
+ print(
+ f" [DEBUG {year}] HI TOB benchmark: ${hi_stats['achieved'] / 1e9:.1f}B "
+ f"(gap: ${abs(hi_stats['error']) / 1e6:.1f}M, "
+ f"{hi_stats['pct_error']:.3f}%)"
+ )
- weights_matrix[:, year_idx] = w_new
- baseline_weights_matrix[:, year_idx] = baseline_weights
total_income_tax[year_idx] = np.sum(income_tax_values * w_new)
total_income_tax_baseline[year_idx] = income_tax_baseline_total
total_population[year_idx] = np.sum(y_target)
if SAVE_H5:
- h5_path = create_household_year_h5(year, w_new, BASE_DATASET_PATH, OUTPUT_DIR)
+ h5_path = create_household_year_h5(
+ year,
+ w_new,
+ current_dataset,
+ OUTPUT_DIR,
+ reform=ACTIVE_LONG_RUN_TAX_REFORM,
+ )
+ metadata_path = write_year_metadata(
+ h5_path,
+ year=year,
+ base_dataset_path=BASE_DATASET_PATH,
+ profile=PROFILE.to_dict(),
+ calibration_audit=calibration_audit,
+ target_source=TARGET_SOURCE_METADATA,
+ tax_assumption=LONG_RUN_TAX_ASSUMPTION_METADATA,
+ support_augmentation=current_support_augmentation_metadata,
+ )
+ update_dataset_manifest(
+ OUTPUT_DIR,
+ year=year,
+ h5_path=h5_path,
+ metadata_path=metadata_path,
+ base_dataset_path=BASE_DATASET_PATH,
+ profile=PROFILE.to_dict(),
+ calibration_audit=calibration_audit,
+ target_source=TARGET_SOURCE_METADATA,
+ tax_assumption=LONG_RUN_TAX_ASSUMPTION_METADATA,
+ support_augmentation=MANIFEST_SUPPORT_AUGMENTATION_METADATA,
+ )
if year in display_years:
- print(f" Saved {year}.h5")
+ print(f" Saved {year}.h5 and metadata")
del sim
gc.collect()
diff --git a/policyengine_us_data/datasets/cps/long_term/ssa_data.py b/policyengine_us_data/datasets/cps/long_term/ssa_data.py
index 6b76ae219..b54ba0550 100644
--- a/policyengine_us_data/datasets/cps/long_term/ssa_data.py
+++ b/policyengine_us_data/datasets/cps/long_term/ssa_data.py
@@ -1,8 +1,77 @@
+import json
+import os
+from functools import lru_cache
+
import numpy as np
import pandas as pd
from policyengine_us_data.storage import STORAGE_FOLDER
# Directory holding one projection CSV per long-term target source, plus a manifest.
LONG_TERM_TARGET_SOURCES_DIR = STORAGE_FOLDER / "long_term_target_sources"
# JSON manifest mapping source names to their metadata (includes a "file" entry).
LONG_TERM_TARGET_SOURCES_MANIFEST = LONG_TERM_TARGET_SOURCES_DIR / "sources.json"
# Source used when neither the environment variable nor a caller overrides it.
DEFAULT_LONG_TERM_TARGET_SOURCE = "trustees_2025_current_law"
# Module-level current source, read from the environment at import time.
# NOTE(review): the env value is not validated here — an unknown name only
# fails later when resolve_long_term_target_source_name() runs.
_CURRENT_LONG_TERM_TARGET_SOURCE = os.environ.get(
    "POLICYENGINE_US_DATA_LONG_TERM_TARGET_SOURCE",
    DEFAULT_LONG_TERM_TARGET_SOURCE,
)
+
+
@lru_cache(maxsize=1)
def _load_long_term_target_sources_manifest() -> dict:
    """Read and cache the long-term target sources manifest (sources.json)."""
    raw_text = LONG_TERM_TARGET_SOURCES_MANIFEST.read_text(encoding="utf-8")
    return json.loads(raw_text)
+
+
def available_long_term_target_sources() -> list[str]:
    """Return the names of every configured long-term target source, sorted."""
    return sorted(_load_long_term_target_sources_manifest()["sources"])
+
+
def get_long_term_target_source() -> str:
    """Return the name of the currently active long-term target source."""
    return _CURRENT_LONG_TERM_TARGET_SOURCE
+
+
def set_long_term_target_source(source_name: str) -> None:
    """Set the active long-term target source after validating *source_name*.

    Raises:
        ValueError: If *source_name* is not a configured source.
    """
    global _CURRENT_LONG_TERM_TARGET_SOURCE
    resolved = resolve_long_term_target_source_name(source_name)
    _CURRENT_LONG_TERM_TARGET_SOURCE = resolved
+
+
def resolve_long_term_target_source_name(source_name: str | None = None) -> str:
    """Validate *source_name* (or the current module default) and return it.

    Raises:
        ValueError: If the candidate is not listed in the manifest.
    """
    sources = _load_long_term_target_sources_manifest()["sources"]
    candidate = source_name if source_name else _CURRENT_LONG_TERM_TARGET_SOURCE
    if candidate in sources:
        return candidate
    valid = ", ".join(sorted(sources))
    raise ValueError(
        f"Unknown long-term target source {candidate!r}. Valid sources: {valid}"
    )
+
+
def describe_long_term_target_source(source_name: str | None = None) -> dict:
    """Return a copy of the manifest entry for a source, with its name added.

    Copying prevents callers from mutating the cached manifest in place.
    """
    resolved = resolve_long_term_target_source_name(source_name)
    entry = _load_long_term_target_sources_manifest()["sources"][resolved]
    description = {**entry, "name": resolved}
    return description
+
+
@lru_cache(maxsize=None)
def _load_long_term_target_frame(source_name: str) -> pd.DataFrame:
    """Load (and cache) the projection CSV backing *source_name*.

    NOTE(review): the cached DataFrame is shared across calls — callers
    must treat it as read-only.
    """
    file_name = describe_long_term_target_source(source_name)["file"]
    return pd.read_csv(LONG_TERM_TARGET_SOURCES_DIR / file_name)
+
+
def _load_long_term_target_row(year: int, source_name: str | None = None) -> pd.Series:
    """Return the projection row for *year* from the given (or current) source.

    Raises:
        ValueError: If the source has no row for *year*.
    """
    resolved = resolve_long_term_target_source_name(source_name)
    frame = _load_long_term_target_frame(resolved)
    matches = frame.loc[frame["year"] == year]
    if len(matches) == 0:
        raise ValueError(
            f"Year {year} not found in long-term target source {resolved!r}"
        )
    return matches.iloc[0]
+
+
def load_ssa_age_projections(start_year=2025, end_year=2100):
"""
Load SSA population projections from package storage.
@@ -37,7 +106,7 @@ def load_ssa_age_projections(start_year=2025, end_year=2100):
return target_matrix
-def load_ssa_benefit_projections(year):
+def load_ssa_benefit_projections(year, source_name: str | None = None):
"""
Load SSA Trustee Report projections for Social Security benefits.
@@ -47,15 +116,12 @@ def load_ssa_benefit_projections(year):
Returns:
Total OASDI benefits in nominal dollars
"""
- csv_path = STORAGE_FOLDER / "social_security_aux.csv"
- df = pd.read_csv(csv_path)
-
- row = df[df["year"] == year]
- nominal_billions = row["oasdi_cost_in_billion_nominal_usd"].values[0]
+ row = _load_long_term_target_row(year, source_name)
+ nominal_billions = row["oasdi_cost_in_billion_nominal_usd"]
return nominal_billions * 1e9
-def load_taxable_payroll_projections(year):
+def load_taxable_payroll_projections(year, source_name: str | None = None):
"""
Load SSA Trustee Report projections for taxable payroll.
@@ -65,15 +131,12 @@ def load_taxable_payroll_projections(year):
Returns:
Total taxable payroll in nominal dollars
"""
- csv_path = STORAGE_FOLDER / "social_security_aux.csv"
- df = pd.read_csv(csv_path)
-
- row = df[df["year"] == year]
- nominal_billions = row["taxable_payroll_in_billion_nominal_usd"].values[0]
+ row = _load_long_term_target_row(year, source_name)
+ nominal_billions = row["taxable_payroll_in_billion_nominal_usd"]
return nominal_billions * 1e9
-def load_h6_income_rate_change(year):
+def load_h6_income_rate_change(year, source_name: str | None = None):
"""
Load H6 reform income rate change target for a given year.
@@ -83,15 +146,12 @@ def load_h6_income_rate_change(year):
Returns:
H6 income rate change as decimal (e.g., -0.0018 for -0.18%)
"""
- csv_path = STORAGE_FOLDER / "social_security_aux.csv"
- df = pd.read_csv(csv_path)
-
- row = df[df["year"] == year]
+ row = _load_long_term_target_row(year, source_name)
# CSV stores as percentage (e.g., -0.18), convert to decimal
- return row["h6_income_rate_change"].values[0] / 100
+ return row["h6_income_rate_change"] / 100
-def load_oasdi_tob_projections(year):
+def load_oasdi_tob_projections(year, source_name: str | None = None):
"""
Load OASDI TOB (Taxation of Benefits) revenue target for a given year.
@@ -101,15 +161,12 @@ def load_oasdi_tob_projections(year):
Returns:
Total OASDI TOB revenue in nominal dollars
"""
- csv_path = STORAGE_FOLDER / "social_security_aux.csv"
- df = pd.read_csv(csv_path)
-
- row = df[df["year"] == year]
- nominal_billions = row["oasdi_tob_billions_nominal_usd"].values[0]
+ row = _load_long_term_target_row(year, source_name)
+ nominal_billions = row["oasdi_tob_billions_nominal_usd"]
return nominal_billions * 1e9
-def load_hi_tob_projections(year):
+def load_hi_tob_projections(year, source_name: str | None = None):
"""
Load HI (Medicare) TOB revenue target for a given year.
@@ -119,9 +176,6 @@ def load_hi_tob_projections(year):
Returns:
Total HI TOB revenue in nominal dollars
"""
- csv_path = STORAGE_FOLDER / "social_security_aux.csv"
- df = pd.read_csv(csv_path)
-
- row = df[df["year"] == year]
- nominal_billions = row["hi_tob_billions_nominal_usd"].values[0]
+ row = _load_long_term_target_row(year, source_name)
+ nominal_billions = row["hi_tob_billions_nominal_usd"]
return nominal_billions * 1e9
diff --git a/policyengine_us_data/datasets/cps/long_term/summarize_calibration_runs.py b/policyengine_us_data/datasets/cps/long_term/summarize_calibration_runs.py
new file mode 100644
index 000000000..029d27985
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/summarize_calibration_runs.py
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+import sys
+from typing import Any
+
+from calibration_artifacts import normalize_metadata
+from profile_support_concentration import profile_support
+
+
# Support-concentration metric names read from a year's calibration audit
# (and, with --profile-support, recomputed from the H5 file when missing).
SUPPORT_FIELDS = (
    "positive_household_count",
    "positive_household_pct",
    "effective_sample_size",
    "top_10_weight_share_pct",
    "top_100_weight_share_pct",
    "weighted_nonworking_share_pct",
    "weighted_nonworking_share_85_plus_pct",
)
+
+
def parse_args() -> argparse.Namespace:
    """Build the CLI parser and parse ``sys.argv``.

    Returns a namespace with attributes ``left``, ``right``, ``years``,
    ``profile_support`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Summarize and compare calibration quality across projected dataset directories."
        )
    )
    cli.add_argument("left", type=Path, help="First projected dataset directory.")
    cli.add_argument(
        "right",
        type=Path,
        nargs="?",
        help="Optional second directory to compare.",
    )
    cli.add_argument(
        "--years",
        help="Optional comma-separated list of years to include. Defaults to all years found.",
    )
    cli.add_argument(
        "--profile-support",
        action="store_true",
        help="Compute support-concentration metrics from the H5 files when metadata is missing them.",
    )
    cli.add_argument(
        "--output",
        type=Path,
        help="Optional CSV output path. Prints to stdout when omitted.",
    )
    return cli.parse_args()
+
+
def parse_years(raw: str | None) -> list[int] | None:
    """Parse a comma-separated year list; return None when *raw* is empty/None."""
    if raw:
        tokens = (piece.strip() for piece in raw.split(","))
        return [int(token) for token in tokens if token]
    return None
+
+
def metadata_for(directory: Path, year: int) -> dict[str, Any] | None:
    """Load and normalize the metadata JSON for *year*, or None if absent."""
    path = directory / f"{year}.h5.metadata.json"
    if path.exists():
        raw = json.loads(path.read_text(encoding="utf-8"))
        return normalize_metadata(raw)
    return None
+
+
def support_metrics(
    directory: Path, year: int, metadata: dict[str, Any] | None, *, profile: bool
) -> dict[str, Any]:
    """Collect support-concentration metrics for a year.

    Values come from the calibration audit in *metadata*; when *profile* is
    True and any metric is missing, they are recomputed from the year's H5
    file (if it exists).
    """
    audit = (metadata or {}).get("calibration_audit", {})
    metrics = {field: audit.get(field) for field in SUPPORT_FIELDS}
    missing_any = any(value is None for value in metrics.values())
    if not (profile and missing_any):
        return metrics

    dataset_path = directory / f"{year}.h5"
    if not dataset_path.exists():
        return metrics
    profiled = profile_support(dataset_path, year, top_n=10)
    # profile_support reports every field in SUPPORT_FIELDS; copy them over.
    return {field: profiled[field] for field in SUPPORT_FIELDS}
+
+
def summarize_directory(
    directory: Path, years: list[int] | None, *, profile: bool
) -> dict[int, dict[str, Any]]:
    """Build a per-year summary row for every year with metadata in *directory*.

    When *years* is None, the years are discovered from the
    ``<year>.h5.metadata.json`` files present in the directory. Years whose
    metadata file is missing are silently skipped.
    """
    if years is None:
        discovered = [
            int(path.name.split(".")[0])
            for path in directory.glob("*.h5.metadata.json")
        ]
        years = sorted(discovered)

    summaries: dict[int, dict[str, Any]] = {}
    for year in years:
        metadata = metadata_for(directory, year)
        if metadata is None:
            continue
        audit = metadata["calibration_audit"]
        summary: dict[str, Any] = {
            "quality": audit.get("calibration_quality"),
            "method": audit.get("approximation_method") or audit.get("method_used"),
            "age_bucket_size": audit.get("age_bucket_size"),
            "max_constraint_pct_error": audit.get("max_constraint_pct_error"),
            "age_max_pct_error": audit.get("age_max_pct_error"),
            "negative_weight_pct": audit.get("negative_weight_pct"),
        }
        summary.update(support_metrics(directory, year, metadata, profile=profile))
        summaries[year] = summary
    return summaries
+
+
def build_rows(
    left: dict[int, dict[str, Any]],
    right: dict[int, dict[str, Any]] | None,
) -> list[dict[str, Any]]:
    """Merge per-year summaries into flat rows, prefixed ``left_``/``right_``.

    For years present on both sides, numeric ``delta_*`` columns are added
    (right minus left) for the comparable metrics.
    """
    delta_keys = (
        "max_constraint_pct_error",
        "age_max_pct_error",
        "effective_sample_size",
        "top_10_weight_share_pct",
        "top_100_weight_share_pct",
        "weighted_nonworking_share_pct",
        "weighted_nonworking_share_85_plus_pct",
    )
    all_years = sorted(set(left) | set(right or {}))
    merged: list[dict[str, Any]] = []
    for year in all_years:
        row: dict[str, Any] = {"year": year}
        for prefix, side in (("left", left), ("right", right or {})):
            for key, value in side.get(year, {}).items():
                row[f"{prefix}_{key}"] = value
        if right is not None and year in left and year in right:
            for key in delta_keys:
                left_value = left[year].get(key)
                right_value = right[year].get(key)
                if left_value is not None and right_value is not None:
                    row[f"delta_{key}"] = right_value - left_value
        merged.append(row)
    return merged
+
+
+def write_rows(rows: list[dict[str, Any]], output: Path | None) -> None:
+ if not rows:
+ raise SystemExit("No rows to write.")
+
+ fieldnames = sorted({key for row in rows for key in row})
+ if output is None:
+ writer = csv.DictWriter(
+ sys.stdout,
+ fieldnames=fieldnames,
+ )
+ writer.writeheader()
+ writer.writerows(rows)
+ return
+
+ output.parent.mkdir(parents=True, exist_ok=True)
+ with output.open("w", encoding="utf-8", newline="") as file:
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
+ writer.writeheader()
+ writer.writerows(rows)
+
+
def main() -> int:
    """CLI entry point: summarize one directory, or compare two."""
    args = parse_args()
    selected_years = parse_years(args.years)
    left_summary = summarize_directory(
        args.left, selected_years, profile=args.profile_support
    )
    right_summary = None
    if args.right is not None:
        right_summary = summarize_directory(
            args.right, selected_years, profile=args.profile_support
        )
    write_rows(build_rows(left_summary, right_summary), args.output)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/policyengine_us_data/datasets/cps/long_term/support_augmentation.py b/policyengine_us_data/datasets/cps/long_term/support_augmentation.py
new file mode 100644
index 000000000..d23bfe3ff
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/support_augmentation.py
@@ -0,0 +1,1237 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Literal
+
+import numpy as np
+import pandas as pd
+from policyengine_core.data.dataset import Dataset
+from policyengine_us import Microsimulation
+
+
# Person-level Social Security components summed into a household "ss_total".
SS_COMPONENTS = (
    "social_security_retirement",
    "social_security_disability",
    "social_security_survivors",
    "social_security_dependents",
)
# Person-level earnings components summed into a household "payroll_total".
PAYROLL_COMPONENTS = (
    "employment_income_before_lsr",
    "self_employment_income_before_lsr",
)
# Components copied when transplanting payroll between households; also
# includes the W-2 wage column used for the qualified-business deduction.
PAYROLL_TRANSFER_COMPONENTS = PAYROLL_COMPONENTS + ("w2_wages_from_qualified_business",)
# Map of entity name -> (entity id column, person-to-entity id column).
ENTITY_ID_COLUMNS = {
    "household": ("household_id", "person_household_id"),
    "family": ("family_id", "person_family_id"),
    "tax_unit": ("tax_unit_id", "person_tax_unit_id"),
    "spm_unit": ("spm_unit_id", "person_spm_unit_id"),
    "marital_unit": ("marital_unit_id", "person_marital_unit_id"),
}
# Base name of the person identifier column (period suffix added later).
PERSON_ID_COLUMN = "person_id"


# Constraint on a household total: unrestricted, strictly > 0, or <= 0.
ConstraintState = Literal["any", "positive", "nonpositive"]
+
+
@dataclass(frozen=True)
class AgeShiftCloneRule:
    """Clone households in a max-age band, shifting every person's age up."""

    name: str
    # Inclusive band on the household's maximum person age used to pick donors.
    min_max_age: int
    max_max_age: int
    # Years added to every cloned person's age (capped at 85 when applied).
    age_shift: int
    # Constraints on the donor household's SS / payroll totals.
    ss_state: ConstraintState = "any"
    payroll_state: ConstraintState = "any"
    # Multiplier applied to the clone's household and person weights.
    clone_weight_scale: float = 0.25
+
+
@dataclass(frozen=True)
class CompositePayrollRule:
    """Clone recipient households and add donor-household payroll to them."""

    name: str
    # Max-age band selecting recipient households (cloned, given payroll).
    recipient_min_max_age: int
    recipient_max_max_age: int
    # Max-age band selecting donor households (payroll source).
    donor_min_max_age: int
    donor_max_max_age: int
    recipient_ss_state: ConstraintState = "positive"
    recipient_payroll_state: ConstraintState = "nonpositive"
    donor_ss_state: ConstraintState = "nonpositive"
    donor_payroll_state: ConstraintState = "positive"
    # Fraction of the donor household's payroll added to the clone.
    payroll_transfer_scale: float = 1.0
    clone_weight_scale: float = 0.25
+
+
@dataclass(frozen=True)
class SinglePersonSyntheticGridRule:
    """Append synthetic single-person households on an age/SS/payroll grid."""

    name: str
    # Max-age band selecting single-person template households.
    template_min_max_age: int
    template_max_max_age: int
    # Explicit ages assigned to the synthetic persons.
    target_ages: tuple[int, ...]
    # Quantiles of template SS totals used as SS grid levels.
    ss_quantiles: tuple[float, ...]
    # Quantiles of donor payroll totals used as payroll grid levels.
    payroll_quantiles: tuple[float, ...]
    template_ss_state: ConstraintState = "positive"
    template_payroll_state: ConstraintState = "any"
    # Max-age band and constraints selecting payroll donor households.
    payroll_donor_min_max_age: int = 45
    payroll_donor_max_max_age: int = 64
    payroll_donor_ss_state: ConstraintState = "nonpositive"
    payroll_donor_payroll_state: ConstraintState = "positive"
    # Multipliers crossed with payroll quantiles to widen the payroll grid.
    payroll_scale_factors: tuple[float, ...] = (1.0,)
    clone_weight_scale: float = 0.1
+
+
@dataclass(frozen=True)
class MixedAgeAppendRule:
    """Clone recipient households and append one donor earner to each clone."""

    name: str
    # Max-age band selecting recipient households (cloned as-is).
    recipient_min_max_age: int
    recipient_max_max_age: int
    # Max-age band selecting donor households supplying the earner person.
    donor_min_max_age: int
    donor_max_max_age: int
    recipient_ss_state: ConstraintState = "positive"
    recipient_payroll_state: ConstraintState = "any"
    donor_ss_state: ConstraintState = "nonpositive"
    donor_payroll_state: ConstraintState = "positive"
    # Years added to the appended donor person's age (capped at 85).
    donor_age_shift: int = 0
    clone_weight_scale: float = 0.15
+
+
@dataclass(frozen=True)
class SupportAugmentationProfile:
    """A named bundle of augmentation rules applied together."""

    name: str
    # Human-readable explanation of what the profile is probing.
    description: str
    # Rules may be any of the supported rule types, in application order.
    rules: tuple[
        AgeShiftCloneRule
        | CompositePayrollRule
        | SinglePersonSyntheticGridRule
        | MixedAgeAppendRule,
        ...,
    ]
+
+
# Baseline cloning profile: shift existing older-household support one
# decade later in the age distribution.
LATE_CLONE_V1 = SupportAugmentationProfile(
    name="late-clone-v1",
    description=(
        "Age-shifted donor households to expand late-year support for older "
        "beneficiary, older beneficiary-plus-payroll, and payroll-only households."
    ),
    rules=(
        AgeShiftCloneRule(
            name="ss_only_65_74_to_75_84",
            min_max_age=65,
            max_max_age=74,
            age_shift=10,
            ss_state="positive",
            payroll_state="nonpositive",
            clone_weight_scale=0.35,
        ),
        AgeShiftCloneRule(
            name="ss_only_75_84_to_85_plus",
            min_max_age=75,
            max_max_age=84,
            age_shift=10,
            ss_state="positive",
            payroll_state="nonpositive",
            clone_weight_scale=0.5,
        ),
        AgeShiftCloneRule(
            name="ss_pay_65_74_to_75_84",
            min_max_age=65,
            max_max_age=74,
            age_shift=10,
            ss_state="positive",
            payroll_state="positive",
            clone_weight_scale=0.35,
        ),
        AgeShiftCloneRule(
            name="ss_pay_75_84_to_85_plus",
            min_max_age=75,
            max_max_age=84,
            age_shift=10,
            ss_state="positive",
            payroll_state="positive",
            clone_weight_scale=0.5,
        ),
        AgeShiftCloneRule(
            name="pay_only_55_64_to_65_74",
            min_max_age=55,
            max_max_age=64,
            age_shift=10,
            ss_state="nonpositive",
            payroll_state="positive",
            clone_weight_scale=0.2,
        ),
    ),
)

# Superset of v1 with larger age shifts (20-30 years) at smaller weights.
LATE_CLONE_V2 = SupportAugmentationProfile(
    name="late-clone-v2",
    description=(
        "More aggressive age-shifted donor households that test whether the "
        "late-year infeasibility is driven by missing older payroll-rich support."
    ),
    rules=(
        *LATE_CLONE_V1.rules,
        AgeShiftCloneRule(
            name="pay_only_45_64_to_75_84",
            min_max_age=45,
            max_max_age=64,
            age_shift=20,
            ss_state="nonpositive",
            payroll_state="positive",
            clone_weight_scale=0.15,
        ),
        AgeShiftCloneRule(
            name="pay_only_55_64_to_85_plus",
            min_max_age=55,
            max_max_age=64,
            age_shift=30,
            ss_state="nonpositive",
            payroll_state="positive",
            clone_weight_scale=0.1,
        ),
        AgeShiftCloneRule(
            name="ss_pay_65_74_to_85_plus",
            min_max_age=65,
            max_max_age=74,
            age_shift=20,
            ss_state="positive",
            payroll_state="positive",
            clone_weight_scale=0.2,
        ),
    ),
)

# Composite profile: keep older recipients' ages/SS, inject donor payroll.
LATE_COMPOSITE_V1 = SupportAugmentationProfile(
    name="late-composite-v1",
    description=(
        "Composite synthetic households that preserve older beneficiary age/SS "
        "structure while injecting payroll from younger payroll-rich donors."
    ),
    rules=(
        CompositePayrollRule(
            name="ss_only_75_84_plus_payroll_from_55_64",
            recipient_min_max_age=75,
            recipient_max_max_age=84,
            donor_min_max_age=55,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="nonpositive",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            payroll_transfer_scale=1.0,
            clone_weight_scale=0.2,
        ),
        CompositePayrollRule(
            name="ss_only_85_plus_plus_payroll_from_55_64",
            recipient_min_max_age=85,
            recipient_max_max_age=85,
            donor_min_max_age=55,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="nonpositive",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            payroll_transfer_scale=0.75,
            clone_weight_scale=0.15,
        ),
        CompositePayrollRule(
            name="ss_pay_75_84_boost_from_45_64",
            recipient_min_max_age=75,
            recipient_max_max_age=84,
            donor_min_max_age=45,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="positive",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            payroll_transfer_scale=0.5,
            clone_weight_scale=0.15,
        ),
    ),
)

# Diagnostic composite profile with transfer scales up to 3x.
LATE_COMPOSITE_V2 = SupportAugmentationProfile(
    name="late-composite-v2",
    description=(
        "Extreme composite synthetic households for diagnosing whether the late "
        "frontier is limited by missing older payroll intensity."
    ),
    rules=(
        CompositePayrollRule(
            name="ss_only_75_84_plus_heavy_payroll_from_45_64",
            recipient_min_max_age=75,
            recipient_max_max_age=84,
            donor_min_max_age=45,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="nonpositive",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            payroll_transfer_scale=3.0,
            clone_weight_scale=0.15,
        ),
        CompositePayrollRule(
            name="ss_only_85_plus_heavy_payroll_from_45_64",
            recipient_min_max_age=85,
            recipient_max_max_age=85,
            donor_min_max_age=45,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="nonpositive",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            payroll_transfer_scale=2.0,
            clone_weight_scale=0.1,
        ),
        CompositePayrollRule(
            name="ss_pay_75_84_heavy_boost_from_45_64",
            recipient_min_max_age=75,
            recipient_max_max_age=84,
            donor_min_max_age=45,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="positive",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            payroll_transfer_scale=1.5,
            clone_weight_scale=0.1,
        ),
    ),
)

# Appended single-person grid profile; base CPS support is left untouched.
LATE_SYNTHETIC_GRID_V1 = SupportAugmentationProfile(
    name="late-synthetic-grid-v1",
    description=(
        "Appended synthetic single-person older households on an explicit "
        "age/SS/payroll grid, preserving the base CPS support untouched."
    ),
    rules=(
        SinglePersonSyntheticGridRule(
            name="single_75_84_grid",
            template_min_max_age=75,
            template_max_max_age=84,
            target_ages=(77, 82),
            ss_quantiles=(0.25, 0.5, 0.75),
            payroll_quantiles=(0.25, 0.5, 0.75),
            template_ss_state="positive",
            template_payroll_state="any",
            payroll_donor_min_max_age=45,
            payroll_donor_max_max_age=64,
            clone_weight_scale=0.1,
        ),
        SinglePersonSyntheticGridRule(
            name="single_85_plus_grid",
            template_min_max_age=85,
            template_max_max_age=85,
            target_ages=(85,),
            ss_quantiles=(0.25, 0.5, 0.75),
            payroll_quantiles=(0.25, 0.5, 0.75),
            template_ss_state="positive",
            template_payroll_state="any",
            payroll_donor_min_max_age=45,
            payroll_donor_max_max_age=64,
            clone_weight_scale=0.08,
        ),
    ),
)

# Grid profile with payroll scale factors up to 4x for frontier probing.
LATE_SYNTHETIC_GRID_V2 = SupportAugmentationProfile(
    name="late-synthetic-grid-v2",
    description=(
        "Appended synthetic older-worker grid with more extreme payroll levels "
        "to test whether older payroll intensity alone can move the late frontier."
    ),
    rules=(
        SinglePersonSyntheticGridRule(
            name="single_75_84_extreme_grid",
            template_min_max_age=75,
            template_max_max_age=84,
            target_ages=(77, 82),
            ss_quantiles=(0.25, 0.5, 0.75),
            payroll_quantiles=(0.25, 0.5, 0.75),
            payroll_scale_factors=(1.0, 2.0, 4.0),
            template_ss_state="positive",
            template_payroll_state="any",
            payroll_donor_min_max_age=45,
            payroll_donor_max_max_age=64,
            clone_weight_scale=0.08,
        ),
        SinglePersonSyntheticGridRule(
            name="single_85_plus_extreme_grid",
            template_min_max_age=85,
            template_max_max_age=85,
            target_ages=(85,),
            ss_quantiles=(0.25, 0.5, 0.75),
            payroll_quantiles=(0.25, 0.5, 0.75),
            payroll_scale_factors=(1.0, 2.0, 4.0),
            template_ss_state="positive",
            template_payroll_state="any",
            payroll_donor_min_max_age=45,
            payroll_donor_max_max_age=64,
            clone_weight_scale=0.06,
        ),
    ),
)

# Mixed-age profile: older households gain a prime-age earner as a new member.
LATE_MIXED_HOUSEHOLD_V1 = SupportAugmentationProfile(
    name="late-mixed-household-v1",
    description=(
        "Append synthetic mixed-age households by combining older beneficiary "
        "households with a younger payroll-rich donor person as a separate "
        "subunit in the same household."
    ),
    rules=(
        MixedAgeAppendRule(
            name="older_75_84_plus_prime_age_earner",
            recipient_min_max_age=75,
            recipient_max_max_age=84,
            donor_min_max_age=35,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="any",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            clone_weight_scale=0.12,
        ),
        MixedAgeAppendRule(
            name="older_85_plus_prime_age_earner",
            recipient_min_max_age=85,
            recipient_max_max_age=85,
            donor_min_max_age=35,
            donor_max_max_age=64,
            recipient_ss_state="positive",
            recipient_payroll_state="any",
            donor_ss_state="nonpositive",
            donor_payroll_state="positive",
            clone_weight_scale=0.08,
        ),
    ),
)
+
+
# Registry of selectable augmentation profiles, keyed by profile name.
NAMED_SUPPORT_AUGMENTATION_PROFILES = {
    profile.name: profile
    for profile in (
        LATE_CLONE_V1,
        LATE_CLONE_V2,
        LATE_COMPOSITE_V1,
        LATE_COMPOSITE_V2,
        LATE_SYNTHETIC_GRID_V1,
        LATE_SYNTHETIC_GRID_V2,
        LATE_MIXED_HOUSEHOLD_V1,
    )
}
+
+
+def _period_column(name: str, base_year: int) -> str:
+ return f"{name}__{base_year}"
+
+
def get_support_augmentation_profile(name: str) -> SupportAugmentationProfile:
    """Look up a profile by name, raising ValueError with the valid names."""
    try:
        profile = NAMED_SUPPORT_AUGMENTATION_PROFILES[name]
    except KeyError as exc:
        known = ", ".join(sorted(NAMED_SUPPORT_AUGMENTATION_PROFILES))
        raise ValueError(
            f"Unknown support augmentation profile '{name}'. Valid profiles: {known}"
        ) from exc
    return profile
+
+
def household_support_summary(
    input_df: pd.DataFrame,
    *,
    base_year: int,
) -> pd.DataFrame:
    """Aggregate person rows into one household-level support summary row.

    Args:
        input_df: Person-level dataframe with period-suffixed columns.
        base_year: Year suffix used to resolve column names.

    Returns:
        A frame indexed by household id with ``max_age``, ``baseline_weight``,
        per-component SS/payroll sums, the derived ``ss_total`` /
        ``payroll_total`` columns, and the person count ``household_size``.

    Raises:
        ValueError: if any required person-level column is missing
            (including the person id column, which the size computation
            consumes — previously its absence surfaced as a raw KeyError).
    """
    household_id_col = _period_column("household_id", base_year)
    household_weight_col = _period_column("household_weight", base_year)
    age_col = _period_column("age", base_year)
    person_id_col = _period_column(PERSON_ID_COLUMN, base_year)

    required = [household_id_col, household_weight_col, age_col, person_id_col]
    required.extend(_period_column(component, base_year) for component in SS_COMPONENTS)
    required.extend(
        _period_column(component, base_year) for component in PAYROLL_COMPONENTS
    )
    missing = [column for column in required if column not in input_df.columns]
    if missing:
        raise ValueError(
            "Input dataframe is missing required support columns: "
            + ", ".join(sorted(missing))
        )

    # "max" selects a representative age/weight per household (presumably the
    # weight is constant within a household — confirm against upstream data).
    aggregations: dict[str, str] = {
        age_col: "max",
        household_weight_col: "max",
    }
    aggregations.update(
        {_period_column(component, base_year): "sum" for component in SS_COMPONENTS}
    )
    aggregations.update(
        {
            _period_column(component, base_year): "sum"
            for component in PAYROLL_COMPONENTS
        }
    )

    # Build one groupby and reuse it for both the aggregation and the count.
    grouped = input_df.groupby(household_id_col, sort=False)
    summary = grouped.agg(aggregations).rename(
        columns={
            age_col: "max_age",
            household_weight_col: "baseline_weight",
        }
    )
    summary["ss_total"] = summary[
        [_period_column(component, base_year) for component in SS_COMPONENTS]
    ].sum(axis=1)
    summary["payroll_total"] = summary[
        [_period_column(component, base_year) for component in PAYROLL_COMPONENTS]
    ].sum(axis=1)
    summary["household_size"] = grouped[person_id_col].count()
    return summary
+
+
+def _match_state(values: pd.Series, state: ConstraintState) -> pd.Series:
+ if state == "any":
+ return pd.Series(True, index=values.index)
+ if state == "positive":
+ return values > 0
+ if state == "nonpositive":
+ return values <= 0
+ raise ValueError(f"Unsupported state '{state}'")
+
+
+def _age_range_mask(
+ ages: pd.Series,
+ *,
+ min_max_age: int,
+ max_max_age: int,
+) -> pd.Series:
+ if min_max_age == 85 and max_max_age == 85:
+ return ages >= 85
+ return ages.between(min_max_age, max_max_age)
+
+
def select_donor_households(
    summary: pd.DataFrame,
    rule: AgeShiftCloneRule,
) -> pd.Index:
    """Household ids matching *rule*'s age band and SS/payroll constraints.

    Delegates to select_households_for_composite_rule, which applies the
    identical age/weight/SS/payroll masks, so the two selection paths
    cannot drift apart. Only positively weighted households qualify.
    """
    return select_households_for_composite_rule(
        summary,
        min_max_age=rule.min_max_age,
        max_max_age=rule.max_max_age,
        ss_state=rule.ss_state,
        payroll_state=rule.payroll_state,
    )
+
+
def select_households_for_composite_rule(
    summary: pd.DataFrame,
    *,
    min_max_age: int,
    max_max_age: int,
    ss_state: ConstraintState,
    payroll_state: ConstraintState,
) -> pd.Index:
    """Household ids with positive weight in the age band matching both states."""
    eligible = (
        _age_range_mask(
            summary["max_age"],
            min_max_age=min_max_age,
            max_max_age=max_max_age,
        )
        & (summary["baseline_weight"] > 0)
        & _match_state(summary["ss_total"], ss_state)
        & _match_state(summary["payroll_total"], payroll_state)
    )
    return summary.index[eligible]
+
+
+def _next_entity_id(values: pd.Series) -> int:
+ non_null = values.dropna()
+ if non_null.empty:
+ return 1
+ return int(non_null.max()) + 1
+
+
+def _cast_mapped_ids(series: pd.Series, mapped: pd.Series) -> pd.Series:
+ dtype = series.dtype
+ if pd.api.types.is_integer_dtype(dtype):
+ return mapped.astype(dtype)
+ if pd.api.types.is_float_dtype(dtype):
+ return mapped.astype(dtype)
+ return mapped
+
+
def clone_households_with_age_shift(
    input_df: pd.DataFrame,
    *,
    base_year: int,
    household_ids: pd.Index,
    age_shift: int,
    clone_weight_scale: float,
    id_counters: dict[str, int] | None = None,
) -> tuple[pd.DataFrame, dict[str, int]]:
    """Clone the selected households with fresh ids, shifted ages, scaled weights.

    Args:
        input_df: Person-level dataframe with period-suffixed columns.
        base_year: Year suffix used to resolve column names.
        household_ids: Household ids to clone (keyed on original ids).
        age_shift: Years added to each cloned person's age (capped at 85).
        clone_weight_scale: Multiplier for cloned household/person weights.
        id_counters: Optional next-free-id counters per entity; when omitted
            they are derived from the maximum ids present in ``input_df``.

    Returns:
        The cloned person rows and the updated id counters.
    """
    if household_ids.empty:
        # Nothing to clone: empty frame plus an unchanged copy of the counters.
        return input_df.iloc[0:0].copy(), (
            id_counters.copy() if id_counters is not None else {}
        )

    household_id_col = _period_column("household_id", base_year)
    person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
    age_col = _period_column("age", base_year)
    household_weight_col = _period_column("household_weight", base_year)
    person_weight_col = _period_column("person_weight", base_year)

    donors = input_df[input_df[household_id_col].isin(household_ids)].copy()

    # Start each entity's counter past the largest id already in use.
    next_ids = (
        id_counters.copy()
        if id_counters is not None
        else {
            entity_name: _next_entity_id(
                input_df[_period_column(columns[0], base_year)]
            )
            for entity_name, columns in ENTITY_ID_COLUMNS.items()
        }
    )
    if "person" not in next_ids:
        next_ids["person"] = _next_entity_id(input_df[person_id_col])

    # Assign every cloned household a brand-new id (applied to columns later).
    household_map = {
        original_id: next_ids["household"] + offset
        for offset, original_id in enumerate(
            pd.unique(donors[household_id_col].dropna())
        )
    }
    next_ids["household"] += len(household_map)

    # Remap every non-household entity (family, tax unit, SPM unit, marital
    # unit) to fresh ids, rewriting both the entity id column and the
    # person-to-entity id column.
    for entity_name, columns in ENTITY_ID_COLUMNS.items():
        column = _period_column(columns[0], base_year)
        if entity_name == "household" or column not in donors.columns:
            continue
        unique_ids = pd.unique(donors[column].dropna())
        mapping = {
            original_id: next_ids[entity_name] + offset
            for offset, original_id in enumerate(unique_ids)
        }
        next_ids[entity_name] += len(mapping)
        for raw_column in columns:
            mapped_column = _period_column(raw_column, base_year)
            if mapped_column not in donors.columns:
                continue
            mapped = donors[mapped_column].map(mapping)
            donors[mapped_column] = _cast_mapped_ids(donors[mapped_column], mapped)

    # Fresh person ids, preserving the original column dtype.
    person_map = {
        original_id: next_ids["person"] + offset
        for offset, original_id in enumerate(pd.unique(donors[person_id_col].dropna()))
    }
    next_ids["person"] += len(person_map)
    donors[person_id_col] = _cast_mapped_ids(
        donors[person_id_col], donors[person_id_col].map(person_map)
    )

    # Apply the new household ids to both household id columns.
    for raw_column in ENTITY_ID_COLUMNS["household"]:
        mapped_column = _period_column(raw_column, base_year)
        donors[mapped_column] = _cast_mapped_ids(
            donors[mapped_column], donors[mapped_column].map(household_map)
        )

    # Shift ages, capping at 85 (the open-ended top age band).
    donors[age_col] = np.minimum(donors[age_col].astype(float) + age_shift, 85)

    if household_weight_col in donors.columns:
        donors[household_weight_col] = (
            donors[household_weight_col].astype(float) * clone_weight_scale
        )
    if person_weight_col in donors.columns:
        donors[person_weight_col] = (
            donors[person_weight_col].astype(float) * clone_weight_scale
        )

    return donors, next_ids
+
+
def _household_component_totals(
    input_df: pd.DataFrame,
    *,
    base_year: int,
    components: tuple[str, ...],
) -> pd.DataFrame:
    """Sum the listed components per household, skipping absent columns."""
    household_id_col = _period_column("household_id", base_year)
    present_columns = []
    for component in components:
        column = _period_column(component, base_year)
        if column in input_df.columns:
            present_columns.append(column)
    if not present_columns:
        # Nothing to aggregate: return an empty, integer-indexed frame.
        return pd.DataFrame(index=pd.Index([], dtype=int))
    return input_df.groupby(household_id_col, sort=False)[present_columns].sum()
+
+
+def _quantile_pair_households(
+ recipient_ids: pd.Index,
+ donor_ids: pd.Index,
+ summary: pd.DataFrame,
+) -> list[tuple[int, int]]:
+ if recipient_ids.empty or donor_ids.empty:
+ return []
+ recipient_order = (
+ summary.loc[recipient_ids]
+ .sort_values(["ss_total", "baseline_weight", "max_age"])
+ .index.to_list()
+ )
+ donor_order = (
+ summary.loc[donor_ids]
+ .sort_values(["payroll_total", "baseline_weight", "max_age"])
+ .index.to_list()
+ )
+ if len(donor_order) == 1:
+ donor_positions = np.zeros(len(recipient_order), dtype=int)
+ else:
+ donor_positions = (
+ np.linspace(
+ 0,
+ len(donor_order) - 1,
+ num=len(recipient_order),
+ )
+ .round()
+ .astype(int)
+ )
+ return [
+ (int(recipient_household_id), int(donor_order[position]))
+ for recipient_household_id, position in zip(recipient_order, donor_positions)
+ ]
+
+
def _select_payroll_target_row(
    household_rows: pd.DataFrame,
    *,
    base_year: int,
) -> Any:
    """Pick the row index that should receive transplanted payroll.

    Prefers the adult with the highest existing payroll; falls back to the
    oldest adult, or the oldest person when the household has no adults.

    Returns:
        The dataframe index label of the chosen row.
    """
    age_col = _period_column("age", base_year)
    employment_col = _period_column("employment_income_before_lsr", base_year)
    self_employment_col = _period_column("self_employment_income_before_lsr", base_year)
    adults = household_rows[household_rows[age_col] >= 18]
    if adults.empty:
        adults = household_rows

    def _component(column: str) -> pd.Series:
        # Missing payroll columns count as zero. (DataFrame.get with a
        # scalar default would return plain 0 here, and 0.astype(...) raises
        # AttributeError.)
        if column in adults.columns:
            return adults[column].astype(float)
        return pd.Series(0.0, index=adults.index)

    existing_payroll = _component(employment_col) + _component(self_employment_col)
    if existing_payroll.gt(0).any():
        return existing_payroll.idxmax()
    return adults[age_col].astype(float).idxmax()
+
+
def _clone_single_donor_person_row(
    donor_row: pd.Series,
    *,
    base_year: int,
    shared_household_id: int,
    household_weight: float,
    person_weight: float,
    donor_age_shift: int,
    id_counters: dict[str, int],
) -> tuple[pd.Series, dict[str, int]]:
    """Copy one donor person into an existing household as its own subunits.

    The clone keeps the donor row's other columns but gets a new person id,
    joins ``shared_household_id``, and receives fresh ids for every
    non-household entity, so it forms its own family/tax/SPM/marital unit.
    ``id_counters`` is mutated in place and also returned.
    """
    cloned = donor_row.copy()
    person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
    household_weight_col = _period_column("household_weight", base_year)
    person_weight_col = _period_column("person_weight", base_year)
    age_col = _period_column("age", base_year)

    cloned[person_id_col] = id_counters["person"]
    id_counters["person"] += 1

    # Attach the clone to the recipient household.
    for raw_column in ENTITY_ID_COLUMNS["household"]:
        column = _period_column(raw_column, base_year)
        cloned[column] = shared_household_id

    # Every other entity gets a brand-new id so the clone is its own subunit.
    for entity_name, columns in ENTITY_ID_COLUMNS.items():
        if entity_name == "household":
            continue
        entity_id = id_counters[entity_name]
        id_counters[entity_name] += 1
        for raw_column in columns:
            column = _period_column(raw_column, base_year)
            if column in cloned.index:
                cloned[column] = entity_id

    # Optionally age the donor person, capped at 85.
    cloned[age_col] = min(float(cloned[age_col]) + donor_age_shift, 85)
    # Weights are overridden by the caller-supplied (recipient) weights.
    if household_weight_col in cloned.index:
        cloned[household_weight_col] = household_weight
    if person_weight_col in cloned.index:
        cloned[person_weight_col] = person_weight
    return cloned, id_counters
+
+
def synthesize_composite_households(
    input_df: pd.DataFrame,
    *,
    base_year: int,
    summary: pd.DataFrame,
    rule: CompositePayrollRule,
    id_counters: dict[str, int] | None = None,
) -> tuple[pd.DataFrame, dict[str, int], dict[str, Any]]:
    """Clone recipient households and graft donor payroll onto one member.

    Recipients and donors are selected by the rule's age/SS/payroll
    constraints, rank-paired by _quantile_pair_households, and the donor
    household's payroll totals (scaled by ``payroll_transfer_scale``) are
    added to one target person in each cloned recipient household.

    Returns:
        The synthetic person rows, updated id counters, and a report dict.
    """
    recipient_ids = select_households_for_composite_rule(
        summary,
        min_max_age=rule.recipient_min_max_age,
        max_max_age=rule.recipient_max_max_age,
        ss_state=rule.recipient_ss_state,
        payroll_state=rule.recipient_payroll_state,
    )
    donor_ids = select_households_for_composite_rule(
        summary,
        min_max_age=rule.donor_min_max_age,
        max_max_age=rule.donor_max_max_age,
        ss_state=rule.donor_ss_state,
        payroll_state=rule.donor_payroll_state,
    )
    recipient_pairs = _quantile_pair_households(recipient_ids, donor_ids, summary)
    if not recipient_pairs:
        # No viable pairing: empty frame plus a zeroed report.
        return (
            input_df.iloc[0:0].copy(),
            id_counters.copy() if id_counters is not None else {},
            {
                "rule": rule.name,
                "recipient_household_count": 0,
                "donor_household_count": int(len(donor_ids)),
                "composite_household_count": 0,
                "composite_person_count": 0,
                "payroll_transfer_scale": rule.payroll_transfer_scale,
            },
        )

    # Clone the recipients unchanged (age_shift=0) with scaled weights.
    clone_df, next_ids = clone_households_with_age_shift(
        input_df,
        base_year=base_year,
        household_ids=pd.Index([recipient for recipient, _ in recipient_pairs]),
        age_shift=0,
        clone_weight_scale=rule.clone_weight_scale,
        id_counters=id_counters,
    )
    household_id_col = _period_column("household_id", base_year)
    original_household_col = _period_column("person_household_id", base_year)  # NOTE(review): unused
    payroll_totals = _household_component_totals(
        input_df,
        base_year=base_year,
        components=PAYROLL_TRANSFER_COMPONENTS,
    )

    # Map each original recipient household id to its cloned counterpart.
    # NOTE(review): this zips two pd.unique() orderings and assumes they
    # correspond positionally — looks right because both derive from the
    # same underlying row order, but confirm.
    original_recipients = pd.unique(
        input_df[
            input_df[household_id_col].isin(
                [recipient for recipient, _ in recipient_pairs]
            )
        ][household_id_col]
    )
    cloned_household_ids = pd.unique(clone_df[household_id_col])
    cloned_mapping = {
        int(original): int(cloned)
        for original, cloned in zip(original_recipients, cloned_household_ids)
    }

    employment_col = _period_column("employment_income_before_lsr", base_year)
    self_employment_col = _period_column("self_employment_income_before_lsr", base_year)
    qbi_col = _period_column("w2_wages_from_qualified_business", base_year)

    # Add the donor household's (scaled) payroll to one person per clone.
    for recipient_household_id, donor_household_id in recipient_pairs:
        cloned_household_id = cloned_mapping[int(recipient_household_id)]
        mask = clone_df[household_id_col] == cloned_household_id
        target_row = _select_payroll_target_row(
            clone_df.loc[mask],
            base_year=base_year,
        )
        donor_row = payroll_totals.loc[int(donor_household_id)]
        clone_df.loc[target_row, employment_col] = (
            clone_df.loc[target_row, employment_col]
            + float(donor_row.get(employment_col, 0.0)) * rule.payroll_transfer_scale
        )
        clone_df.loc[target_row, self_employment_col] = (
            clone_df.loc[target_row, self_employment_col]
            + float(donor_row.get(self_employment_col, 0.0))
            * rule.payroll_transfer_scale
        )
        # W-2 wages are transferred only when both sides carry the column.
        if qbi_col in clone_df.columns and qbi_col in donor_row.index:
            clone_df.loc[target_row, qbi_col] = (
                clone_df.loc[target_row, qbi_col]
                + float(donor_row.get(qbi_col, 0.0)) * rule.payroll_transfer_scale
            )

    return (
        clone_df,
        next_ids,
        {
            "rule": rule.name,
            "recipient_household_count": int(len(recipient_ids)),
            "donor_household_count": int(len(donor_ids)),
            "composite_household_count": int(clone_df[household_id_col].nunique()),
            "composite_person_count": int(len(clone_df)),
            "payroll_transfer_scale": rule.payroll_transfer_scale,
        },
    )
+
+
def synthesize_single_person_grid_households(
    input_df: pd.DataFrame,
    *,
    base_year: int,
    summary: pd.DataFrame,
    rule: SinglePersonSyntheticGridRule,
    id_counters: dict[str, int] | None = None,
) -> tuple[pd.DataFrame, dict[str, int], dict[str, Any]]:
    """Append synthetic single-person households on an age/SS/payroll grid.

    Single-person template households provide the non-income columns; SS
    levels come from template quantiles and payroll levels from donor
    quantiles (optionally scaled). One synthetic household is produced per
    (target age, SS target, payroll target) combination.

    Returns:
        The synthetic person rows, updated id counters, and a report dict.
    """
    # Templates and donors are both restricted to single-person households.
    template_ids = select_households_for_composite_rule(
        summary[summary["household_size"] == 1],
        min_max_age=rule.template_min_max_age,
        max_max_age=rule.template_max_max_age,
        ss_state=rule.template_ss_state,
        payroll_state=rule.template_payroll_state,
    )
    donor_ids = select_households_for_composite_rule(
        summary[summary["household_size"] == 1],
        min_max_age=rule.payroll_donor_min_max_age,
        max_max_age=rule.payroll_donor_max_max_age,
        ss_state=rule.payroll_donor_ss_state,
        payroll_state=rule.payroll_donor_payroll_state,
    )
    if template_ids.empty or donor_ids.empty:
        # Cannot synthesize without both templates and donors.
        return (
            input_df.iloc[0:0].copy(),
            id_counters.copy() if id_counters is not None else {},
            {
                "rule": rule.name,
                "template_household_count": int(len(template_ids)),
                "payroll_donor_household_count": int(len(donor_ids)),
                "synthetic_household_count": 0,
                "synthetic_person_count": 0,
            },
        )

    household_id_col = _period_column("household_id", base_year)
    age_col = _period_column("age", base_year)
    person_id_col = _period_column(PERSON_ID_COLUMN, base_year)
    household_weight_col = _period_column("household_weight", base_year)
    person_weight_col = _period_column("person_weight", base_year)
    employment_col = _period_column("employment_income_before_lsr", base_year)
    self_employment_col = _period_column("self_employment_income_before_lsr", base_year)
    qbi_col = _period_column("w2_wages_from_qualified_business", base_year)

    template_rows = (
        input_df[input_df[household_id_col].isin(template_ids)]
        .sort_values([age_col, household_id_col])
        .copy()
    )
    # Grid levels: SS from template quantiles; payroll from donor quantiles
    # crossed with the rule's scale factors.
    ss_values = summary.loc[template_ids, "ss_total"].astype(float)
    payroll_values = summary.loc[donor_ids, "payroll_total"].astype(float)
    ss_targets = [float(ss_values.quantile(q)) for q in rule.ss_quantiles]
    payroll_targets = [
        float(payroll_values.quantile(q) * scale_factor)
        for q in rule.payroll_quantiles
        for scale_factor in rule.payroll_scale_factors
    ]

    # Start each entity's id counter past the largest id already in use.
    next_ids = (
        id_counters.copy()
        if id_counters is not None
        else {
            entity_name: _next_entity_id(
                input_df[_period_column(columns[0], base_year)]
            )
            for entity_name, columns in ENTITY_ID_COLUMNS.items()
        }
    )
    if "person" not in next_ids:
        next_ids["person"] = _next_entity_id(input_df[person_id_col])

    synthetic_rows = []
    template_records = template_rows.to_dict("records")
    template_index = 0  # cycles through templates round-robin
    for target_age in rule.target_ages:
        for ss_target in ss_targets:
            for payroll_target in payroll_targets:
                base_row = template_records[
                    template_index % len(template_records)
                ].copy()
                template_index += 1
                household_id = next_ids["household"]
                next_ids["household"] += 1
                person_id = next_ids["person"]
                next_ids["person"] += 1

                # Fresh ids for every entity; the explicit household/person
                # assignments below overwrite those columns afterwards.
                for entity_name, columns in ENTITY_ID_COLUMNS.items():
                    entity_id = next_ids[entity_name]
                    next_ids[entity_name] += 1
                    for raw_column in columns:
                        column = _period_column(raw_column, base_year)
                        base_row[column] = entity_id

                base_row[household_id_col] = household_id
                base_row[_period_column("person_household_id", base_year)] = (
                    household_id
                )
                base_row[person_id_col] = person_id
                base_row[age_col] = float(target_age)
                # All SS goes to the retirement component; others are zeroed.
                base_row[_period_column("social_security_retirement", base_year)] = (
                    float(ss_target)
                )
                base_row[_period_column("social_security_disability", base_year)] = 0.0
                base_row[_period_column("social_security_survivors", base_year)] = 0.0
                base_row[_period_column("social_security_dependents", base_year)] = 0.0
                # All payroll goes to employment income.
                base_row[employment_col] = float(payroll_target)
                base_row[self_employment_col] = 0.0
                if qbi_col in base_row:
                    base_row[qbi_col] = 0.0
                if household_weight_col in base_row:
                    base_row[household_weight_col] = (
                        float(base_row[household_weight_col]) * rule.clone_weight_scale
                    )
                if person_weight_col in base_row:
                    base_row[person_weight_col] = (
                        float(base_row[person_weight_col]) * rule.clone_weight_scale
                    )
                synthetic_rows.append(base_row)

    synthetic_df = pd.DataFrame(synthetic_rows, columns=input_df.columns)
    return (
        synthetic_df,
        next_ids,
        {
            "rule": rule.name,
            "template_household_count": int(len(template_ids)),
            "payroll_donor_household_count": int(len(donor_ids)),
            "synthetic_household_count": int(synthetic_df[household_id_col].nunique()),
            "synthetic_person_count": int(len(synthetic_df)),
            "target_age_count": int(len(rule.target_ages)),
            "ss_grid_size": int(len(rule.ss_quantiles)),
            "payroll_grid_size": int(len(rule.payroll_quantiles)),
            "payroll_scale_factor_count": int(len(rule.payroll_scale_factors)),
        },
    )
+
+
def synthesize_mixed_age_households(
    input_df: pd.DataFrame,
    *,
    base_year: int,
    summary: pd.DataFrame,
    rule: MixedAgeAppendRule,
    id_counters: dict[str, int] | None = None,
) -> tuple[pd.DataFrame, dict[str, int], dict[str, Any]]:
    """Clone recipient households and append a donor earner to each clone.

    Unlike synthesize_composite_households (which adds payroll dollars to an
    existing member), this appends the donor household's chosen earner as a
    new person in the cloned household, in their own non-household subunits.

    Returns:
        The synthetic person rows (clones plus appended donor persons),
        updated id counters, and a report dict.
    """
    recipient_ids = select_households_for_composite_rule(
        summary,
        min_max_age=rule.recipient_min_max_age,
        max_max_age=rule.recipient_max_max_age,
        ss_state=rule.recipient_ss_state,
        payroll_state=rule.recipient_payroll_state,
    )
    donor_ids = select_households_for_composite_rule(
        summary,
        min_max_age=rule.donor_min_max_age,
        max_max_age=rule.donor_max_max_age,
        ss_state=rule.donor_ss_state,
        payroll_state=rule.donor_payroll_state,
    )
    recipient_pairs = _quantile_pair_households(recipient_ids, donor_ids, summary)
    if not recipient_pairs:
        # No viable pairing: empty frame plus a zeroed report.
        return (
            input_df.iloc[0:0].copy(),
            id_counters.copy() if id_counters is not None else {},
            {
                "rule": rule.name,
                "recipient_household_count": int(len(recipient_ids)),
                "donor_household_count": int(len(donor_ids)),
                "synthetic_household_count": 0,
                "synthetic_person_count": 0,
            },
        )

    # Clone the recipients unchanged (age_shift=0) with scaled weights.
    recipient_clone_df, next_ids = clone_households_with_age_shift(
        input_df,
        base_year=base_year,
        household_ids=pd.Index([recipient for recipient, _ in recipient_pairs]),
        age_shift=0,
        clone_weight_scale=rule.clone_weight_scale,
        id_counters=id_counters,
    )

    household_id_col = _period_column("household_id", base_year)
    person_weight_col = _period_column("person_weight", base_year)

    # Map each original recipient household id to its cloned counterpart.
    # NOTE(review): relies on the two pd.unique() orderings corresponding
    # positionally — looks right since both derive from row order; confirm.
    original_recipients = pd.unique(
        input_df[
            input_df[household_id_col].isin(
                [recipient for recipient, _ in recipient_pairs]
            )
        ][household_id_col]
    )
    cloned_household_ids = pd.unique(recipient_clone_df[household_id_col])
    cloned_mapping = {
        int(original): int(cloned)
        for original, cloned in zip(original_recipients, cloned_household_ids)
    }

    donor_person_rows: list[pd.Series] = []
    for recipient_household_id, donor_household_id in recipient_pairs:
        cloned_household_id = cloned_mapping[int(recipient_household_id)]
        cloned_household_rows = recipient_clone_df[
            recipient_clone_df[household_id_col] == cloned_household_id
        ]
        donor_rows = input_df[input_df[household_id_col] == int(donor_household_id)]
        # Pick the donor household's payroll-bearing person to append.
        donor_row_idx = _select_payroll_target_row(
            donor_rows,
            base_year=base_year,
        )
        # The appended person inherits the clone's (already scaled) weights.
        household_weight = float(
            cloned_household_rows.iloc[0][_period_column("household_weight", base_year)]
        )
        person_weight = (
            float(cloned_household_rows.iloc[0][person_weight_col])
            if person_weight_col in cloned_household_rows.columns
            else household_weight
        )
        donor_row, next_ids = _clone_single_donor_person_row(
            donor_rows.loc[donor_row_idx],
            base_year=base_year,
            shared_household_id=cloned_household_id,
            household_weight=household_weight,
            person_weight=person_weight,
            donor_age_shift=rule.donor_age_shift,
            id_counters=next_ids,
        )
        donor_person_rows.append(donor_row)

    donor_df = pd.DataFrame(donor_person_rows, columns=input_df.columns)
    synthetic_df = pd.concat([recipient_clone_df, donor_df], ignore_index=True)
    return (
        synthetic_df,
        next_ids,
        {
            "rule": rule.name,
            "recipient_household_count": int(len(recipient_ids)),
            "donor_household_count": int(len(donor_ids)),
            "synthetic_household_count": int(synthetic_df[household_id_col].nunique()),
            "synthetic_person_count": int(len(synthetic_df)),
            "donor_age_shift": rule.donor_age_shift,
        },
    )
+
+
def augment_input_dataframe(
    input_df: pd.DataFrame,
    *,
    base_year: int,
    profile: str | SupportAugmentationProfile,
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Append synthetic households to *input_df* per an augmentation profile.

    Each rule in the profile contributes cloned or synthesized person rows;
    the result is the original frame plus all synthetic frames, together with
    a report describing what every rule produced.

    Args:
        input_df: Person-level input dataframe (one row per person).
        base_year: Year whose period-suffixed columns are read and written.
        profile: Profile name or an already-resolved profile object.

    Returns:
        Tuple of (augmented dataframe, report dict with per-rule details).
    """
    profile_obj = (
        get_support_augmentation_profile(profile)
        if isinstance(profile, str)
        else profile
    )
    summary = household_support_summary(input_df, base_year=base_year)

    clone_frames: list[pd.DataFrame] = []
    rule_reports: list[dict[str, Any]] = []
    # Seed ID counters from the existing data so synthetic entities get
    # fresh IDs (presumably _next_entity_id returns max + 1 — confirm there).
    id_counters = {
        entity_name: _next_entity_id(input_df[_period_column(columns[0], base_year)])
        for entity_name, columns in ENTITY_ID_COLUMNS.items()
    }
    id_counters["person"] = _next_entity_id(
        input_df[_period_column(PERSON_ID_COLUMN, base_year)]
    )
    # id_counters is threaded through every builder call so IDs stay unique
    # across rules; each branch rebinds it from the builder's return value.
    for rule in profile_obj.rules:
        if isinstance(rule, AgeShiftCloneRule):
            donor_households = select_donor_households(summary, rule)
            clone_df, id_counters = clone_households_with_age_shift(
                input_df,
                base_year=base_year,
                household_ids=donor_households,
                age_shift=rule.age_shift,
                clone_weight_scale=rule.clone_weight_scale,
                id_counters=id_counters,
            )
            clone_frames.append(clone_df)
            rule_reports.append(
                {
                    "rule": rule.name,
                    "donor_household_count": int(len(donor_households)),
                    # nunique() on an empty frame would be 0 anyway, but the
                    # guard avoids touching a possibly column-less frame.
                    "clone_household_count": int(
                        clone_df[_period_column("household_id", base_year)].nunique()
                    )
                    if not clone_df.empty
                    else 0,
                    "clone_person_count": int(len(clone_df)),
                    "age_shift": rule.age_shift,
                    "clone_weight_scale": rule.clone_weight_scale,
                }
            )
            continue

        if isinstance(rule, CompositePayrollRule):
            composite_df, id_counters, composite_report = (
                synthesize_composite_households(
                    input_df,
                    base_year=base_year,
                    summary=summary,
                    rule=rule,
                    id_counters=id_counters,
                )
            )
            clone_frames.append(composite_df)
            rule_reports.append(composite_report)
            continue

        if isinstance(rule, MixedAgeAppendRule):
            mixed_df, id_counters, mixed_report = synthesize_mixed_age_households(
                input_df,
                base_year=base_year,
                summary=summary,
                rule=rule,
                id_counters=id_counters,
            )
            clone_frames.append(mixed_df)
            rule_reports.append(mixed_report)
            continue

        # Any other rule type falls through to the single-person grid builder.
        synthetic_df, id_counters, synthetic_report = (
            synthesize_single_person_grid_households(
                input_df,
                base_year=base_year,
                summary=summary,
                rule=rule,
                id_counters=id_counters,
            )
        )
        clone_frames.append(synthetic_df)
        rule_reports.append(synthetic_report)

    if clone_frames:
        augmented_df = pd.concat([input_df, *clone_frames], ignore_index=True)
    else:
        # No rules produced rows; copy so callers never mutate the input.
        augmented_df = input_df.copy()

    report = {
        "profile": profile_obj.name,
        "description": profile_obj.description,
        "base_household_count": int(
            input_df[_period_column("household_id", base_year)].nunique()
        ),
        "base_person_count": int(len(input_df)),
        "augmented_household_count": int(
            augmented_df[_period_column("household_id", base_year)].nunique()
        ),
        "augmented_person_count": int(len(augmented_df)),
        "rules": rule_reports,
    }
    return augmented_df, report
+
+
def build_augmented_dataset(
    *,
    base_dataset: str,
    base_year: int,
    profile: str | SupportAugmentationProfile,
) -> tuple[Dataset, dict[str, Any]]:
    """Load *base_dataset*, augment it per *profile*, and wrap the result.

    Returns:
        Tuple of (Dataset built from the augmented dataframe, report dict
        with the base dataset name recorded under ``base_dataset``).
    """
    simulation = Microsimulation(dataset=base_dataset)
    augmented_frame, augmentation_report = augment_input_dataframe(
        simulation.to_input_dataframe(),
        base_year=base_year,
        profile=profile,
    )
    augmentation_report["base_dataset"] = base_dataset
    dataset = Dataset.from_dataframe(augmented_frame, base_year)
    return dataset, augmentation_report
diff --git a/policyengine_us_data/datasets/cps/long_term/tax_assumptions.py b/policyengine_us_data/datasets/cps/long_term/tax_assumptions.py
new file mode 100644
index 000000000..4e29c825f
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/long_term/tax_assumptions.py
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+import math
+from typing import Any
+
+
# The single registered long-run tax-side assumption package.  Downstream
# code looks it up by the "name" key (see
# get_long_run_tax_assumption_metadata below).
TRUSTEES_CORE_THRESHOLD_ASSUMPTION = {
    "name": "trustees-core-thresholds-v1",
    "description": (
        "Best-public Trustees tax-side approximation: keep Social Security "
        "benefit-tax thresholds fixed, but wage-index core ordinary federal "
        "tax thresholds after 2034."
    ),
    "source": "SSA 2025 Trustees Report V.C.7",
    # First year in which wage indexing replaces the default uprating.
    "start_year": 2035,
    # Parameter families covered by create_wage_indexed_core_thresholds_reform.
    "parameter_groups": [
        "ordinary_income_brackets",
        "standard_deduction",
        "aged_blind_standard_deduction",
        "capital_gains_thresholds",
        "amt_thresholds",
    ],
}
+
+
+def round_amount(amount: float, rounding: dict | None) -> float:
+ if not rounding:
+ return amount
+
+ interval = float(rounding["interval"])
+ rounding_type = rounding["type"]
+
+ if rounding_type == "downwards":
+ return math.floor(amount / interval) * interval
+ if rounding_type == "nearest":
+ return math.floor(amount / interval + 0.5) * interval
+
+ raise ValueError(f"Unsupported rounding type: {rounding_type}")
+
+
+def _uprating_parameter_name(parameter) -> str | None:
+ metadata = getattr(parameter, "metadata", {})
+ uprating = metadata.get("uprating")
+ if isinstance(uprating, dict):
+ return uprating.get("parameter")
+ return uprating
+
+
def iter_updatable_parameters(
    root,
    *,
    uprating_parameter: str | None = None,
) -> list:
    """Collect leaf ``Parameter`` nodes under *root* that carry uprating metadata.

    Args:
        root: A parameter node; itself and (if available) all of its
            descendants are considered.
        uprating_parameter: When given, keep only parameters whose uprating
            index is exactly this name.

    Returns:
        List of matching parameter objects, in traversal order.
    """
    nodes = [root]
    if hasattr(root, "get_descendants"):
        nodes += list(root.get_descendants())

    selected = []
    for node in nodes:
        # Class-name check avoids importing the engine's Parameter type here.
        if node.__class__.__name__ != "Parameter":
            continue
        index_name = _uprating_parameter_name(node)
        if index_name is None:
            continue
        if uprating_parameter is None or index_name == uprating_parameter:
            selected.append(node)
    return selected
+
+
def apply_wage_growth_to_parameter(
    parameter,
    *,
    nawi,
    start_year: int,
    end_year: int,
) -> None:
    """Wage-index *parameter* in place for every year in [start_year, end_year].

    Each year's value is the prior year's value grown by NAWI growth
    (NAWI(y-1) / NAWI(y-2) — the index levels lag the indexed year), then
    rounded per the parameter's uprating rounding spec and written back via
    ``parameter.update``.  Because the prior year is re-read each iteration,
    updates compound across the range.
    """
    uprating_spec = getattr(parameter, "metadata", {}).get("uprating")
    rounding_spec = (
        uprating_spec.get("rounding") if isinstance(uprating_spec, dict) else None
    )

    for year in range(start_year, end_year + 1):
        prior_value = float(parameter(f"{year - 1}-01-01"))
        wage_growth = float(nawi(f"{year - 1}-01-01")) / float(
            nawi(f"{year - 2}-01-01")
        )
        parameter.update(
            period=f"year:{year}-01-01:1",
            value=round_amount(prior_value * wage_growth, rounding_spec),
        )
+
+
def create_wage_indexed_core_thresholds_reform(
    *,
    start_year: int = 2035,
    end_year: int = 2100,
):
    """Build a Reform that wage-indexes core federal tax thresholds.

    Covers ordinary brackets, the standard deduction (including the
    aged/blind extra), capital-gains thresholds, and AMT amounts.  From
    *start_year* onward each covered parameter grows with NAWI.
    """
    from policyengine_us.model_api import Reform

    def modify_parameters(parameters):
        nawi = parameters.gov.ssa.nawi
        threshold_roots = (
            parameters.gov.irs.income.bracket.thresholds,
            parameters.gov.irs.deductions.standard.amount,
            parameters.gov.irs.deductions.standard.aged_or_blind.amount,
            parameters.gov.irs.capital_gains.thresholds,
            parameters.gov.irs.income.amt.brackets,
            parameters.gov.irs.income.amt.exemption.amount,
            parameters.gov.irs.income.amt.exemption.phase_out.start,
            parameters.gov.irs.income.amt.exemption.separate_limit,
        )

        visited = set()
        for threshold_root in threshold_roots:
            for leaf in iter_updatable_parameters(threshold_root):
                # Roots can overlap, so dedupe by fully-qualified name.
                if leaf.name in visited:
                    continue
                visited.add(leaf.name)
                apply_wage_growth_to_parameter(
                    leaf,
                    nawi=nawi,
                    start_year=start_year,
                    end_year=end_year,
                )
        return parameters

    class reform(Reform):
        def apply(self):
            self.modify_parameters(modify_parameters)

    return reform
+
+
def create_wage_indexed_full_irs_uprating_reform(
    *,
    start_year: int = 2035,
    end_year: int = 2100,
):
    """Build a Reform that wage-indexes every IRS parameter uprated by
    ``gov.irs.uprating``, from *start_year* through *end_year*."""
    from policyengine_us.model_api import Reform

    def modify_parameters(parameters):
        nawi = parameters.gov.ssa.nawi
        visited = set()
        for leaf in iter_updatable_parameters(
            parameters.gov.irs,
            uprating_parameter="gov.irs.uprating",
        ):
            # Dedupe by fully-qualified name in case of repeated nodes.
            if leaf.name in visited:
                continue
            visited.add(leaf.name)
            apply_wage_growth_to_parameter(
                leaf,
                nawi=nawi,
                start_year=start_year,
                end_year=end_year,
            )
        return parameters

    class reform(Reform):
        def apply(self):
            self.modify_parameters(modify_parameters)

    return reform
+
+
def get_long_run_tax_assumption_metadata(
    name: str,
    *,
    end_year: int,
) -> dict[str, Any]:
    """Return a copy of the named assumption's metadata with ``end_year`` set.

    Raises:
        ValueError: if *name* is not a registered long-run tax assumption.
    """
    registered_name = TRUSTEES_CORE_THRESHOLD_ASSUMPTION["name"]
    if name != registered_name:
        raise ValueError(f"Unknown long-run tax assumption: {name}")

    # Shallow copy so callers cannot mutate the module-level constant.
    record = dict(TRUSTEES_CORE_THRESHOLD_ASSUMPTION)
    record["end_year"] = int(end_year)
    return record
diff --git a/policyengine_us_data/storage/README.md b/policyengine_us_data/storage/README.md
index 80d4c1cdc..ac4478902 100644
--- a/policyengine_us_data/storage/README.md
+++ b/policyengine_us_data/storage/README.md
@@ -30,6 +30,13 @@
- `https://www.ssa.gov/oact/solvency/provisions/tables/table_run133.html`
• Notes: Contains OASDI cost projections and taxable payroll data (2025-2100)
+- **long_term_target_sources/**
+ • Source packages for long-term CPS calibration targets
+  • Files:
+    - `trustees_2025_current_law.csv`: explicit frozen copy of the legacy Trustees/current-law target path
+    - `oact_2025_08_05_provisional.csv`: post-OBBBA OACT baseline overlay with a provisional HI bridge
+    - `oasdi_oact_20250805_nominal_delta.csv`: OASDI TOB nominal deltas from the August 5, 2025 OACT letter
+    - `sources.json`: provenance and source metadata for each named package
+ • Notes: `run_household_projection.py --target-source ...` selects from these packages instead of relying on branch-specific data files
+
- **national_and_district_rents_2023.csv**
• Source: Census ACS 5-year estimates (2023), median 2BR rent by congressional district
• Created by: `fetch_cd_rents.py` (requires `CENSUS_API_KEY` environment variable)
diff --git a/policyengine_us_data/storage/long_term_target_sources/oact_2025_08_05_provisional.csv b/policyengine_us_data/storage/long_term_target_sources/oact_2025_08_05_provisional.csv
new file mode 100644
index 000000000..c73592790
--- /dev/null
+++ b/policyengine_us_data/storage/long_term_target_sources/oact_2025_08_05_provisional.csv
@@ -0,0 +1,77 @@
+year,oasdi_cost_in_billion_2025_usd,cpi_w_intermediate,oasdi_cost_in_billion_nominal_usd,taxable_payroll_in_billion_nominal_usd,h6_income_rate_change,oasdi_tob_pct_of_taxable_payroll,oasdi_tob_billions_nominal_usd,hi_tob_billions_nominal_usd
+2025,1609,100.0,1609.0,10621,0.0,0.5370464174748141,57.0397,38.304600179716786
+2026,1660,102.49,1701.334,11129,0.0,0.5444343606793063,60.59009999999999,41.186853903042184
+2027,1715,104.95,1799.8925,11627,0.0,0.5637885955104498,65.5517,48.095931607707655
+2028,1763,107.47,1894.6961,12159,0.0,0.5852512542149847,71.16069999999999,52.62050146630209
+2029,1810,110.05,1991.905,12696,0.0,0.6373660995589162,80.92,60.16481033396345
+2030,1856,112.69,2091.5264,13239,0.0,0.6689644232948108,88.5642,65.69059107415735
+2031,1903,115.4,2196.062,13798,0.0,0.6983896216843021,96.3638,71.59442052614531
+2032,1947,118.17,2300.7699,14380,0.0,0.7315159944367178,105.19200000000001,78.27395310947746
+2033,1991,121.0,2409.11,14987,0.0,0.7539080536464937,112.9882,85.07501730492007
+2034,2032,123.91,2517.8512,15594,0.0,0.7854726176734642,122.4866,92.24537468746983
+2035,2073,126.88,2630.2224,16205,0.0,0.7981795742054921,129.345,99.04787709554665
+2036,2114,129.93,2746.7202,16825,0.0,0.8113372956909362,136.5075,105.50951161765427
+2037,2155,133.04,2867.012,17465,0.0,0.8149527626681936,142.3315,111.90107428672101
+2038,2194,136.24,2989.1056,18132,0.0,0.8278976395323187,150.11440000000002,118.48385077067687
+2039,2233,139.51,3115.2583,18819,0.0,0.8412598969126945,158.3167,125.23374749266783
+2040,2270,142.86,3242.922,19532,0.0,0.8550112635674789,167.0008,132.13926202500207
+2041,2306,146.28,3373.2168,20269,0.0,0.8581015343628201,173.92860000000002,138.9165967996406
+2042,2342,149.79,3508.0818,21035,0.0,0.861559305918707,181.229,145.92339950639513
+2043,2378,153.39,3647.6142,21828,0.0,0.8748671431189299,190.966,153.25093791653404
+2044,2415,157.07,3793.2405,22653,0.0,0.8780448505716681,198.90349999999998,160.68590468556505
+2045,2452,160.84,3943.7968,23507,-0.07,0.891084357850853,209.46720000000002,168.5513909261071
+2046,2488,164.7,4097.736,24391,-0.12,0.8939920462465664,218.05360000000002,176.52152325652904
+2047,2527,168.65,4261.7855,25313,-0.18,0.896791372022281,227.0048,184.82403098539618
+2048,2567,172.7,4433.209,26270,-0.23,0.9094746859535591,238.91899999999998,193.69560564165155
+2049,2609,176.85,4614.0165,27263,-0.27,0.9120459964053844,248.65109999999999,202.86065543346197
+2050,2652,181.09,4802.5068,28300,-0.32,0.9241696113074204,261.53999999999996,216.92146837816398
+2051,2696,185.44,4999.4624,29376,-0.36,0.9265550108932461,272.1848,227.17710670657155
+2052,2743,189.89,5208.6827,30494,-0.4,0.9385144618613499,286.1906,238.0205394497212
+2053,2792,194.44,5428.7648,31661,-0.43,0.9407280250150026,297.84389999999996,249.44686394215998
+2054,2842,199.11,5658.7062,32869,-1.0,0.9525388664090784,313.09,261.4423901244333
+2055,2895,203.89,5902.6155,34124,-1.01,0.9645774235142422,329.1524,274.24655199383494
+2056,2950,208.78,6159.01,35432,-1.01,0.966254233461278,342.3632,287.5390828182389
+2057,3007,213.79,6428.6653,36790,-1.02,0.9778689861375374,359.758,301.67833439393695
+2058,3066,218.93,6712.3938,38201,-1.03,0.9896869191905971,378.07030000000003,316.6402407536223
+2059,3125,224.18,7005.625,39670,-1.04,1.0011797327955634,397.168,332.06147451086855
+2060,3184,229.56,7309.1904,41196,-1.04,1.0026177298766872,413.0384,347.92088498883385
+2061,3243,235.07,7623.3201,42782,-1.05,1.0140035528960778,433.81100000000004,364.3585166503047
+2062,3303,240.71,7950.6513,44429,-1.06,1.025563033154021,455.6474,381.58588180463465
+2063,3362,246.49,8286.9938,46136,-1.06,1.0268371770417895,473.7416,399.16745314590827
+2064,3422,252.4,8637.128,47902,-1.07,1.038268548286084,497.35139999999996,417.5120418767757
+2065,3483,258.46,9002.1618,49733,-1.07,1.0394367924717993,516.9431,436.4536805413806
+2066,3544,264.66,9379.5504,51631,-1.08,1.050754004377215,542.5147999999999,456.34830162327114
+2067,3607,271.02,9775.6914,53598,-1.09,1.0618273069890667,569.1182,477.000102597283
+2068,3670,277.52,10184.984,55637,-1.09,1.0630395240577315,591.4433,498.52457562710316
+2069,3735,284.18,10614.123,57746,-1.1,1.0741973470023898,620.306,521.0794614282611
+2070,3801,291.0,11060.91,59930,-1.1,1.0751376606040381,644.33,544.3715195000228
+2071,3867,297.99,11523.2733,62196,-1.11,1.0862042575085213,675.5756,568.6915353613309
+2072,3934,305.14,12004.2076,64543,-1.12,1.0970695505322035,708.0816000000001,593.8535429669257
+2073,4002,312.46,12504.6492,66975,-1.12,1.0980515117581187,735.42,620.2432200447928
+2074,4071,319.96,13025.5716,69501,-1.13,1.1089931080128343,770.7613,647.5960347191796
+2075,4139,327.64,13561.0196,72131,-1.13,1.1097590495071465,800.4802999999999,675.8383507115801
+2076,4206,335.5,14111.13,74862,-1.14,1.1206310277577407,838.9268,704.8661033532109
+2077,4273,343.55,14679.8915,77698,-1.14,1.1214667044196762,871.3572,734.6364395845724
+2078,4339,351.8,15264.602,80650,-1.14,1.1222690638561688,905.11,765.1167787494154
+2079,4403,360.24,15861.3672,83727,-1.15,1.1330401184802992,948.6605,796.6188317534055
+2080,4467,368.89,16478.3163,86933,-1.15,1.1337806126557235,985.6295,828.9385545905168
+2081,4530,377.74,17111.622,90268,-1.15,1.1344906279080074,1024.082,862.0944955793473
+2082,4593,386.81,17766.1833,93749,-1.15,1.1351731751805352,1064.2134999999998,896.2517963396247
+2083,4655,396.09,18437.9895,97381,-1.15,1.1358288577853997,1106.0815,931.4300349246773
+2084,4716,405.6,19128.096,101163,-1.15,1.1365563496535294,1149.7745,967.5207711639717
+2085,4775,415.33,19832.0075,105104,-1.15,1.1371555792358043,1195.196,1003.870068209045
+2086,4833,425.3,20554.749,109217,-1.14,1.127730847761795,1231.6737999999998,1041.3446863480701
+2087,4891,435.51,21300.7941,113504,-1.14,1.1283704539047081,1280.7456,1079.6614728458444
+2088,4948,445.96,22066.1008,117973,-1.14,1.1288957642850483,1331.7922,1119.009014012573
+2089,5006,456.66,22860.3996,122629,-1.13,1.1194804654690163,1372.8076999999998,1159.6737769420636
+2090,5064,467.62,23680.2768,127477,-1.13,1.1199589729912063,1427.6901,1201.7744537535523
+2091,5125,478.84,24540.55,132518,-1.13,1.1204918577098961,1484.8534000000002,1245.7033522589752
+2092,5188,490.34,25438.8392,137764,-1.12,1.1109265120060392,1530.4568,1291.5522519728356
+2093,5254,502.1,26380.334,143215,-1.12,1.1113416890688823,1591.608,1339.5725734983864
+2094,5323,514.16,27368.7368,148876,-1.12,1.1117380907600958,1655.1112,1390.0478250557512
+2095,5396,526.49,28409.4004,154754,-1.12,1.1121811390981817,1721.1448,1443.5177747752655
+2096,5472,539.13,29501.1936,160855,-1.12,1.1125398650958938,1789.576,1499.387856294711
+2097,5551,552.07,30645.4057,167185,-1.11,1.102882136555313,1843.8535,1557.8239986469107
+2098,5633,565.32,31844.4756,173750,-1.11,1.1032086330935251,1916.825,1619.4655487069804
+2099,5719,578.89,33106.7191,180557,-1.12,1.113464667667274,2010.4384,1684.301218679657
+2100,5809,592.78,34434.5902,187614,-1.12,1.113710490688328,2089.4768,1751.6199945298017
diff --git a/policyengine_us_data/storage/long_term_target_sources/oasdi_oact_20250805_nominal_delta.csv b/policyengine_us_data/storage/long_term_target_sources/oasdi_oact_20250805_nominal_delta.csv
new file mode 100644
index 000000000..c5aabfd19
--- /dev/null
+++ b/policyengine_us_data/storage/long_term_target_sources/oasdi_oact_20250805_nominal_delta.csv
@@ -0,0 +1,76 @@
+year,oasdi_nominal_delta_billions
+2025,-3.5
+2026,-16.2
+2027,-17.0
+2028,-17.6
+2029,-14.3
+2030,-14.7
+2031,-15.4
+2032,-15.6
+2033,-15.9
+2034,-16.3
+2035,-16.5
+2036,-16.6
+2037,-16.6
+2038,-16.7
+2039,-16.7
+2040,-16.6
+2041,-16.6
+2042,-16.5
+2043,-16.4
+2044,-16.3
+2045,-16.2
+2046,-16.1
+2047,-16.0
+2048,-15.9
+2049,-15.8
+2050,-15.8
+2051,-15.7
+2052,-15.7
+2053,-15.6
+2054,-15.6
+2055,-15.5
+2056,-15.5
+2057,-15.5
+2058,-15.4
+2059,-15.4
+2060,-15.4
+2061,-15.4
+2062,-15.3
+2063,-15.3
+2064,-15.2
+2065,-15.2
+2066,-15.1
+2067,-15.1
+2068,-15.0
+2069,-14.9
+2070,-14.9
+2071,-14.8
+2072,-14.8
+2073,-14.7
+2074,-14.6
+2075,-14.6
+2076,-14.5
+2077,-14.4
+2078,-14.3
+2079,-14.2
+2080,-14.1
+2081,-14.0
+2082,-13.9
+2083,-13.8
+2084,-13.6
+2085,-13.5
+2086,-13.4
+2087,-13.2
+2088,-13.1
+2089,-12.9
+2090,-12.8
+2091,-12.6
+2092,-12.5
+2093,-12.4
+2094,-12.3
+2095,-12.1
+2096,-12.0
+2097,-11.9
+2098,-11.8
+2099,-11.8
diff --git a/policyengine_us_data/storage/long_term_target_sources/sources.json b/policyengine_us_data/storage/long_term_target_sources/sources.json
new file mode 100644
index 000000000..62dd15e1a
--- /dev/null
+++ b/policyengine_us_data/storage/long_term_target_sources/sources.json
@@ -0,0 +1,35 @@
+{
+ "default_source": "trustees_2025_current_law",
+ "sources": {
+ "oact_2025_08_05_provisional": {
+ "derived_from": "trustees_2025_current_law",
+ "description": "Post-OBBBA SSA OACT baseline overlay with provisional HI bridge for long-term calibration experiments.",
+ "file": "oact_2025_08_05_provisional.csv",
+ "hi_method": "match_oasdi_pct_change",
+ "name": "oact_2025_08_05_provisional",
+ "notes": [
+ "OASDI TOB nominal deltas are taken from the August 5, 2025 OACT letter.",
+ "2100 OASDI delta is carried forward from 2099 because the published delta table ends at 2099.",
+ "HI TOB series is provisional: it applies the same percentage change as OASDI TOB to preserve the OASDI/HI share split until a published annual HI replacement series is available."
+ ],
+ "source_urls": [
+ "https://www.ssa.gov/OACT/solvency/RWyden_20250805.pdf",
+ "https://www.ssa.gov/oact/tr/2025/lrIndex.html"
+ ],
+ "type": "oact_override"
+ },
+ "trustees_2025_current_law": {
+ "description": "2025 Trustees current-law baseline used by the legacy long-term calibration stack.",
+ "file": "trustees_2025_current_law.csv",
+ "name": "trustees_2025_current_law",
+ "notes": [
+ "Generated from social_security_aux.csv for explicit source selection."
+ ],
+ "source_urls": [
+ "https://www.ssa.gov/oact/tr/2025/lrIndex.html",
+ "https://www.ssa.gov/oact/solvency/provisions/tables/table_run133.html"
+ ],
+ "type": "trustees_current_law"
+ }
+ }
+}
diff --git a/policyengine_us_data/storage/long_term_target_sources/trustees_2025_current_law.csv b/policyengine_us_data/storage/long_term_target_sources/trustees_2025_current_law.csv
new file mode 100644
index 000000000..cc5d66108
--- /dev/null
+++ b/policyengine_us_data/storage/long_term_target_sources/trustees_2025_current_law.csv
@@ -0,0 +1,77 @@
+year,oasdi_cost_in_billion_2025_usd,cpi_w_intermediate,oasdi_cost_in_billion_nominal_usd,taxable_payroll_in_billion_nominal_usd,h6_income_rate_change,oasdi_tob_pct_of_taxable_payroll,oasdi_tob_billions_nominal_usd,hi_tob_billions_nominal_usd
+2025,1609,100.0,1609.0,10621,0.0,0.57,60.5397,40.655
+2026,1660,102.49,1701.334,11129,0.0,0.69,76.7901,52.199
+2027,1715,104.95,1799.8925,11627,0.0,0.71,82.5517,60.569
+2028,1763,107.47,1894.6961,12159,0.0,0.73,88.7607,65.635
+2029,1810,110.05,1991.905,12696,0.0,0.75,95.22,70.797
+2030,1856,112.69,2091.5264,13239,0.0,0.78,103.2642,76.594
+2031,1903,115.4,2196.062,13798,0.0,0.81,111.7638,83.036
+2032,1947,118.17,2300.7699,14380,0.0,0.84,120.792,89.882
+2033,1991,121.0,2409.11,14987,0.0,0.86,128.8882,97.047
+2034,2032,123.91,2517.8512,15594,0.0,0.89,138.7866,104.521
+2035,2073,126.88,2630.2224,16205,0.0,0.9,145.845,111.683
+2036,2114,129.93,2746.7202,16825,0.0,0.91,153.1075,118.34
+2037,2155,133.04,2867.012,17465,0.0,0.91,158.9315,124.952
+2038,2194,136.24,2989.1056,18132,0.0,0.92,166.8144,131.665
+2039,2233,139.51,3115.2583,18819,0.0,0.93,175.0167,138.444
+2040,2270,142.86,3242.922,19532,0.0,0.94,183.6008,145.274
+2041,2306,146.28,3373.2168,20269,0.0,0.94,190.5286,152.175
+2042,2342,149.79,3508.0818,21035,0.0,0.94,197.729,159.209
+2043,2378,153.39,3647.6142,21828,0.0,0.95,207.366,166.412
+2044,2415,157.07,3793.2405,22653,0.0,0.95,215.2035,173.854
+2045,2452,160.84,3943.7968,23507,-0.07,0.96,225.6672,181.587
+2046,2488,164.7,4097.736,24391,-0.12,0.96,234.1536,189.555
+2047,2527,168.65,4261.7855,25313,-0.18,0.96,243.0048,197.851
+2048,2567,172.7,4433.209,26270,-0.23,0.97,254.819,206.586
+2049,2609,176.85,4614.0165,27263,-0.27,0.97,264.4511,215.751
+2050,2652,181.09,4802.5068,28300,-0.32,0.98,277.34,230.026
+2051,2696,185.44,4999.4624,29376,-0.36,0.98,287.8848,240.281
+2052,2743,189.89,5208.6827,30494,-0.4,0.99,301.8906,251.078
+2053,2792,194.44,5428.7648,31661,-0.43,0.99,313.4439,262.512
+2054,2842,199.11,5658.7062,32869,-1.0,1.0,328.69,274.469
+2055,2895,203.89,5902.6155,34124,-1.01,1.01,344.6524,287.161
+2056,2950,208.78,6159.01,35432,-1.01,1.01,357.8632,300.557
+2057,3007,213.79,6428.6653,36790,-1.02,1.02,375.258,314.676
+2058,3066,218.93,6712.3938,38201,-1.03,1.03,393.4703,329.538
+2059,3125,224.18,7005.625,39670,-1.04,1.04,412.568,344.937
+2060,3184,229.56,7309.1904,41196,-1.04,1.04,428.4384,360.893
+2061,3243,235.07,7623.3201,42782,-1.05,1.05,449.211,377.293
+2062,3303,240.71,7950.6513,44429,-1.06,1.06,470.9474,394.399
+2063,3362,246.49,8286.9938,46136,-1.06,1.06,489.0416,412.059
+2064,3422,252.4,8637.128,47902,-1.07,1.07,512.5514,430.272
+2065,3483,258.46,9002.1618,49733,-1.07,1.07,532.1431,449.287
+2066,3544,264.66,9379.5504,51631,-1.08,1.08,557.6148,469.05
+2067,3607,271.02,9775.6914,53598,-1.09,1.09,584.2182,489.656
+2068,3670,277.52,10184.984,55637,-1.09,1.09,606.4433,511.168
+2069,3735,284.18,10614.123,57746,-1.1,1.1,635.206,533.596
+2070,3801,291.0,11060.91,59930,-1.1,1.1,659.23,556.96
+2071,3867,297.99,11523.2733,62196,-1.11,1.11,690.3756,581.15
+2072,3934,305.14,12004.2076,64543,-1.12,1.12,722.8816,606.266
+2073,4002,312.46,12504.6492,66975,-1.12,1.12,750.12,632.641
+2074,4071,319.96,13025.5716,69501,-1.13,1.13,785.3613,659.863
+2075,4139,327.64,13561.0196,72131,-1.13,1.13,815.0803,688.165
+2076,4206,335.5,14111.13,74862,-1.14,1.14,853.4268,717.049
+2077,4273,343.55,14679.8915,77698,-1.14,1.14,885.7572,746.777
+2078,4339,351.8,15264.602,80650,-1.14,1.14,919.41,777.205
+2079,4403,360.24,15861.3672,83727,-1.15,1.15,962.8605,808.543
+2080,4467,368.89,16478.3163,86933,-1.15,1.15,999.7295,840.797
+2081,4530,377.74,17111.622,90268,-1.15,1.15,1038.082,873.88
+2082,4593,386.81,17766.1833,93749,-1.15,1.15,1078.1135,907.958
+2083,4655,396.09,18437.9895,97381,-1.15,1.15,1119.8815,943.051
+2084,4716,405.6,19128.096,101163,-1.15,1.15,1163.3745,978.965
+2085,4775,415.33,19832.0075,105104,-1.15,1.15,1208.696,1015.209
+2086,4833,425.3,20554.749,109217,-1.14,1.14,1245.0738,1052.674
+2087,4891,435.51,21300.7941,113504,-1.14,1.14,1293.9456,1090.789
+2088,4948,445.96,22066.1008,117973,-1.14,1.14,1344.8922,1130.016
+2089,5006,456.66,22860.3996,122629,-1.13,1.13,1385.7077,1170.571
+2090,5064,467.62,23680.2768,127477,-1.13,1.13,1440.4901,1212.549
+2091,5125,478.84,24540.55,132518,-1.13,1.13,1497.4534,1256.274
+2092,5188,490.34,25438.8392,137764,-1.12,1.12,1542.9568,1302.101
+2093,5254,502.1,26380.334,143215,-1.12,1.12,1604.008,1350.009
+2094,5323,514.16,27368.7368,148876,-1.12,1.12,1667.4112,1400.378
+2095,5396,526.49,28409.4004,154754,-1.12,1.12,1733.2448,1453.666
+2096,5472,539.13,29501.1936,160855,-1.12,1.12,1801.576,1509.442
+2097,5551,552.07,30645.4057,167185,-1.11,1.11,1855.7535,1567.878
+2098,5633,565.32,31844.4756,173750,-1.11,1.11,1928.625,1629.435
+2099,5719,578.89,33106.7191,180557,-1.12,1.12,2022.2384,1694.187
+2100,5809,592.78,34434.5902,187614,-1.12,1.12,2101.2768,1761.512
diff --git a/policyengine_us_data/utils/takeup.py b/policyengine_us_data/utils/takeup.py
index b8db8c90a..f2cf5d2a8 100644
--- a/policyengine_us_data/utils/takeup.py
+++ b/policyengine_us_data/utils/takeup.py
@@ -84,6 +84,12 @@
if spec.get("target") is not None
}
# CMS 2025 Marketplace OEP State-Level Public Use File, Total / All row.
# This is the number of consumers receiving APTC in plan year 2025.
# Keyed by plan year; used as the post-calibration weighted-person target
# when extending ACA takeup (see extend_aca_takeup_to_match_target).
ACA_POST_CALIBRATION_PERSON_TARGETS = {
    2025: 22_380_137,
}
+
# FIPS -> 2-letter state code for Medicaid rate lookup
_FIPS_TO_STATE_CODE = {
1: "AL",
@@ -154,45 +160,38 @@ def _resolve_rate(
return float(rate_or_dict)
-def compute_block_takeup_for_entities(
+def compute_block_takeup_draws_for_entities(
var_name: str,
- rate_or_dict,
entity_blocks: np.ndarray,
entity_hh_ids: np.ndarray,
- entity_clone_indices: np.ndarray,
+ entity_clone_indices: np.ndarray | None = None,
) -> np.ndarray:
- """Compute boolean takeup via clone-seeded draws.
+ """Compute deterministic uniform draws for entity-level takeup.
- Each unique (hh_id, clone_idx) pair gets its own seeded RNG,
- producing reproducible draws tied to the donor household and
- independent across clones. The rate varies by state (derived
- from the block GEOID).
+ Each unique (household id, clone index) pair gets its own seeded RNG,
+ producing reproducible draws that are stable for a given donor household
+ and independent across clones. Rates are applied separately by the caller
+ after resolving state FIPS from the block GEOID.
Args:
var_name: Takeup variable name.
- rate_or_dict: Scalar rate or {state_code: rate} dict.
- entity_blocks: Block GEOID per entity (str array),
- used only for state FIPS rate resolution.
+ entity_blocks: Block GEOID per entity (str array).
entity_hh_ids: Original household ID per entity.
- entity_clone_indices: Clone index per entity. For the
- matrix builder (single clone), a scalar broadcast
- via np.full. For the H5 builder (all clones),
- a per-entity array.
+ entity_clone_indices: Clone index per entity.
Returns:
- Boolean array of shape (n_entities,).
+ Float array of shape (n_entities,) in [0, 1).
"""
n = len(entity_blocks)
draws = np.zeros(n, dtype=np.float64)
- rates = np.ones(n, dtype=np.float64)
+ if entity_clone_indices is None:
+ entity_clone_indices = np.zeros(n, dtype=np.int64)
- # Resolve rates from block state FIPS
+ # Iterate block groups first so draws stay stable within geography slices.
for block in np.unique(entity_blocks):
if block == "":
continue
blk_mask = entity_blocks == block
- sf = int(str(block)[:2])
- rates[blk_mask] = _resolve_rate(rate_or_dict, sf)
# Draw per (hh_id, clone_idx) pair
for hh_id in np.unique(entity_hh_ids):
@@ -203,9 +202,70 @@ def compute_block_takeup_for_entities(
rng = seeded_rng(var_name, salt=f"{int(hh_id)}:{int(ci)}")
draws[ci_mask] = rng.random(n_ent)
+ return draws
+
+
def compute_block_takeup_for_entities(
    var_name: str,
    rate_or_dict,
    entity_blocks: np.ndarray,
    entity_hh_ids: np.ndarray,
    entity_clone_indices: np.ndarray | None = None,
) -> np.ndarray:
    """Compute boolean takeup via block-level seeded draws.

    Args:
        var_name: Takeup variable name (seeds the per-household RNG).
        rate_or_dict: Scalar rate or {state_code: rate} dict, resolved per
            entity from the state FIPS prefix of its block GEOID.
        entity_blocks: Block GEOID per entity (str array).
        entity_hh_ids: Original household ID per entity.  Required: the
            draw helper seeds one RNG per (household id, clone index) pair.
        entity_clone_indices: Clone index per entity; defaults to all zeros
            (the single-clone case).  Keeps the pre-existing keyword name
            so existing callers are unaffected.

    Returns:
        Boolean array of shape (n_entities,): draw < resolved rate.
        Entities with an empty GEOID keep the default rate of 1.0, which
        every draw in [0, 1) satisfies.
    """
    draws = compute_block_takeup_draws_for_entities(
        var_name,
        entity_blocks,
        entity_hh_ids,
        entity_clone_indices,
    )
    rates = np.ones(len(entity_blocks), dtype=np.float64)

    # Resolve rates from the block's leading 2-digit state FIPS.
    for block in np.unique(entity_blocks):
        if block == "":
            continue
        blk_mask = entity_blocks == block
        rates[blk_mask] = _resolve_rate(rate_or_dict, int(str(block)[:2]))

    return draws < rates
def extend_aca_takeup_to_match_target(
    base_takeup: np.ndarray,
    entity_draws: np.ndarray,
    enrolled_person_weights: np.ndarray,
    target_people: float,
) -> np.ndarray:
    """Flip additional tax units to takers until enrollment reaches target.

    ``enrolled_person_weights`` holds, per tax unit, the weighted number of
    people who would receive ACA PTC if that unit takes up coverage.  Units
    already taking up are preserved; extra units are added in ascending-draw
    order, stopping with the first unit whose addition carries the running
    total to ``target_people`` (the final total may overshoot).
    """
    takeup = base_takeup.copy()
    covered_people = float(enrolled_person_weights[takeup].sum())
    if covered_people >= target_people:
        return takeup

    # Only units that are off and would actually enroll someone can help.
    candidate_idx = np.flatnonzero(~takeup & (enrolled_person_weights > 0))
    if candidate_idx.size == 0:
        return takeup

    # Deterministic priority: smallest seeded draw first.
    priority = candidate_idx[
        np.argsort(entity_draws[candidate_idx], kind="stable")
    ]
    running_total = covered_people + np.cumsum(
        enrolled_person_weights[priority]
    )
    cutoff = int(np.searchsorted(running_total, target_people, side="left")) + 1
    takeup[priority[:cutoff]] = True
    return takeup
+
+
def apply_block_takeup_to_arrays(
hh_blocks: np.ndarray,
hh_state_fips: np.ndarray,
diff --git a/tests/integration/test_enhanced_cps.py b/tests/integration/test_enhanced_cps.py
index 8be42ab89..b82b66b01 100644
--- a/tests/integration/test_enhanced_cps.py
+++ b/tests/integration/test_enhanced_cps.py
@@ -1,5 +1,6 @@
"""Integration tests for Enhanced CPS dataset (requires enhanced_cps_2024.h5)."""
+import numpy as np
import pytest
@@ -259,6 +260,25 @@ def test_aca_calibration():
assert not failed, f"One or more states exceeded tolerance of {TOLERANCE:.0%}."
def test_aca_2025_takeup_override_helper():
    from policyengine_us_data.datasets.cps.enhanced_cps import (
        create_aca_2025_takeup_override,
    )

    base = np.array([True, False, False], dtype=bool)
    override = create_aca_2025_takeup_override(
        base_takeup=base,
        person_enrolled_if_takeup=np.array([True, True, True, True], dtype=bool),
        person_weights=np.array([2.0, 1.0, 3.0, 4.0], dtype=np.float64),
        person_tax_unit_ids=np.array([10, 10, 11, 12], dtype=np.int64),
        tax_unit_ids=np.array([10, 11, 12], dtype=np.int64),
        target_people=6.0,
    )

    # Base takers must remain takers, and exactly one extra unit is needed
    # (unit 10 enrolls 3 weighted people; either remaining unit covers 6+).
    assert override.dtype == bool
    assert np.all(override[base])
    assert override.sum() == 2
+
+
def test_immigration_status_diversity():
"""Test that immigration statuses show appropriate diversity (not all citizens)."""
from policyengine_us_data.datasets.cps import EnhancedCPS_2024
diff --git a/tests/unit/calibration/test_unified_calibration.py b/tests/unit/calibration/test_unified_calibration.py
index 3617c9081..1a012d9f9 100644
--- a/tests/unit/calibration/test_unified_calibration.py
+++ b/tests/unit/calibration/test_unified_calibration.py
@@ -14,8 +14,10 @@
from policyengine_us_data.utils.takeup import (
SIMPLE_TAKEUP_VARS,
TAKEUP_AFFECTED_TARGETS,
- compute_block_takeup_for_entities,
apply_block_takeup_to_arrays,
+ compute_block_takeup_draws_for_entities,
+ compute_block_takeup_for_entities,
+ extend_aca_takeup_to_match_target,
_resolve_rate,
)
from policyengine_us_data.calibration.clone_and_assign import (
@@ -199,6 +201,43 @@ def test_different_blocks_different_result(self):
assert differs
+class TestAcaTakeupTargeting:
+    """Verify ACA post-calibration targeting helpers."""
+
+    def test_draw_helper_matches_boolean_helper(self):
+        # The boolean helper should be exactly the raw draws thresholded at
+        # the take-up rate, proving both helpers consume the same seeded
+        # random stream for identical (variable, block, id) inputs.
+        blocks = np.array(["370010001001001"] * 25)
+        hh_ids = np.arange(25, dtype=np.int64)
+        draws = compute_block_takeup_draws_for_entities(
+            "takes_up_aca_if_eligible",
+            blocks,
+            hh_ids,
+        )
+        result = compute_block_takeup_for_entities(
+            "takes_up_aca_if_eligible",
+            0.7,
+            blocks,
+            hh_ids,
+        )
+        np.testing.assert_array_equal(result, draws < 0.7)
+
+    def test_extend_only_adds_true_values_until_target(self):
+        # Weighted enrollment starts at 2.0 (entity 0 is a base taker).
+        # Candidates are added in ascending draw order — idx 2 (0.20),
+        # idx 3 (0.30), idx 1 (0.40) — until the 6.0-person target is met:
+        # 2.0 + 3.0 + 4.0 >= 6.0, so idx 1 is never reached and stays False.
+        base_takeup = np.array([True, False, False, False], dtype=bool)
+        entity_draws = np.array([0.10, 0.40, 0.20, 0.30], dtype=np.float64)
+        enrolled_person_weights = np.array([2.0, 1.0, 3.0, 4.0], dtype=np.float64)
+
+        result = extend_aca_takeup_to_match_target(
+            base_takeup,
+            entity_draws,
+            enrolled_person_weights,
+            target_people=6.0,
+        )
+
+        np.testing.assert_array_equal(
+            result,
+            np.array([True, False, True, True], dtype=bool),
+        )
+
+
class TestResolveRate:
"""Verify _resolve_rate handles scalar and dict rates."""
@@ -627,7 +666,6 @@ def test_matrix_and_stacked_identical_draws(self):
def test_aggregation_entity_to_household(self):
"""np.add.at aggregation matches manual per-HH sum."""
n_hh = 3
- n_ent = 6
ent_hh = np.array([0, 0, 1, 1, 1, 2])
eligible = np.array(
[100.0, 200.0, 50.0, 150.0, 100.0, 300.0],
diff --git a/tests/unit/test_long_term_calibration_contract.py b/tests/unit/test_long_term_calibration_contract.py
new file mode 100644
index 000000000..73c24b4d9
--- /dev/null
+++ b/tests/unit/test_long_term_calibration_contract.py
@@ -0,0 +1,1320 @@
+from __future__ import annotations
+
+import json
+import numpy as np
+import pytest
+from policyengine_core.data.dataset import Dataset
+
+from policyengine_us_data.datasets.cps.long_term import (
+ calibration as calibration_module,
+)
+from policyengine_us_data.datasets.cps.long_term.calibration import (
+ assess_nonnegative_feasibility,
+ build_calibration_audit,
+ calibrate_entropy,
+ calibrate_entropy_bounded,
+ calibrate_weights,
+)
+from policyengine_us_data.datasets.cps.long_term.calibration_artifacts import (
+ normalize_metadata,
+ rebuild_dataset_manifest,
+ update_dataset_manifest,
+ write_support_augmentation_report,
+ write_year_metadata,
+)
+from policyengine_us_data.datasets.cps.long_term.calibration_profiles import (
+ approximate_window_for_year,
+ build_profile_from_flags,
+ classify_calibration_quality,
+ get_profile,
+ validate_calibration_audit,
+)
+from policyengine_us_data.datasets.cps.long_term.projection_utils import (
+ aggregate_age_targets,
+ aggregate_household_age_matrix,
+ build_age_bins,
+ validate_projected_social_security_cap,
+)
+from policyengine_us_data.datasets.cps.long_term.ssa_data import (
+ available_long_term_target_sources,
+ describe_long_term_target_source,
+ load_oasdi_tob_projections,
+ load_taxable_payroll_projections,
+)
+from policyengine_us_data.datasets.cps.long_term.support_augmentation import (
+ AgeShiftCloneRule,
+ CompositePayrollRule,
+ MixedAgeAppendRule,
+ SinglePersonSyntheticGridRule,
+ SupportAugmentationProfile,
+ augment_input_dataframe,
+ household_support_summary,
+ select_donor_households,
+)
+from policyengine_us_data.datasets.cps.long_term.prototype_synthetic_2100_support import (
+ SyntheticCandidate,
+ _compose_role_donor_rows_to_target,
+ build_role_composite_calibration_blueprint,
+ build_role_donor_composites,
+ summarize_realized_clone_translation,
+)
+
+
+class ExplodingCalibrator:
+ def calibrate(self, **kwargs):
+ raise RuntimeError("boom")
+
+
+def _toy_support_dataframe():
+ return json.loads(
+ json.dumps(
+ {
+ "person_id__2024": [101, 102, 201, 202, 301],
+ "household_id__2024": [1, 1, 2, 2, 3],
+ "person_household_id__2024": [1, 1, 2, 2, 3],
+ "family_id__2024": [11.0, 11.0, 21.0, 21.0, 31.0],
+ "person_family_id__2024": [11, 11, 21, 21, 31],
+ "tax_unit_id__2024": [101, 101, 201, 202, 301],
+ "person_tax_unit_id__2024": [101, 101, 201, 202, 301],
+ "spm_unit_id__2024": [1001, 1001, 2001, 2001, 3001],
+ "person_spm_unit_id__2024": [1001, 1001, 2001, 2001, 3001],
+ "marital_unit_id__2024": [501, 501, 601, 602, 701],
+ "person_marital_unit_id__2024": [501, 501, 601, 602, 701],
+ "age__2024": [70.0, 68.0, 80.0, 77.0, 60.0],
+ "household_weight__2024": [10.0, 10.0, 8.0, 8.0, 5.0],
+ "person_weight__2024": [10.0, 10.0, 8.0, 8.0, 5.0],
+ "social_security_retirement__2024": [
+ 20_000.0,
+ 0.0,
+ 30_000.0,
+ 0.0,
+ 0.0,
+ ],
+ "social_security_disability__2024": [0.0, 0.0, 0.0, 0.0, 0.0],
+ "social_security_survivors__2024": [0.0, 0.0, 0.0, 0.0, 0.0],
+ "social_security_dependents__2024": [0.0, 0.0, 0.0, 0.0, 0.0],
+ "employment_income_before_lsr__2024": [
+ 5_000.0,
+ 0.0,
+ 12_000.0,
+ 0.0,
+ 50_000.0,
+ ],
+ "self_employment_income_before_lsr__2024": [
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ 0.0,
+ ],
+ }
+ )
+ )
+
+
+def test_named_profile_lookup():
+ profile = get_profile("ss-payroll-tob")
+ assert profile.calibration_method == "entropy"
+ assert profile.use_greg is False
+ assert profile.use_ss is True
+ assert profile.use_payroll is True
+ assert profile.use_tob is True
+ assert profile.benchmark_tob is False
+ assert profile.use_h6_reform is False
+ assert profile.max_negative_weight_pct == 0.0
+ assert profile.approximate_windows[0].age_bucket_size == 5
+ assert profile.min_positive_household_count == 1000
+ assert profile.min_effective_sample_size == 75.0
+ assert profile.max_top_10_weight_share_pct == 25.0
+ assert profile.max_top_100_weight_share_pct == 95.0
+
+
+def test_support_augmentation_selects_expected_donors():
+ import pandas as pd
+
+ df = pd.DataFrame(_toy_support_dataframe())
+ summary = household_support_summary(df, base_year=2024)
+ rule = AgeShiftCloneRule(
+ name="older_ss_pay",
+ min_max_age=65,
+ max_max_age=74,
+ age_shift=10,
+ ss_state="positive",
+ payroll_state="positive",
+ )
+ donors = select_donor_households(summary, rule)
+ assert list(donors) == [1]
+
+
+def test_support_augmentation_clones_households_with_new_ids():
+ import pandas as pd
+
+ df = pd.DataFrame(_toy_support_dataframe())
+ profile = SupportAugmentationProfile(
+ name="test-profile",
+ description="Toy support augmentation profile.",
+ rules=(
+ AgeShiftCloneRule(
+ name="older_ss_pay",
+ min_max_age=65,
+ max_max_age=74,
+ age_shift=10,
+ ss_state="positive",
+ payroll_state="positive",
+ clone_weight_scale=0.5,
+ ),
+ ),
+ )
+ augmented_df, report = augment_input_dataframe(
+ df,
+ base_year=2024,
+ profile=profile,
+ )
+ assert report["base_household_count"] == 3
+ assert report["augmented_household_count"] == 4
+ cloned_household_ids = set(augmented_df["household_id__2024"].unique()) - {
+ 1,
+ 2,
+ 3,
+ }
+ assert len(cloned_household_ids) == 1
+ cloned_rows = augmented_df[
+ augmented_df["household_id__2024"].isin(cloned_household_ids)
+ ]
+ assert cloned_rows["age__2024"].max() == pytest.approx(80.0)
+ assert cloned_rows["household_weight__2024"].iloc[0] == pytest.approx(5.0)
+ assert cloned_rows["person_id__2024"].min() > df["person_id__2024"].max()
+
+
+def test_support_augmentation_synthesizes_composite_payroll_household():
+ import pandas as pd
+
+ df = pd.DataFrame(_toy_support_dataframe())
+ profile = SupportAugmentationProfile(
+ name="composite-profile",
+ description="Toy composite support augmentation profile.",
+ rules=(
+ CompositePayrollRule(
+ name="older_ss_only_plus_payroll",
+ recipient_min_max_age=75,
+ recipient_max_max_age=84,
+ donor_min_max_age=55,
+ donor_max_max_age=64,
+ recipient_ss_state="positive",
+ recipient_payroll_state="positive",
+ donor_ss_state="nonpositive",
+ donor_payroll_state="positive",
+ payroll_transfer_scale=0.5,
+ clone_weight_scale=0.25,
+ ),
+ ),
+ )
+ augmented_df, report = augment_input_dataframe(
+ df,
+ base_year=2024,
+ profile=profile,
+ )
+ assert report["base_household_count"] == 3
+ assert report["augmented_household_count"] == 4
+ cloned_household_ids = set(augmented_df["household_id__2024"].unique()) - {
+ 1,
+ 2,
+ 3,
+ }
+ assert len(cloned_household_ids) == 1
+ cloned_rows = augmented_df[
+ augmented_df["household_id__2024"].isin(cloned_household_ids)
+ ]
+ assert cloned_rows["age__2024"].max() == pytest.approx(80.0)
+ assert cloned_rows["social_security_retirement__2024"].sum() == pytest.approx(
+ 30_000.0
+ )
+ assert cloned_rows["employment_income_before_lsr__2024"].sum() == pytest.approx(
+ 37_000.0
+ )
+
+
+def test_support_augmentation_appends_single_person_synthetic_grid_households():
+ import pandas as pd
+
+ df = pd.DataFrame(
+ {
+ "person_id__2024": [101, 201, 301],
+ "household_id__2024": [1, 2, 3],
+ "person_household_id__2024": [1, 2, 3],
+ "family_id__2024": [11.0, 21.0, 31.0],
+ "person_family_id__2024": [11, 21, 31],
+ "tax_unit_id__2024": [101, 201, 301],
+ "person_tax_unit_id__2024": [101, 201, 301],
+ "spm_unit_id__2024": [1001, 2001, 3001],
+ "person_spm_unit_id__2024": [1001, 2001, 3001],
+ "marital_unit_id__2024": [501, 601, 701],
+ "person_marital_unit_id__2024": [501, 601, 701],
+ "age__2024": [78.0, 86.0, 60.0],
+ "household_weight__2024": [10.0, 8.0, 5.0],
+ "person_weight__2024": [10.0, 8.0, 5.0],
+ "social_security_retirement__2024": [20_000.0, 24_000.0, 0.0],
+ "social_security_disability__2024": [0.0, 0.0, 0.0],
+ "social_security_survivors__2024": [0.0, 0.0, 0.0],
+ "social_security_dependents__2024": [0.0, 0.0, 0.0],
+ "employment_income_before_lsr__2024": [0.0, 0.0, 50_000.0],
+ "self_employment_income_before_lsr__2024": [0.0, 0.0, 0.0],
+ "w2_wages_from_qualified_business__2024": [0.0, 0.0, 0.0],
+ }
+ )
+ profile = SupportAugmentationProfile(
+ name="grid-profile",
+ description="Toy single-person synthetic grid.",
+ rules=(
+ SinglePersonSyntheticGridRule(
+ name="older_grid",
+ template_min_max_age=75,
+ template_max_max_age=86,
+ target_ages=(77, 85),
+ ss_quantiles=(0.5,),
+ payroll_quantiles=(0.5,),
+ template_ss_state="positive",
+ template_payroll_state="any",
+ payroll_donor_min_max_age=55,
+ payroll_donor_max_max_age=64,
+ clone_weight_scale=0.2,
+ ),
+ ),
+ )
+ augmented_df, report = augment_input_dataframe(
+ df,
+ base_year=2024,
+ profile=profile,
+ )
+ assert report["base_household_count"] == 3
+ assert report["augmented_household_count"] == 5
+ synthetic_household_ids = set(augmented_df["household_id__2024"].unique()) - {
+ 1,
+ 2,
+ 3,
+ }
+ assert len(synthetic_household_ids) == 2
+ synthetic_rows = augmented_df[
+ augmented_df["household_id__2024"].isin(synthetic_household_ids)
+ ]
+ assert set(synthetic_rows["age__2024"].tolist()) == {77.0, 85.0}
+ assert set(synthetic_rows["social_security_retirement__2024"].tolist()) == {
+ 22_000.0
+ }
+ assert set(synthetic_rows["employment_income_before_lsr__2024"].tolist()) == {
+ 50_000.0
+ }
+
+
+def test_support_augmentation_appends_mixed_age_household():
+ import pandas as pd
+
+ df = pd.DataFrame(_toy_support_dataframe())
+ profile = SupportAugmentationProfile(
+ name="mixed-age-profile",
+ description="Toy mixed-age household support augmentation profile.",
+ rules=(
+ MixedAgeAppendRule(
+ name="older_plus_younger_earner",
+ recipient_min_max_age=75,
+ recipient_max_max_age=84,
+ donor_min_max_age=55,
+ donor_max_max_age=64,
+ recipient_ss_state="positive",
+ recipient_payroll_state="any",
+ donor_ss_state="nonpositive",
+ donor_payroll_state="positive",
+ clone_weight_scale=0.2,
+ ),
+ ),
+ )
+ augmented_df, report = augment_input_dataframe(
+ df,
+ base_year=2024,
+ profile=profile,
+ )
+ assert report["base_household_count"] == 3
+ assert report["augmented_household_count"] == 4
+ synthetic_household_ids = set(augmented_df["household_id__2024"].unique()) - {
+ 1,
+ 2,
+ 3,
+ }
+ assert len(synthetic_household_ids) == 1
+ synthetic_rows = augmented_df[
+ augmented_df["household_id__2024"].isin(synthetic_household_ids)
+ ]
+ assert sorted(synthetic_rows["age__2024"].tolist()) == [60.0, 77.0, 80.0]
+ assert synthetic_rows["social_security_retirement__2024"].sum() == pytest.approx(
+ 30_000.0
+ )
+ assert synthetic_rows["employment_income_before_lsr__2024"].sum() == pytest.approx(
+ 62_000.0
+ )
+ assert synthetic_rows["tax_unit_id__2024"].nunique() == 3
+
+
+def test_role_donor_composites_build_structural_candidate_from_role_donors():
+ import pandas as pd
+
+ candidates = [
+ SyntheticCandidate(
+ archetype="older_plus_prime_worker_family",
+ head_age=67,
+ spouse_age=42,
+ dependent_ages=(10,),
+ head_wages=0.0,
+ spouse_wages=100_000.0,
+ head_ss=40_000.0,
+ spouse_ss=0.0,
+ pension_income=0.0,
+ dividend_income=0.0,
+ )
+ ]
+ actual_summary = pd.DataFrame(
+ [
+ {
+ "tax_unit_id": 1,
+ "head_age": 70.0,
+ "spouse_age": None,
+ "adult_count": 1,
+ "dependent_count": 0,
+ "dependent_ages": (),
+ "head_payroll": 0.0,
+ "spouse_payroll": 0.0,
+ "head_ss": 40_000.0,
+ "spouse_ss": 0.0,
+ "payroll_total": 0.0,
+ "ss_total": 40_000.0,
+ "dividend_income": 2_000.0,
+ "pension_income": 8_000.0,
+ "support_count_weight": 1.0,
+ "person_weight_proxy": 1.0,
+ "archetype": "older_beneficiary_single",
+ },
+ {
+ "tax_unit_id": 2,
+ "head_age": 41.0,
+ "spouse_age": 39.0,
+ "adult_count": 2,
+ "dependent_count": 1,
+ "dependent_ages": (10,),
+ "head_payroll": 60_000.0,
+ "spouse_payroll": 40_000.0,
+ "head_ss": 0.0,
+ "spouse_ss": 0.0,
+ "payroll_total": 100_000.0,
+ "ss_total": 0.0,
+ "dividend_income": 0.0,
+ "pension_income": 0.0,
+ "support_count_weight": 1.0,
+ "person_weight_proxy": 1.0,
+ "archetype": "prime_worker_family",
+ },
+ ]
+ )
+
+ composite_candidates, prior_weights, report = build_role_donor_composites(
+ candidates,
+ np.array([1.0]),
+ actual_summary,
+ ss_scale=1.0,
+ earnings_scale=1.0,
+ top_n_targets=1,
+ older_donors_per_target=1,
+ worker_donors_per_target=1,
+ )
+
+ assert len(composite_candidates) == 1
+ assert composite_candidates[0].archetype.endswith("_role_donor")
+ assert composite_candidates[0].spouse_wages == pytest.approx(100_000.0)
+ assert composite_candidates[0].head_ss == pytest.approx(40_000.0)
+ assert prior_weights.tolist() == pytest.approx([1.0])
+ assert report["skipped_targets"] == []
+
+
+def test_age_bin_helpers_preserve_population_totals():
+ bins = build_age_bins(n_ages=86, bucket_size=5)
+ assert bins[0] == (0, 5)
+ assert bins[-1] == (85, 86)
+
+ X = np.eye(86)
+ y = np.arange(86, dtype=float)
+ X_coarse = aggregate_household_age_matrix(X, bins)
+ y_coarse = aggregate_age_targets(y, bins)
+
+ assert X_coarse.shape == (86, 18)
+ assert y_coarse.shape == (18,)
+ assert X_coarse.sum() == pytest.approx(X.sum())
+ assert y_coarse.sum() == pytest.approx(y.sum())
+
+ target_matrix = np.column_stack([y, y * 2])
+ aggregated_target_matrix = aggregate_age_targets(target_matrix, bins)
+ assert aggregated_target_matrix.shape == (18, 2)
+ assert aggregated_target_matrix[:, 0].sum() == pytest.approx(y.sum())
+ assert aggregated_target_matrix[:, 1].sum() == pytest.approx((y * 2).sum())
+
+
+def test_validate_projected_social_security_cap_rejects_flat_tail():
+ from types import SimpleNamespace
+
+ def accessor(year: int):
+ cap = 254_400.0 if year >= 2035 else 186_000.0
+ return SimpleNamespace(
+ gov=SimpleNamespace(
+ irs=SimpleNamespace(
+ payroll=SimpleNamespace(social_security=SimpleNamespace(cap=cap))
+ )
+ )
+ )
+
+ with pytest.raises(RuntimeError, match="flat after 2035"):
+ validate_projected_social_security_cap(accessor, 2100)
+
+
+def test_role_composite_calibration_blueprint_reweights_clone_priors():
+ report = {
+ "target_year": 2100,
+ "clone_household_reports": [
+ {
+ "clone_household_id": 1001,
+ "target_head_age": 70,
+ "target_spouse_age": 68,
+ "target_dependent_ages": [12],
+ "target_ss_total": 20_000.0,
+ "target_payroll_total": 50_000.0,
+ "per_clone_weight_share_pct": 60.0,
+ },
+ {
+ "clone_household_id": 1002,
+ "target_head_age": 80,
+ "target_spouse_age": None,
+ "target_dependent_ages": [],
+ "target_ss_total": 30_000.0,
+ "target_payroll_total": 10_000.0,
+ "per_clone_weight_share_pct": 40.0,
+ },
+ ],
+ }
+ baseline_weights = np.array([10.0, 20.0, 30.0], dtype=float)
+ blueprint = build_role_composite_calibration_blueprint(
+ report,
+ year=2100,
+ age_bins=build_age_bins(n_ages=86, bucket_size=5),
+ hh_id_to_idx={999: 0, 1001: 1, 1002: 2},
+ baseline_weights=baseline_weights,
+ base_weight_scale=0.5,
+ )
+
+ assert blueprint is not None
+ assert blueprint["baseline_weights"].tolist() == pytest.approx([5.0, 36.0, 24.0])
+ assert blueprint["ss_overrides"] == {1: 20_000.0, 2: 30_000.0}
+ assert blueprint["payroll_overrides"] == {1: 50_000.0, 2: 10_000.0}
+ assert blueprint["age_overrides"][1].sum() == pytest.approx(3.0)
+ assert blueprint["age_overrides"][2].sum() == pytest.approx(1.0)
+ assert blueprint["summary"]["clone_household_count"] == 2
+ assert blueprint["summary"]["base_weight_scale"] == pytest.approx(0.5)
+
+
+def test_legacy_flags_map_to_named_profile():
+ profile = build_profile_from_flags(
+ use_greg=False,
+ use_ss=True,
+ use_payroll=True,
+ use_h6_reform=False,
+ use_tob=True,
+ )
+ assert profile.name == "custom-greg-ss-payroll-tob"
+ assert profile.calibration_method == "greg"
+ assert profile.use_greg is True
+
+
+def test_approximate_window_none_selects_open_ended_tail():
+ profile = get_profile("ss-payroll-tob")
+ window = approximate_window_for_year(profile, None)
+ assert window is not None
+ assert window.start_year == 2096
+ assert window.end_year is None
+
+
+def test_strict_greg_failure_raises():
+ X = np.array([[1.0, 0.0], [0.0, 1.0]])
+ y_target = np.array([1.0, 1.0])
+ baseline_weights = np.array([1.0, 1.0])
+
+ with pytest.raises(RuntimeError, match="fallback was disabled"):
+ calibrate_weights(
+ X=X,
+ y_target=y_target,
+ baseline_weights=baseline_weights,
+ method="greg",
+ calibrator=ExplodingCalibrator(),
+ allow_fallback_to_ipf=False,
+ )
+
+
+def test_build_calibration_audit_reports_constraint_error():
+ X = np.array([[1.0, 0.0], [0.0, 1.0]])
+ y_target = np.array([1.0, 1.0])
+ baseline_weights = np.array([1.0, 1.0])
+ weights = np.array([1.0, 1.0])
+ audit = build_calibration_audit(
+ X=X,
+ y_target=y_target,
+ weights=weights,
+ baseline_weights=baseline_weights,
+ calibration_event={
+ "method_requested": "greg",
+ "method_used": "greg",
+ "greg_attempted": True,
+ "greg_error": None,
+ "fell_back_to_ipf": False,
+ },
+ payroll_values=np.array([10.0, 0.0]),
+ payroll_target=20.0,
+ )
+
+ assert audit["constraints"]["payroll_total"]["achieved"] == 10.0
+ assert audit["constraints"]["payroll_total"]["pct_error"] == -50.0
+ assert audit["positive_weight_count"] == 2
+ assert audit["positive_weight_pct"] == 100.0
+ assert audit["negative_weight_household_pct"] == 0.0
+ assert audit["effective_sample_size"] == pytest.approx(2.0)
+ assert audit["top_10_weight_share_pct"] == pytest.approx(100.0)
+ assert audit["top_100_weight_share_pct"] == pytest.approx(100.0)
+
+
+def test_profile_validation_rejects_fallback_and_large_error():
+ profile = build_profile_from_flags(
+ use_greg=True,
+ use_ss=True,
+ use_payroll=True,
+ use_h6_reform=False,
+ use_tob=False,
+ )
+ audit = {
+ "fell_back_to_ipf": True,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.0,
+ "constraints": {
+ "payroll_total": {"pct_error": 0.2},
+ },
+ }
+
+ issues = validate_calibration_audit(audit, profile)
+ assert "GREG calibration fell back to IPF" in issues
+ assert any("payroll_total error" in issue for issue in issues)
+
+
+def test_classify_calibration_quality_marks_invalid_audit_approximate():
+ profile = get_profile("ss-payroll-tob")
+ quality = classify_calibration_quality(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.0,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 0.5},
+ },
+ },
+ profile,
+ year=2078,
+ )
+ assert quality == "approximate"
+
+
+def test_entropy_profile_rejects_negative_weights():
+ profile = get_profile("ss-payroll-tob")
+ issues = validate_calibration_audit(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.01,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 0.0},
+ },
+ },
+ profile,
+ )
+ assert any("Negative weight share" in issue for issue in issues)
+
+
+def test_support_thresholds_reject_concentrated_weights():
+ profile = get_profile("ss-payroll-tob")
+ issues = validate_calibration_audit(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.0,
+ "positive_weight_count": 90,
+ "effective_sample_size": 57.6,
+ "top_10_weight_share_pct": 26.6,
+ "top_100_weight_share_pct": 100.0,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 0.0},
+ },
+ },
+ profile,
+ year=2075,
+ quality="exact",
+ )
+ assert any("Positive household count" in issue for issue in issues)
+ assert any("Top-10 weight share" in issue for issue in issues)
+ assert any("Top-100 weight share" in issue for issue in issues)
+
+
+def test_classify_calibration_quality_marks_support_collapse_aggregate():
+ profile = get_profile("ss-payroll-tob")
+ quality = classify_calibration_quality(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.0,
+ "positive_weight_count": 6840,
+ "effective_sample_size": 24.98,
+ "top_10_weight_share_pct": 54.8,
+ "top_100_weight_share_pct": 97.4,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 0.0},
+ },
+ },
+ profile,
+ year=2075,
+ )
+ assert quality == "aggregate"
+
+
+def test_approximate_window_is_year_bounded():
+ profile = get_profile("ss-payroll-tob")
+ quality = classify_calibration_quality(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 3.0,
+ "negative_weight_pct": 0.0,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 3.0},
+ },
+ },
+ profile,
+ year=2080,
+ )
+ assert quality == "approximate"
+
+ quality = classify_calibration_quality(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 3.0,
+ "negative_weight_pct": 0.0,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 3.0},
+ },
+ },
+ profile,
+ year=2035,
+ )
+ assert quality == "aggregate"
+
+
+def test_normalize_metadata_harmonizes_lp_fallback_labels():
+ profile = get_profile("ss-payroll-tob")
+ metadata = normalize_metadata(
+ {
+ "year": 2075,
+ "profile": profile.to_dict(),
+ "calibration_audit": {
+ "lp_fallback_used": True,
+ "approximation_method": "lp_minimax_exact",
+ "approximate_solution_error_pct": 0.0,
+ "max_constraint_pct_error": 0.368,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.0,
+ "constraints": {
+ "ss_total": {"pct_error": 0.368},
+ "payroll_total": {"pct_error": 0.0},
+ },
+ },
+ }
+ )
+
+ audit = metadata["calibration_audit"]
+ assert audit["calibration_quality"] == "approximate"
+ assert audit["approximation_method"] == "lp_minimax"
+ assert audit["approximate_solution_used"] is True
+ assert audit["approximate_solution_error_pct"] == pytest.approx(0.368)
+
+
+def test_manifest_updates_and_rejects_profile_mismatch(tmp_path):
+ profile = get_profile("ss-payroll-tob")
+ audit = {
+ "method_used": "greg",
+ "fell_back_to_ipf": False,
+ "negative_weight_pct": 1.5,
+ }
+
+ year_2026 = tmp_path / "2026.h5"
+ year_2026.write_text("", encoding="utf-8")
+ metadata_2026 = write_year_metadata(
+ year_2026,
+ year=2026,
+ base_dataset_path="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+ profile=profile.to_dict(),
+ calibration_audit=audit,
+ )
+ manifest_path = update_dataset_manifest(
+ tmp_path,
+ year=2026,
+ h5_path=year_2026,
+ metadata_path=metadata_2026,
+ base_dataset_path="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+ profile=profile.to_dict(),
+ calibration_audit=audit,
+ )
+
+ year_2027 = tmp_path / "2027.h5"
+ year_2027.write_text("", encoding="utf-8")
+ metadata_2027 = write_year_metadata(
+ year_2027,
+ year=2027,
+ base_dataset_path="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+ profile=profile.to_dict(),
+ calibration_audit=audit,
+ )
+ update_dataset_manifest(
+ tmp_path,
+ year=2027,
+ h5_path=year_2027,
+ metadata_path=metadata_2027,
+ base_dataset_path="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+ profile=profile.to_dict(),
+ calibration_audit=audit,
+ )
+
+ manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+ assert manifest["profile"]["name"] == "ss-payroll-tob"
+ assert manifest["years"] == [2026, 2027]
+ assert manifest["datasets"]["2026"]["metadata"] == "2026.h5.metadata.json"
+
+ with pytest.raises(ValueError, match="different calibration profile"):
+ update_dataset_manifest(
+ tmp_path,
+ year=2028,
+ h5_path=tmp_path / "2028.h5",
+ metadata_path=tmp_path / "2028.h5.metadata.json",
+ base_dataset_path="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+ profile=get_profile("ss").to_dict(),
+ calibration_audit=audit,
+ )
+
+ manifest_path.unlink()
+ rebuilt_path = rebuild_dataset_manifest(tmp_path)
+ rebuilt = json.loads(rebuilt_path.read_text(encoding="utf-8"))
+ assert rebuilt["years"] == [2026, 2027]
+
+
+def test_hard_target_tob_affects_quality_classification():
+ profile = get_profile("ss-payroll-tob")
+ quality = classify_calibration_quality(
+ {
+ "fell_back_to_ipf": False,
+ "age_max_pct_error": 0.0,
+ "negative_weight_pct": 0.0,
+ "constraints": {
+ "ss_total": {"pct_error": 0.0},
+ "payroll_total": {"pct_error": 0.0},
+ "oasdi_tob": {"pct_error": 12.0},
+ "hi_tob": {"pct_error": -9.0},
+ },
+ },
+ profile,
+ year=2035,
+ )
+ assert quality == "aggregate"
+
+
+def test_entropy_calibration_produces_nonnegative_weights_and_hits_targets():
+ X = np.array(
+ [
+ [1.0, 0.0],
+ [0.0, 1.0],
+ [1.0, 1.0],
+ ]
+ )
+ y_target = np.array([2.0, 3.0])
+ baseline_weights = np.array([1.0, 1.0, 1.0])
+ payroll_values = np.array([1.0, 0.0, 2.0])
+ payroll_target = 3.5
+
+ weights, _ = calibrate_entropy(
+ X=X,
+ y_target=y_target,
+ baseline_weights=baseline_weights,
+ payroll_values=payroll_values,
+ payroll_target=payroll_target,
+ n_ages=2,
+ )
+
+ assert np.all(weights > 0)
+ np.testing.assert_allclose(X.T @ weights, y_target, rtol=1e-8, atol=1e-8)
+ np.testing.assert_allclose(
+ np.dot(payroll_values, weights), payroll_target, rtol=1e-8, atol=1e-8
+ )
+
+
+def test_bounded_entropy_calibration_returns_positive_approximate_weights():
+ X = np.array(
+ [
+ [1.0, 0.0, 1.0],
+ [0.0, 1.0, 1.0],
+ ]
+ )
+ y_target = np.array([1.0, 1.0, 3.0])
+ baseline_weights = np.array([1.0, 1.0])
+
+ weights, _, info = calibrate_entropy_bounded(
+ X=X,
+ y_target=y_target,
+ baseline_weights=baseline_weights,
+ n_ages=3,
+ max_constraint_error_pct=40.0,
+ )
+
+ assert info["best_case_max_pct_error"] <= 40.0
+ assert np.all(weights > 0)
+
+
+def test_entropy_calibration_prefers_bounded_entropy_over_lp_approximate_solution():
+ X = np.array(
+ [
+ [1.0, 0.0, 1.0],
+ [0.0, 1.0, 1.0],
+ ]
+ )
+ y_target = np.array([1.0, 1.0, 3.0])
+ baseline_weights = np.array([1.0, 1.0])
+
+ weights, _, audit = calibrate_weights(
+ X=X,
+ y_target=y_target,
+ baseline_weights=baseline_weights,
+ method="entropy",
+ n_ages=3,
+ allow_approximate_entropy=True,
+ approximate_max_error_pct=40.0,
+ )
+
+ assert audit["approximate_solution_used"] is True
+ assert audit["approximation_method"] == "bounded_entropy"
+ assert audit["approximate_solution_error_pct"] > 10.0
+ assert np.all(weights > 0)
+
+
+def test_entropy_calibration_uses_lp_exact_fallback_even_before_approximate_window(
+ monkeypatch,
+):
+ monkeypatch.setattr(
+ calibration_module,
+ "calibrate_entropy",
+ lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("entropy stalled")),
+ )
+ monkeypatch.setattr(
+ calibration_module,
+ "calibrate_lp_minimax",
+ lambda *args, **kwargs: (
+ np.array([1.0, 2.0]),
+ 1,
+ {"best_case_max_pct_error": 0.0},
+ ),
+ )
+
+ weights, _, audit = calibrate_weights(
+ X=np.array([[1.0], [0.0]]),
+ y_target=np.array([1.0]),
+ baseline_weights=np.array([1.0, 1.0]),
+ method="entropy",
+ n_ages=1,
+ allow_approximate_entropy=False,
+ )
+
+ np.testing.assert_allclose(weights, np.array([1.0, 2.0]))
+ assert audit["lp_fallback_used"] is True
+ assert audit["approximate_solution_used"] is False
+ assert audit["approximation_method"] == "lp_minimax_exact"
+
+
+def test_nonnegative_feasibility_diagnostic_distinguishes_feasible_and_infeasible():
+ feasible_A = np.array(
+ [
+ [1.0, 0.0],
+ [0.0, 1.0],
+ [1.0, 1.0],
+ ]
+ )
+ feasible_targets = np.array([1.0, 2.0, 3.0])
+ feasible = assess_nonnegative_feasibility(feasible_A, feasible_targets)
+ assert feasible["success"] is True
+ assert feasible["best_case_max_pct_error"] < 1e-6
+
+ infeasible_A = np.array(
+ [
+ [1.0, 0.0],
+ [0.0, 1.0],
+ [1.0, 1.0],
+ ]
+ )
+ infeasible_targets = np.array([1.0, 1.0, 3.0])
+ infeasible = assess_nonnegative_feasibility(infeasible_A, infeasible_targets)
+ assert infeasible["success"] is True
+ assert infeasible["best_case_max_pct_error"] > 10.0
+
+
+def test_long_term_target_sources_are_available_and_distinct():
+ sources = available_long_term_target_sources()
+ assert "trustees_2025_current_law" in sources
+ assert "oact_2025_08_05_provisional" in sources
+
+ trustees = describe_long_term_target_source("trustees_2025_current_law")
+ assert trustees["file"] == "trustees_2025_current_law.csv"
+
+ payroll_2026 = load_taxable_payroll_projections(
+ 2026,
+ source_name="trustees_2025_current_law",
+ )
+ assert payroll_2026 == pytest.approx(11_129_000_000_000.0)
+
+ trustees_oasdi_2026 = load_oasdi_tob_projections(
+ 2026,
+ source_name="trustees_2025_current_law",
+ )
+ oact_oasdi_2026 = load_oasdi_tob_projections(
+ 2026,
+ source_name="oact_2025_08_05_provisional",
+ )
+ assert oact_oasdi_2026 < trustees_oasdi_2026
+
+
+def test_normalize_metadata_backfills_validation_passed():
+ metadata = normalize_metadata(
+ {
+ "year": 2091,
+ "profile": {"name": "ss-payroll-tob"},
+ "calibration_audit": {
+ "lp_fallback_used": True,
+ "approximation_method": "lp_blend",
+ "approximate_solution_error_pct": 16.0,
+ "max_constraint_pct_error": 16.0,
+ "age_max_pct_error": 14.5,
+ "negative_weight_pct": 0.0,
+ "positive_weight_count": 6840,
+ "effective_sample_size": 12.0,
+ "top_10_weight_share_pct": 80.0,
+ "top_100_weight_share_pct": 99.0,
+ "constraints": {
+ "ss_total": {"pct_error": 14.5},
+ "payroll_total": {"pct_error": 16.0},
+ },
+ },
+ }
+ )
+
+ audit = metadata["calibration_audit"]
+ assert audit["validation_passed"] is False
+ assert isinstance(audit["validation_issues"], list)
+ assert len(audit["validation_issues"]) > 0
+
+
def test_manifest_contains_invalid_artifacts_flag(tmp_path):
    """The manifest flags invalid artifacts once any year fails validation.

    A valid 2030 year leaves ``contains_invalid_artifacts`` False; adding an
    invalid 2091 year flips it to True while per-year verdicts are preserved.
    """
    profile = get_profile("ss-payroll-tob")

    passing_audit = {
        "method_used": "entropy",
        "fell_back_to_ipf": False,
        "age_max_pct_error": 0.0,
        "negative_weight_pct": 0.0,
        "positive_weight_count": 70000,
        "effective_sample_size": 5000.0,
        "top_10_weight_share_pct": 1.5,
        "top_100_weight_share_pct": 10.0,
        "max_constraint_pct_error": 0.0,
        "constraints": {},
        "validation_passed": True,
        "validation_issues": [],
    }

    failing_audit = {
        "method_used": "entropy",
        "fell_back_to_ipf": False,
        "age_max_pct_error": 14.0,
        "negative_weight_pct": 0.0,
        "positive_weight_count": 6840,
        "effective_sample_size": 12.0,
        "top_10_weight_share_pct": 80.0,
        "top_100_weight_share_pct": 99.0,
        "max_constraint_pct_error": 16.0,
        "constraints": {"payroll_total": {"pct_error": 16.0}},
        "validation_passed": False,
        "validation_issues": ["ESS too low"],
    }

    def register_year(year, audit):
        # Write a placeholder .h5 plus its metadata file, then fold the
        # year into the dataset manifest; returns the manifest path.
        h5_path = tmp_path / f"{year}.h5"
        h5_path.write_text("", encoding="utf-8")
        metadata_path = write_year_metadata(
            h5_path,
            year=year,
            base_dataset_path="test.h5",
            profile=profile.to_dict(),
            calibration_audit=audit,
        )
        return update_dataset_manifest(
            tmp_path,
            year=year,
            h5_path=h5_path,
            metadata_path=metadata_path,
            base_dataset_path="test.h5",
            profile=profile.to_dict(),
            calibration_audit=audit,
        )

    # First year (2030) passes validation: flag stays clear.
    manifest_path = register_year(2030, passing_audit)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest["contains_invalid_artifacts"] is False

    # Second year (2091) fails validation: flag trips, verdicts persist.
    register_year(2091, failing_audit)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest["contains_invalid_artifacts"] is True
    assert manifest["datasets"]["2030"]["validation_passed"] is True
    assert manifest["datasets"]["2091"]["validation_passed"] is False
+
+
def test_manifest_persists_support_augmentation_metadata(tmp_path):
    """Support-augmentation details survive into both metadata and manifest."""
    profile = get_profile("ss-payroll-tob")
    audit = {
        "method_used": "entropy",
        "fell_back_to_ipf": False,
        "age_max_pct_error": 0.0,
        "negative_weight_pct": 0.0,
        "positive_weight_count": 70000,
        "effective_sample_size": 5000.0,
        "top_10_weight_share_pct": 1.5,
        "top_100_weight_share_pct": 10.0,
        "max_constraint_pct_error": 0.0,
        "constraints": {},
        "validation_passed": True,
        "validation_issues": [],
    }
    support_augmentation = {
        "name": "donor-backed-synthetic-v1",
        "activation_start_year": 2075,
        "target_year": 2100,
        "report_file": "support_augmentation_report.json",
        "report_summary": {
            "base_household_count": 41314,
            "augmented_household_count": 41326,
        },
    }

    artifact_path = tmp_path / "2100.h5"
    artifact_path.write_text("", encoding="utf-8")
    metadata_path = write_year_metadata(
        artifact_path,
        year=2100,
        base_dataset_path="test.h5",
        profile=profile.to_dict(),
        calibration_audit=audit,
        support_augmentation=support_augmentation,
    )
    manifest_path = update_dataset_manifest(
        tmp_path,
        year=2100,
        h5_path=artifact_path,
        metadata_path=metadata_path,
        base_dataset_path="test.h5",
        profile=profile.to_dict(),
        calibration_audit=audit,
        support_augmentation=support_augmentation,
    )

    persisted_metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    persisted_manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    augmentation = persisted_metadata["support_augmentation"]
    assert augmentation["name"] == "donor-backed-synthetic-v1"
    assert augmentation["report_file"] == "support_augmentation_report.json"
    summary = persisted_manifest["support_augmentation"]["report_summary"]
    assert summary["augmented_household_count"] == 41326
+
+
def test_manifest_persists_tax_assumption_metadata(tmp_path):
    """Tax-assumption details survive into both metadata and manifest."""
    profile = get_profile("ss-payroll-tob")
    audit = {
        "method_used": "entropy",
        "fell_back_to_ipf": False,
        "age_max_pct_error": 0.0,
        "negative_weight_pct": 0.0,
        "positive_weight_count": 70000,
        "effective_sample_size": 5000.0,
        "top_10_weight_share_pct": 1.5,
        "top_100_weight_share_pct": 10.0,
        "max_constraint_pct_error": 0.0,
        "constraints": {},
        "validation_passed": True,
        "validation_issues": [],
    }
    tax_assumption = {
        "name": "trustees-core-thresholds-v1",
        "start_year": 2035,
        "end_year": 2100,
    }

    artifact_path = tmp_path / "2100.h5"
    artifact_path.write_text("", encoding="utf-8")
    metadata_path = write_year_metadata(
        artifact_path,
        year=2100,
        base_dataset_path="test.h5",
        profile=profile.to_dict(),
        calibration_audit=audit,
        tax_assumption=tax_assumption,
    )
    manifest_path = update_dataset_manifest(
        tmp_path,
        year=2100,
        h5_path=artifact_path,
        metadata_path=metadata_path,
        base_dataset_path="test.h5",
        profile=profile.to_dict(),
        calibration_audit=audit,
        tax_assumption=tax_assumption,
    )

    persisted_metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    persisted_manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert (
        persisted_metadata["tax_assumption"]["name"]
        == "trustees-core-thresholds-v1"
    )
    assert persisted_manifest["tax_assumption"]["end_year"] == 2100
+
+
def test_write_support_augmentation_report(tmp_path):
    """Report writer emits JSON at the default filename and round-trips."""
    payload = {
        "name": "donor-backed-composite-v1",
        "clone_household_count": 2,
        "clone_household_reports": [{"clone_household_id": 1001}],
    }
    written_path = write_support_augmentation_report(tmp_path, payload)
    assert written_path == tmp_path / "support_augmentation_report.json"
    round_tripped = json.loads(written_path.read_text(encoding="utf-8"))
    assert round_tripped["clone_household_count"] == 2
    first_clone = round_tripped["clone_household_reports"][0]
    assert first_clone["clone_household_id"] == 1001
+
+
def test_write_support_augmentation_report_custom_filename(tmp_path):
    """An explicit filename overrides the writer's default report name."""
    payload = {"name": "dynamic-augmentation", "target_year": 2090}
    written_path = write_support_augmentation_report(
        tmp_path,
        payload,
        filename="support_augmentation_report_2090.json",
    )
    assert written_path == tmp_path / "support_augmentation_report_2090.json"
    round_tripped = json.loads(written_path.read_text(encoding="utf-8"))
    assert round_tripped["target_year"] == 2090
+
+
def test_summarize_realized_clone_translation_matches_toy_clone():
    """A toy clone is matched exactly by the realized-translation summary.

    The toy support dataframe contains the clone household verbatim, so the
    summary should report one match and zero aggregate SS/payroll error.
    """
    import pandas as pd

    toy_dataset = Dataset.from_dataframe(
        pd.DataFrame(_toy_support_dataframe()), 2024
    )
    clone_report = {
        "candidate_idx": 0,
        "archetype": "older_worker_couple",
        "clone_household_id": 1,
        "clone_tax_unit_id": 101,
        "target_head_age": 70,
        "target_spouse_age": 68,
        "target_dependent_ages": [],
        "target_payroll_total": 5_000.0,
        "target_ss_total": 20_000.0,
    }
    summary = summarize_realized_clone_translation(
        toy_dataset,
        period=2024,
        augmentation_report={"clone_household_reports": [clone_report]},
        age_bucket_size=5,
    )
    assert summary["matched_clone_household_count"] == 1
    assert summary["aggregate_ss_pct_error"] == pytest.approx(0.0)
    assert summary["aggregate_payroll_pct_error"] == pytest.approx(0.0)
    assert summary["per_clone"][0]["realized_ages"] == [70, 68]
+
+
+def test_compose_role_donor_rows_falls_back_for_missing_dependents():
+ import pandas as pd
+
+ df = pd.DataFrame(_toy_support_dataframe())
+ enriched = df.copy()
+ enriched["__pe_payroll_uprating_factor"] = 2.0
+ enriched["__pe_ss_uprating_factor"] = 3.0
+
+ older_rows = enriched[enriched["person_tax_unit_id__2024"] == 201].copy()
+ worker_rows = enriched[enriched["person_tax_unit_id__2024"] == 301].copy()
+ candidate = SyntheticCandidate(
+ archetype="older_plus_prime_worker_family_role_donor",
+ head_age=80,
+ spouse_age=60,
+ dependent_ages=(12,),
+ head_wages=0.0,
+ spouse_wages=100_000.0,
+ head_ss=60_000.0,
+ spouse_ss=0.0,
+ pension_income=0.0,
+ dividend_income=0.0,
+ )
+ clone_df, _ = _compose_role_donor_rows_to_target(
+ older_rows,
+ worker_rows,
+ base_year=2024,
+ target_candidate=candidate,
+ ss_scale=3.0,
+ earnings_scale=2.0,
+ id_counters={
+ "household": 100,
+ "family": 200,
+ "tax_unit": 300,
+ "spm_unit": 400,
+ "marital_unit": 500,
+ "person": 600,
+ },
+ clone_weight_scale=0.1,
+ clone_weight_divisor=1,
+ )
+ assert clone_df is not None
+ assert sorted(clone_df["age__2024"].astype(int).tolist()) == [12, 60, 80]