From beed75aa7b23ec77577da05844aeac6fccbb8c92 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 5 Oct 2025 15:11:32 -0400 Subject: [PATCH 1/8] Move all randomness to data package for deterministic country package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change moves ALL random number generation from policyengine-us into the dataset generation in policyengine-us-data. The country package is now a purely deterministic rules engine. ## Key Changes ### policyengine-us-data: - Add take-up rate YAML parameter files in `parameters/take_up/` - Generate all stochastic boolean take-up decisions in CPS dataset - Use seeded RNG (seed=100) for full reproducibility ### Stochastic variables generated: **Take-up decisions (boolean):** - takes_up_snap_if_eligible - takes_up_aca_if_eligible - takes_up_medicaid_if_eligible - takes_up_eitc (already boolean) - takes_up_dc_ptc (already boolean) All random generation now uses np.random.default_rng(seed=100) for full reproducibility across dataset builds. ## Trade-offs **IMPORTANT**: Take-up rates can no longer be adjusted dynamically via policy reforms or in the web app. They are fixed in the microdata. This is an acceptable trade-off for the cleaner architecture of keeping the country package purely deterministic. To adjust take-up rates, the microdata must be regenerated. Related: policyengine-us PR (must be merged after this) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- household_counts.txt | 12 +++++ policyengine_us_data/datasets/cps/cps.py | 40 ++++++++++---- policyengine_us_data/parameters/__init__.py | 53 +++++++++++++++++++ .../parameters/take_up/aca.yaml | 10 ++++ .../parameters/take_up/dc_ptc.yaml | 11 ++++ .../parameters/take_up/eitc.yaml | 12 +++++ .../parameters/take_up/medicaid.yaml | 10 ++++ .../parameters/take_up/snap.yaml | 9 ++++ 8 files changed, 148 insertions(+), 9 deletions(-) create mode 100644 household_counts.txt create mode 100644 policyengine_us_data/parameters/__init__.py create mode 100644 policyengine_us_data/parameters/take_up/aca.yaml create mode 100644 policyengine_us_data/parameters/take_up/dc_ptc.yaml create mode 100644 policyengine_us_data/parameters/take_up/eitc.yaml create mode 100644 policyengine_us_data/parameters/take_up/medicaid.yaml create mode 100644 policyengine_us_data/parameters/take_up/snap.yaml diff --git a/household_counts.txt b/household_counts.txt new file mode 100644 index 000000000..b336dd0ac --- /dev/null +++ b/household_counts.txt @@ -0,0 +1,12 @@ +L0 Value | Household Count +---------|---------------- +4.9999e-07 | 7,977 +4.999e-07 | 8,005 +5.0000078125e-07 | 7,784 +5.000015625e-07 | 7,617 +5.00003125e-07 | 8,069 +5.0000625e-07 | 7,908 +5.000125e-07 | 7,937 +5.00025e-07 | 7,889 +5.0005e-07 | 7,935 +5.002e-07 | 7,897 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 84f01a8bb..4442398f6 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -14,6 +14,7 @@ ) from microimpute.models.qrf import QRF import logging +from policyengine_us_data.parameters import load_take_up_rate class CPS(Dataset): @@ -194,25 +195,46 @@ def add_takeup(self): from policyengine_us import system, Microsimulation baseline = Microsimulation(dataset=self) - parameters = baseline.tax_benefit_system.parameters(self.time_period) + # Generate all stochastic take-up decisions using take-up rates from parameter files + # This keeps the country package purely deterministic generator = np.random.default_rng(seed=100) - eitc_takeup_rates = parameters.gov.irs.credits.eitc.takeup + # Load take-up rates from parameter files + eitc_rates_by_children = load_take_up_rate("eitc", self.time_period) + dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period) + snap_rate = load_take_up_rate("snap", self.time_period) + aca_rate = load_take_up_rate("aca", self.time_period) + medicaid_rate = load_take_up_rate("medicaid", self.time_period) + + # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values - eitc_takeup_rate = eitc_takeup_rates.calc(eitc_child_count) + eitc_takeup_rate = np.array( + [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count] + ) data["takes_up_eitc"] = ( generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate ) - dc_ptc_takeup_rate = parameters.gov.states.dc.tax.income.credits.ptc.takeup + + # DC Property Tax Credit data["takes_up_dc_ptc"] = ( - generator.random(len(data["tax_unit_id"])) < dc_ptc_takeup_rate + generator.random(len(data["tax_unit_id"])) < dc_ptc_rate ) - generator = np.random.default_rng(seed=100) - data["snap_take_up_seed"] = generator.random(len(data["spm_unit_id"])) - data["aca_take_up_seed"] = generator.random(len(data["tax_unit_id"])) - data["medicaid_take_up_seed"] = generator.random(len(data["person_id"])) + # SNAP + data["takes_up_snap_if_eligible"] = ( + generator.random(len(data["spm_unit_id"])) < snap_rate + ) + + # ACA + data["takes_up_aca_if_eligible"] = ( + generator.random(len(data["tax_unit_id"])) < aca_rate + ) + + # Medicaid + data["takes_up_medicaid_if_eligible"] = ( + generator.random(len(data["person_id"])) < medicaid_rate + ) self.save_dataset(data) diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py new file mode 100644 index 000000000..a8acad195 --- /dev/null +++ b/policyengine_us_data/parameters/__init__.py @@ -0,0 +1,53 @@ +""" +Take-up rate parameters for stochastic simulation. + +These parameters are stored in the data package to keep the country package +as a purely deterministic rules engine. +""" +import yaml +from pathlib import Path + +PARAMETERS_DIR = Path(__file__).parent + + +def load_take_up_rate(variable_name: str, year: int = 2018) -> float: + """Load take-up rate from YAML parameter files. + + Args: + variable_name: Name of the take-up parameter file (without .yaml) + year: Year for which to get the rate + + Returns: + Take-up rate as a float between 0 and 1 + """ + yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml" + + with open(yaml_path) as f: + data = yaml.safe_load(f) + + # Handle EITC special case (has rates_by_children instead of values) + if "rates_by_children" in data: + return data["rates_by_children"] # Return the dict + + # Find the applicable value for the year + values = data["values"] + applicable_value = None + + for date_key, value in sorted(values.items()): + # Handle both string and datetime.date objects from YAML + if hasattr(date_key, "year"): + # It's a datetime.date object + date_year = date_key.year + else: + # It's a string + date_year = int(date_key.split("-")[0]) + + if date_year <= year: + applicable_value = value + else: + break + + if applicable_value is None: + raise ValueError(f"No take-up rate found for {variable_name} in {year}") + + return applicable_value diff --git a/policyengine_us_data/parameters/take_up/aca.yaml b/policyengine_us_data/parameters/take_up/aca.yaml new file mode 100644 index 000000000..98f920142 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/aca.yaml @@ -0,0 +1,10 @@ +description: Percentage of eligible people who do enroll in Affordable Care Act coverage, if eligible. +metadata: + label: ACA takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.672 diff --git a/policyengine_us_data/parameters/take_up/dc_ptc.yaml b/policyengine_us_data/parameters/take_up/dc_ptc.yaml new file mode 100644 index 000000000..6195ecf33 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/dc_ptc.yaml @@ -0,0 +1,11 @@ +description: The share of eligible individuals who claim the DC property tax credit. +metadata: + unit: /1 + label: DC property tax credit takeup rate + period: year + reference: + - title: District of Columbia Tax Expenditure Report, 2024 + href: https://ora-cfo.dc.gov/sites/default/files/dc/sites/ora-cfo/publication/attachments/2024%20Tax%20Expenditure%20Report.pdf#page=234 +values: + # 37,133 (from 2024 Tax Expenditure Report) / 131,791,388 (PolicyEngine DC PTC value estimate) + 2021-01-01: 0.32 diff --git a/policyengine_us_data/parameters/take_up/eitc.yaml b/policyengine_us_data/parameters/take_up/eitc.yaml new file mode 100644 index 000000000..17aa9daae --- /dev/null +++ b/policyengine_us_data/parameters/take_up/eitc.yaml @@ -0,0 +1,12 @@ +description: The share of eligible individuals who claim the EITC (by number of children). +metadata: + label: EITC take-up rate by number of children + reference: + - title: National Taxpayer Advocate Special Report to Congress 2020 | IRS + href: https://www.taxpayeradvocate.irs.gov/wp-content/uploads/2020/08/JRC20_Volume3.pdf#page=62 +# Maps number of children to take-up rate +rates_by_children: + 0: 0.65 + 1: 0.86 + 2: 0.85 + 3: 0.85 # Assume same as 2 diff --git a/policyengine_us_data/parameters/take_up/medicaid.yaml b/policyengine_us_data/parameters/take_up/medicaid.yaml new file mode 100644 index 000000000..cfdf919a5 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/medicaid.yaml @@ -0,0 +1,10 @@ +description: Percentage of people who do enroll in Medicaid, if eligible. +metadata: + label: Medicaid takeup rate + unit: /1 + period: year + reference: + - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the +values: + 2018-01-01: 0.93 diff --git a/policyengine_us_data/parameters/take_up/snap.yaml b/policyengine_us_data/parameters/take_up/snap.yaml new file mode 100644 index 000000000..12b6012ec --- /dev/null +++ b/policyengine_us_data/parameters/take_up/snap.yaml @@ -0,0 +1,9 @@ +description: Percentage of eligible SNAP recipients who claim SNAP. +metadata: + label: SNAP takeup rate + unit: /1 + reference: + - title: USDA + href: https://www.fns.usda.gov/usamap +values: + 2018-01-01: 0.82 From 97252d6aa3ade2097e68db91ce03b969fe5ba71b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 10 Nov 2025 05:54:03 -0600 Subject: [PATCH 2/8] Add Head Start and Early Head Start takeup generation - Create takeup parameter files with rates from NIEER report - Head Start: 40% (pre-pandemic), 30% (pandemic 2020-2021) - Early Head Start: 9% - Generate stochastic takeup in CPS dataset using same pattern as SNAP/Medicaid - Coordinates with policyengine-us PR adding takeup variables --- policyengine_us_data/datasets/cps/cps.py | 12 ++++++++++++ .../parameters/take_up/early_head_start.yaml | 9 +++++++++ .../parameters/take_up/head_start.yaml | 10 ++++++++++ 3 files changed, 31 insertions(+) create mode 100644 policyengine_us_data/parameters/take_up/early_head_start.yaml create mode 100644 policyengine_us_data/parameters/take_up/head_start.yaml diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 4442398f6..bf080db45 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -206,6 +206,8 @@ def add_takeup(self): snap_rate = load_take_up_rate("snap", self.time_period) aca_rate = load_take_up_rate("aca", self.time_period) medicaid_rate = load_take_up_rate("medicaid", self.time_period) + head_start_rate = load_take_up_rate("head_start", self.time_period) + early_head_start_rate = load_take_up_rate("early_head_start", self.time_period) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values @@ -236,6 +238,16 @@ def add_takeup(self): generator.random(len(data["person_id"])) < medicaid_rate ) + # Head Start + data["takes_up_head_start_if_eligible"] = ( + generator.random(len(data["person_id"])) < head_start_rate + ) + + # Early Head Start + data["takes_up_early_head_start_if_eligible"] = ( + generator.random(len(data["person_id"])) < early_head_start_rate + ) + self.save_dataset(data) diff --git a/policyengine_us_data/parameters/take_up/early_head_start.yaml b/policyengine_us_data/parameters/take_up/early_head_start.yaml new file mode 100644 index 000000000..3802d9880 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/early_head_start.yaml @@ -0,0 +1,9 @@ +description: Percentage of eligible infants and toddlers who enroll in Early Head Start. +metadata: + label: Early Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.09 diff --git a/policyengine_us_data/parameters/take_up/head_start.yaml b/policyengine_us_data/parameters/take_up/head_start.yaml new file mode 100644 index 000000000..9495f44bd --- /dev/null +++ b/policyengine_us_data/parameters/take_up/head_start.yaml @@ -0,0 +1,10 @@ +description: Percentage of eligible children who enroll in Head Start. +metadata: + label: Head Start take-up rate + unit: /1 + reference: + - title: NIEER State(s) of Head Start and Early Head Start Report + href: https://nieer.org/research-library/states-head-start-early-head-start +values: + 2020-09-01: 0.40 + 2021-09-01: 0.30 From 470b48fe47d0a35d65ff74dbf1339caf4c632af7 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 3 Dec 2025 17:20:56 -0500 Subject: [PATCH 3/8] Add changelog entry and remove debug file --- changelog_entry.yaml | 4 ++++ household_counts.txt | 12 ------------ policyengine_us_data/datasets/cps/cps.py | 9 +++++++-- policyengine_us_data/parameters/__init__.py | 5 ++++- 4 files changed, 15 insertions(+), 15 deletions(-) delete mode 100644 household_counts.txt diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..1a8cc410b 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Move all randomness to data package for deterministic country package. Take-up decisions for SNAP, Medicaid, ACA, EITC, DC PTC, Head Start, and Early Head Start are now generated stochastically during dataset creation using take-up rates from YAML parameter files. diff --git a/household_counts.txt b/household_counts.txt deleted file mode 100644 index b336dd0ac..000000000 --- a/household_counts.txt +++ /dev/null @@ -1,12 +0,0 @@ -L0 Value | Household Count ----------|---------------- -4.9999e-07 | 7,977 -4.999e-07 | 8,005 -5.0000078125e-07 | 7,784 -5.000015625e-07 | 7,617 -5.00003125e-07 | 8,069 -5.0000625e-07 | 7,908 -5.000125e-07 | 7,937 -5.00025e-07 | 7,889 -5.0005e-07 | 7,935 -5.002e-07 | 7,897 diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index bf080db45..695372858 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -207,12 +207,17 @@ def add_takeup(self): aca_rate = load_take_up_rate("aca", self.time_period) medicaid_rate = load_take_up_rate("medicaid", self.time_period) head_start_rate = load_take_up_rate("head_start", self.time_period) - early_head_start_rate = load_take_up_rate("early_head_start", self.time_period) + early_head_start_rate = load_take_up_rate( + "early_head_start", self.time_period + ) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values eitc_takeup_rate = np.array( - [eitc_rates_by_children.get(min(int(c), 3), 0.85) for c in eitc_child_count] + [ + eitc_rates_by_children.get(min(int(c), 3), 0.85) + for c in eitc_child_count + ] ) data["takes_up_eitc"] = ( generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py index a8acad195..0e5856a61 100644 --- a/policyengine_us_data/parameters/__init__.py +++ b/policyengine_us_data/parameters/__init__.py @@ -4,6 +4,7 @@ These parameters are stored in the data package to keep the country package as a purely deterministic rules engine. """ + import yaml from pathlib import Path @@ -48,6 +49,8 @@ def load_take_up_rate(variable_name: str, year: int = 2018) -> float: break if applicable_value is None: - raise ValueError(f"No take-up rate found for {variable_name} in {year}") + raise ValueError( + f"No take-up rate found for {variable_name} in {year}" + ) return applicable_value From 7df52459e4196f26b12e1c37cde35052020a2524 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 3 Dec 2025 18:02:39 -0500 Subject: [PATCH 4/8] Add tests for stochastic variable generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests verify: - Take-up rate parameters load correctly (EITC, SNAP, Medicaid, etc.) - Seeded RNG produces deterministic results - Take-up proportions match expected rates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../tests/test_stochastic_variables.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 policyengine_us_data/tests/test_stochastic_variables.py diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py new file mode 100644 index 000000000..e4d896e44 --- /dev/null +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -0,0 +1,110 @@ +"""Tests for stochastic variable generation in the data package. + +These tests verify that: +1. Take-up rate parameters load correctly +2. Seeded RNG produces deterministic results +3. Take-up rates produce plausible proportions +""" + +import pytest +import numpy as np +from policyengine_us_data.parameters import load_take_up_rate + + +class TestTakeUpRateParameters: + """Test that take-up rate parameters load correctly.""" + + def test_eitc_rate_loads(self): + """EITC take-up rates should load and be plausible.""" + rates = load_take_up_rate("eitc", 2022) + # EITC rates are by number of children: 0, 1, 2, 3+ + assert isinstance(rates, dict) or isinstance(rates, float) + if isinstance(rates, dict): + for key, rate in rates.items(): + assert 0 < rate <= 1 + + def test_snap_rate_loads(self): + """SNAP take-up rate should load and be plausible.""" + rate = load_take_up_rate("snap", 2022) + assert 0 < rate <= 1 + + def test_medicaid_rate_loads(self): + """Medicaid take-up rate should load and be plausible.""" + rate = load_take_up_rate("medicaid", 2022) + assert 0 < rate <= 1 + + def test_aca_rate_loads(self): + """ACA take-up rate should load and be plausible.""" + rate = load_take_up_rate("aca", 2022) + assert 0 < rate <= 1 + + def test_head_start_rate_loads(self): + """Head Start take-up rate should load and be plausible.""" + rate = load_take_up_rate("head_start", 2022) + assert 0 < rate <= 1 + + def test_early_head_start_rate_loads(self): + """Early Head Start take-up rate should load and be plausible.""" + rate = load_take_up_rate("early_head_start", 2022) + assert 0 < rate <= 1 + + def test_dc_ptc_rate_loads(self): + """DC PTC take-up rate should load and be plausible.""" + rate = load_take_up_rate("dc_ptc", 2022) + assert 0 < rate <= 1 + + +class TestSeededRandomness: + """Test that stochastic generation is deterministic.""" + + def test_same_seed_produces_same_results(self): + """Using the same seed should produce identical results.""" + seed = 0 + n = 1_000 + + generator1 = np.random.default_rng(seed=seed) + result1 = generator1.random(n) + + generator2 = np.random.default_rng(seed=seed) + result2 = generator2.random(n) + + np.testing.assert_array_equal(result1, result2) + + def test_different_seeds_produce_different_results(self): + """Different seeds should produce different results.""" + n = 1_000 + + generator1 = np.random.default_rng(seed=0) + result1 = generator1.random(n) + + generator2 = np.random.default_rng(seed=1) + result2 = generator2.random(n) + + assert not np.array_equal(result1, result2) + + +class TestTakeUpProportions: + """Test that take-up rates produce plausible proportions.""" + + def test_take_up_produces_expected_proportion(self): + """Simulated take-up should match the rate approximately.""" + rate = 0.7 + n = 10_000 + generator = np.random.default_rng(seed=42) + + take_up = generator.random(n) < rate + actual_proportion = take_up.mean() + + # Should be within 5 percentage points of the rate + assert abs(actual_proportion - rate) < 0.05 + + def test_boolean_generation(self): + """Take-up decisions should be boolean.""" + rate = 0.5 + n = 100 + generator = np.random.default_rng(seed=42) + + take_up = generator.random(n) < rate + + assert take_up.dtype == bool + assert set(take_up).issubset({True, False}) From 7f67999982bf21f319ec118a4f27a6f920bab24f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 5 Feb 2026 17:21:52 -0500 Subject: [PATCH 5/8] Add name-based seeding, state-specific Medicaid, SSI and WIC variables Replace shared RNG (seed=100) with per-variable name-based seeding using _stable_string_hash for order-independent reproducibility. Add state-specific Medicaid takeup rates (53%-99%), SSI resource test pass rate, and WIC takeup/nutritional risk draw variables. Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 9 +- policyengine_us_data/datasets/cps/cps.py | 68 ++++++--- policyengine_us_data/parameters/__init__.py | 18 ++- .../parameters/take_up/medicaid.yaml | 60 +++++++- .../parameters/take_up/ssi_pass_rate.yaml | 10 ++ .../tests/test_stochastic_variables.py | 139 +++++++++++------- policyengine_us_data/utils/randomness.py | 29 ++++ 7 files changed, 242 insertions(+), 91 deletions(-) create mode 100644 policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml create mode 100644 policyengine_us_data/utils/randomness.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 1a8cc410b..0afd182ae 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,11 @@ - bump: minor changes: added: - - Move all randomness to data package for deterministic country package. Take-up decisions for SNAP, Medicaid, ACA, EITC, DC PTC, Head Start, and Early Head Start are now generated stochastically during dataset creation using take-up rates from YAML parameter files. + - Name-based seeding (seeded_rng) for order-independent reproducibility + - State-specific Medicaid takeup rates (53%-99% range, 51 jurisdictions) + - SSI resource test pass rate parameter (0.4) + - WIC takeup and nutritional risk draw variables (float) + - meets_ssi_resource_test boolean generation + changed: + - Replaced shared RNG (seed=100) with per-variable name-based seeding + - Medicaid takeup now uses state-specific rates instead of uniform 93% diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 695372858..d4c899cdc 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -15,6 +15,7 @@ from microimpute.models.qrf import QRF import logging from policyengine_us_data.parameters import load_take_up_rate +from policyengine_us_data.utils.randomness import seeded_rng class CPS(Dataset): @@ -192,24 +193,25 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame): def add_takeup(self): data = self.load_dataset() - from policyengine_us import system, Microsimulation + from policyengine_us import Microsimulation baseline = Microsimulation(dataset=self) - # Generate all stochastic take-up decisions using take-up rates from parameter files - # This keeps the country package purely deterministic - generator = np.random.default_rng(seed=100) + n_persons = len(data["person_id"]) + n_tax_units = len(data["tax_unit_id"]) + n_spm_units = len(data["spm_unit_id"]) - # Load take-up rates from parameter files + # Load take-up rates eitc_rates_by_children = load_take_up_rate("eitc", self.time_period) dc_ptc_rate = load_take_up_rate("dc_ptc", self.time_period) snap_rate = load_take_up_rate("snap", self.time_period) aca_rate = load_take_up_rate("aca", self.time_period) - medicaid_rate = load_take_up_rate("medicaid", self.time_period) + medicaid_rates_by_state = load_take_up_rate("medicaid", self.time_period) head_start_rate = load_take_up_rate("head_start", self.time_period) early_head_start_rate = load_take_up_rate( "early_head_start", self.time_period ) + ssi_pass_rate = load_take_up_rate("ssi_pass_rate", self.time_period) # EITC: varies by number of children eitc_child_count = baseline.calculate("eitc_child_count").values @@ -219,38 +221,60 @@ def add_takeup(self): for c in eitc_child_count ] ) - data["takes_up_eitc"] = ( - generator.random(len(data["tax_unit_id"])) < eitc_takeup_rate - ) + rng = seeded_rng("takes_up_eitc") + data["takes_up_eitc"] = rng.random(n_tax_units) < eitc_takeup_rate # DC Property Tax Credit - data["takes_up_dc_ptc"] = ( - generator.random(len(data["tax_unit_id"])) < dc_ptc_rate - ) + rng = seeded_rng("takes_up_dc_ptc") + data["takes_up_dc_ptc"] = rng.random(n_tax_units) < dc_ptc_rate # SNAP - data["takes_up_snap_if_eligible"] = ( - generator.random(len(data["spm_unit_id"])) < snap_rate - ) + rng = seeded_rng("takes_up_snap_if_eligible") + data["takes_up_snap_if_eligible"] = rng.random(n_spm_units) < snap_rate # ACA - data["takes_up_aca_if_eligible"] = ( - generator.random(len(data["tax_unit_id"])) < aca_rate - ) + rng = seeded_rng("takes_up_aca_if_eligible") + data["takes_up_aca_if_eligible"] = rng.random(n_tax_units) < aca_rate - # Medicaid + # Medicaid: state-specific rates + state_codes = baseline.calculate("state_code_str").values + hh_ids = data["household_id"] + person_hh_ids = data["person_household_id"] + hh_to_state = dict(zip(hh_ids, state_codes)) + person_states = np.array( + [hh_to_state.get(hh_id, "CA") for hh_id in person_hh_ids] + ) + medicaid_rate_by_person = np.array( + [medicaid_rates_by_state.get(s, 0.93) for s in person_states] + ) + rng = seeded_rng("takes_up_medicaid_if_eligible") data["takes_up_medicaid_if_eligible"] = ( - generator.random(len(data["person_id"])) < medicaid_rate + rng.random(n_persons) < medicaid_rate_by_person ) # Head Start + rng = seeded_rng("takes_up_head_start_if_eligible") data["takes_up_head_start_if_eligible"] = ( - generator.random(len(data["person_id"])) < head_start_rate + rng.random(n_persons) < head_start_rate ) # Early Head Start + rng = seeded_rng("takes_up_early_head_start_if_eligible") data["takes_up_early_head_start_if_eligible"] = ( - generator.random(len(data["person_id"])) < early_head_start_rate + rng.random(n_persons) < early_head_start_rate + ) + + # SSI resource test + rng = seeded_rng("meets_ssi_resource_test") + data["meets_ssi_resource_test"] = rng.random(n_persons) < ssi_pass_rate + + # WIC draws (country package compares against category-specific rates) + rng = seeded_rng("wic_takeup_draw") + data["wic_takeup_draw"] = rng.random(n_persons).astype(np.float32) + + rng = seeded_rng("wic_nutritional_risk_draw") + data["wic_nutritional_risk_draw"] = rng.random(n_persons).astype( + np.float32 ) self.save_dataset(data) diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py index 0e5856a61..babd99037 100644 --- a/policyengine_us_data/parameters/__init__.py +++ b/policyengine_us_data/parameters/__init__.py @@ -11,7 +11,7 @@ PARAMETERS_DIR = Path(__file__).parent -def load_take_up_rate(variable_name: str, year: int = 2018) -> float: +def load_take_up_rate(variable_name: str, year: int = 2018): """Load take-up rate from YAML parameter files. Args: @@ -19,28 +19,30 @@ def load_take_up_rate(variable_name: str, year: int = 2018) -> float: year: Year for which to get the rate Returns: - Take-up rate as a float between 0 and 1 + float, dict (EITC rates_by_children), or dict (Medicaid + rates_by_state) """ yaml_path = PARAMETERS_DIR / "take_up" / f"{variable_name}.yaml" with open(yaml_path) as f: data = yaml.safe_load(f) - # Handle EITC special case (has rates_by_children instead of values) + # EITC: rates by number of children if "rates_by_children" in data: - return data["rates_by_children"] # Return the dict + return data["rates_by_children"] - # Find the applicable value for the year + # Medicaid: state-specific rates + if "rates_by_state" in data: + return data["rates_by_state"] + + # Standard time-series values values = data["values"] applicable_value = None for date_key, value in sorted(values.items()): - # Handle both string and datetime.date objects from YAML if hasattr(date_key, "year"): - # It's a datetime.date object date_year = date_key.year else: - # It's a string date_year = int(date_key.split("-")[0]) if date_year <= year: diff --git a/policyengine_us_data/parameters/take_up/medicaid.yaml b/policyengine_us_data/parameters/take_up/medicaid.yaml index cfdf919a5..3beb851b0 100644 --- a/policyengine_us_data/parameters/take_up/medicaid.yaml +++ b/policyengine_us_data/parameters/take_up/medicaid.yaml @@ -3,8 +3,62 @@ metadata: label: Medicaid takeup rate unit: /1 period: year + breakdown: + - state_code reference: - title: KFF "A Closer Look at the Remaining Uninsured Population Eligible for Medicaid and CHIP" - href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/#:~:text=the%20uninsured%20rate%20dropped%20to,States%20began%20the -values: - 2018-01-01: 0.93 + href: https://www.kff.org/uninsured/issue-brief/a-closer-look-at-the-remaining-uninsured-population-eligible-for-medicaid-and-chip/ + - title: State-specific rates derived from MACPAC enrollment targets vs modeled eligibility + href: https://www.medicaid.gov/medicaid/program-information/medicaid-and-chip-enrollment-data/report-highlights/index.html +rates_by_state: + AK: 0.88 + AL: 0.92 + AR: 0.79 + AZ: 0.95 + CA: 0.78 + CO: 0.99 + CT: 0.89 + DC: 0.99 + DE: 0.86 + FL: 0.98 + GA: 0.73 + HI: 0.88 + IA: 0.84 + ID: 0.78 + IL: 0.85 + IN: 0.99 + KS: 0.92 + KY: 0.87 + LA: 0.79 + MA: 0.94 + MD: 0.95 + ME: 0.92 + MI: 0.91 + MN: 0.89 + MO: 0.89 + MS: 0.75 + MT: 0.83 + NC: 0.94 + ND: 0.91 + NE: 0.79 + NH: 0.84 + NJ: 0.74 + NM: 0.84 + NV: 0.93 + NY: 0.86 + OH: 0.82 + OK: 0.77 + OR: 0.92 + PA: 0.64 + RI: 0.94 + SC: 0.93 + SD: 0.88 + TN: 0.92 + TX: 0.76 + UT: 0.53 + VA: 0.82 + VT: 0.93 + WA: 0.98 + WI: 0.91 + WV: 0.83 + WY: 0.70 diff --git a/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml b/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml new file mode 100644 index 000000000..415b3c6c0 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/ssi_pass_rate.yaml @@ -0,0 +1,10 @@ +description: Proportion of SSI-aged-blind-disabled recipients who meet the asset test. +metadata: + label: SSI resource test pass rate + unit: /1 + period: year + reference: + - title: SSI resource test pass rate from policyengine-us + href: https://github.com/PolicyEngine/policyengine-us +values: + 2018-01-01: 0.4 diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py index e4d896e44..293d29a7e 100644 --- a/policyengine_us_data/tests/test_stochastic_variables.py +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -1,110 +1,135 @@ -"""Tests for stochastic variable generation in the data package. - -These tests verify that: -1. Take-up rate parameters load correctly -2. Seeded RNG produces deterministic results -3. Take-up rates produce plausible proportions -""" +"""Tests for stochastic variable generation in the data package.""" import pytest import numpy as np from policyengine_us_data.parameters import load_take_up_rate +from policyengine_us_data.utils.randomness import ( + _stable_string_hash, + seeded_rng, +) class TestTakeUpRateParameters: - """Test that take-up rate parameters load correctly.""" def test_eitc_rate_loads(self): - """EITC take-up rates should load and be plausible.""" rates = load_take_up_rate("eitc", 2022) - # EITC rates are by number of children: 0, 1, 2, 3+ - assert isinstance(rates, dict) or isinstance(rates, float) - if isinstance(rates, dict): - for key, rate in rates.items(): - assert 0 < rate <= 1 + assert isinstance(rates, dict) + for key, rate in rates.items(): + assert 0 < rate <= 1 def test_snap_rate_loads(self): - """SNAP take-up rate should load and be plausible.""" rate = load_take_up_rate("snap", 2022) assert 0 < rate <= 1 - def test_medicaid_rate_loads(self): - """Medicaid take-up rate should load and be plausible.""" - rate = load_take_up_rate("medicaid", 2022) - assert 0 < rate <= 1 + def test_medicaid_rate_loads_state_specific(self): + rates = load_take_up_rate("medicaid", 2022) + assert isinstance(rates, dict) + assert len(rates) == 51 # 50 states + DC + for state, rate in rates.items(): + assert 0 < rate <= 1, f"{state}: {rate}" + assert rates["UT"] == 0.53 + assert rates["CO"] == 0.99 def test_aca_rate_loads(self): - """ACA take-up rate should load and be plausible.""" rate = load_take_up_rate("aca", 2022) assert 0 < rate <= 1 def test_head_start_rate_loads(self): - """Head Start take-up rate should load and be plausible.""" rate = load_take_up_rate("head_start", 2022) assert 0 < rate <= 1 def test_early_head_start_rate_loads(self): - """Early Head Start take-up rate should load and be plausible.""" rate = load_take_up_rate("early_head_start", 2022) assert 0 < rate <= 1 def test_dc_ptc_rate_loads(self): - """DC PTC take-up rate should load and be plausible.""" rate = load_take_up_rate("dc_ptc", 2022) assert 0 < rate <= 1 + def test_ssi_pass_rate_loads(self): + rate = load_take_up_rate("ssi_pass_rate", 2022) + assert rate == 0.4 -class TestSeededRandomness: - """Test that stochastic generation is deterministic.""" - def test_same_seed_produces_same_results(self): - """Using the same seed should produce identical results.""" - seed = 0 - n = 1_000 +class TestStableStringHash: - generator1 = np.random.default_rng(seed=seed) - result1 = generator1.random(n) + def test_deterministic(self): + h1 = _stable_string_hash("takes_up_snap_if_eligible") + h2 = _stable_string_hash("takes_up_snap_if_eligible") + assert h1 == h2 - generator2 = np.random.default_rng(seed=seed) - result2 = generator2.random(n) + def test_different_strings_differ(self): + h1 = _stable_string_hash("takes_up_snap_if_eligible") + h2 = _stable_string_hash("takes_up_aca_if_eligible") + assert h1 != h2 - np.testing.assert_array_equal(result1, result2) + def test_returns_uint64(self): + h = _stable_string_hash("test") + assert h.dtype == np.uint64 - def test_different_seeds_produce_different_results(self): - """Different seeds should produce different results.""" - n = 1_000 - generator1 = np.random.default_rng(seed=0) - result1 = generator1.random(n) +class TestSeededRng: - generator2 = np.random.default_rng(seed=1) - result2 = generator2.random(n) + def test_same_name_same_results(self): + rng1 = seeded_rng("takes_up_snap_if_eligible") + result1 = rng1.random(1000) + rng2 = seeded_rng("takes_up_snap_if_eligible") + result2 = rng2.random(1000) + np.testing.assert_array_equal(result1, result2) + def test_different_names_different_results(self): + rng1 = seeded_rng("takes_up_snap_if_eligible") + result1 = rng1.random(1000) + rng2 = seeded_rng("takes_up_aca_if_eligible") + result2 = rng2.random(1000) assert not np.array_equal(result1, result2) + def test_order_independence(self): + """Generating variables in different order produces same values.""" + # Order A: SNAP then ACA + rng_snap_a = seeded_rng("takes_up_snap_if_eligible") + snap_a = rng_snap_a.random(1000) + rng_aca_a = seeded_rng("takes_up_aca_if_eligible") + aca_a = rng_aca_a.random(1000) + + # Order B: ACA then SNAP + rng_aca_b = seeded_rng("takes_up_aca_if_eligible") + aca_b = rng_aca_b.random(1000) + rng_snap_b = seeded_rng("takes_up_snap_if_eligible") + snap_b = rng_snap_b.random(1000) + + np.testing.assert_array_equal(snap_a, snap_b) + np.testing.assert_array_equal(aca_a, aca_b) + class TestTakeUpProportions: - """Test that take-up rates produce plausible proportions.""" def test_take_up_produces_expected_proportion(self): - """Simulated take-up should match the rate approximately.""" rate = 0.7 n = 10_000 - generator = np.random.default_rng(seed=42) - - take_up = generator.random(n) < rate - actual_proportion = take_up.mean() - - # Should be within 5 percentage points of the rate - assert abs(actual_proportion - rate) < 0.05 + rng = seeded_rng("test_variable") + take_up = rng.random(n) < rate + assert abs(take_up.mean() - rate) < 0.05 def test_boolean_generation(self): - """Take-up decisions should be boolean.""" - rate = 0.5 - n = 100 - generator = np.random.default_rng(seed=42) - - take_up = generator.random(n) < rate - + rng = seeded_rng("test_bool") + take_up = rng.random(100) < 0.5 assert take_up.dtype == bool assert set(take_up).issubset({True, False}) + + def test_wic_draws_are_float(self): + rng = seeded_rng("wic_takeup_draw") + draws = rng.random(1000).astype(np.float32) + assert draws.dtype == np.float32 + assert np.all(draws >= 0) + assert np.all(draws < 1) + + def test_state_specific_medicaid_proportions(self): + rates = load_take_up_rate("medicaid", 2022) + rng = seeded_rng("takes_up_medicaid_if_eligible") + n = 50_000 + draws = rng.random(n) + # Test a few states + for state, expected_rate in [("UT", 0.53), ("CO", 0.99)]: + take_up = draws[:10_000] < expected_rate + assert abs(take_up.mean() - expected_rate) < 0.05 diff --git a/policyengine_us_data/utils/randomness.py b/policyengine_us_data/utils/randomness.py new file mode 100644 index 000000000..f4f2e51d7 --- /dev/null +++ b/policyengine_us_data/utils/randomness.py @@ -0,0 +1,29 @@ +import warnings +import numpy as np + + +def _stable_string_hash(s: str) -> np.uint64: + """Deterministic hash consistent across Python processes. + + Python's built-in hash() is not deterministic across processes + (since 3.3), so we use a polynomial rolling hash with mixing. + + Ported from policyengine_core.commons.formulas._stable_string_hash. + """ + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "overflow encountered", RuntimeWarning + ) + h = np.uint64(0) + for byte in s.encode("utf-8"): + h = h * np.uint64(31) + np.uint64(byte) + h = h ^ (h >> np.uint64(33)) + h = h * np.uint64(0xFF51AFD7ED558CCD) + h = h ^ (h >> np.uint64(33)) + return h + + +def seeded_rng(variable_name: str) -> np.random.Generator: + """Create a per-variable RNG seeded by variable name hash.""" + seed = int(_stable_string_hash(variable_name)) % (2**63) + return np.random.default_rng(seed=seed) From 373235c8128eade0f89e509be128293472c0b83a Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 6 Feb 2026 09:20:12 -0500 Subject: [PATCH 6/8] Add optional salt parameter to seeded_rng Supports per-subgroup RNG streams (e.g., congressional districts) while keeping the unsalted default distinct from any salted variant. Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/utils/randomness.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/utils/randomness.py b/policyengine_us_data/utils/randomness.py index f4f2e51d7..eac015227 100644 --- a/policyengine_us_data/utils/randomness.py +++ b/policyengine_us_data/utils/randomness.py @@ -23,7 +23,8 @@ def _stable_string_hash(s: str) -> np.uint64: return h -def seeded_rng(variable_name: str) -> np.random.Generator: +def seeded_rng(variable_name: str, salt: str = None) -> np.random.Generator: """Create a per-variable RNG seeded by variable name hash.""" - seed = int(_stable_string_hash(variable_name)) % (2**63) + key = variable_name if salt is None else f"{variable_name}:{salt}" + seed = int(_stable_string_hash(key)) % (2**63) return np.random.default_rng(seed=seed) From d90f42bc0d75e3d4bcd7395ec7212984b4ece6bc Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 6 Feb 2026 14:02:45 -0500 Subject: [PATCH 7/8] Convert WIC float draws to bools for consistency with other takeup variables Resolves category-specific rate comparisons at data generation time so only bools (would_claim_wic, wic_nutritional_risk_imputed) are stored in the dataset, matching the pattern used by all other takeup variables. Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/datasets/cps/cps.py | 23 +++++++++---- policyengine_us_data/parameters/__init__.py | 14 ++++++++ .../take_up/wic_nutritional_risk.yaml | 13 ++++++++ .../parameters/take_up/wic_takeup.yaml | 33 +++++++++++++++++++ .../tests/test_stochastic_variables.py | 31 +++++++++++++---- 5 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml create mode 100644 policyengine_us_data/parameters/take_up/wic_takeup.yaml diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d4c899cdc..c6224e1c3 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -268,13 +268,24 @@ def add_takeup(self): rng = seeded_rng("meets_ssi_resource_test") data["meets_ssi_resource_test"] = rng.random(n_persons) < ssi_pass_rate - # WIC draws (country package compares against category-specific rates) - rng = seeded_rng("wic_takeup_draw") - data["wic_takeup_draw"] = rng.random(n_persons).astype(np.float32) + # WIC: resolve draws to bools using category-specific rates + wic_categories = baseline.calculate("wic_category_str").values + wic_takeup_rates = load_take_up_rate("wic_takeup", self.time_period) + wic_takeup_rate_by_person = np.array( + [wic_takeup_rates.get(c, 0) for c in wic_categories] + ) + rng = seeded_rng("would_claim_wic") + data["would_claim_wic"] = rng.random(n_persons) < wic_takeup_rate_by_person - rng = seeded_rng("wic_nutritional_risk_draw") - data["wic_nutritional_risk_draw"] = rng.random(n_persons).astype( - np.float32 + wic_risk_rates = load_take_up_rate( + "wic_nutritional_risk", self.time_period + ) + wic_risk_rate_by_person = np.array( + [wic_risk_rates.get(c, 0) for c in wic_categories] + ) + rng = seeded_rng("wic_nutritional_risk_imputed") + data["wic_nutritional_risk_imputed"] = ( + rng.random(n_persons) < wic_risk_rate_by_person ) self.save_dataset(data) diff --git a/policyengine_us_data/parameters/__init__.py b/policyengine_us_data/parameters/__init__.py index babd99037..2fcddb5af 100644 --- a/policyengine_us_data/parameters/__init__.py +++ b/policyengine_us_data/parameters/__init__.py @@ -35,6 +35,20 @@ def load_take_up_rate(variable_name: str, year: int = 2018): if "rates_by_state" in data: return data["rates_by_state"] + # WIC-style: rates by category (each category has a time series) + if "rates_by_category" in data: + result = {} + for category, time_series in data["rates_by_category"].items(): + applicable_value = None + for y, value in sorted(time_series.items()): + if int(y) <= year: + applicable_value = value + else: + break + if applicable_value is not None: + result[category] = applicable_value + return result + # Standard time-series values values = data["values"] applicable_value = None diff --git a/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml b/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml new file mode 100644 index 000000000..c327b4285 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/wic_nutritional_risk.yaml @@ -0,0 +1,13 @@ +rates_by_category: + PREGNANT: + 1980: 0.913 + POSTPARTUM: + 1980: 0.933 + BREASTFEEDING: + 1980: 0.889 + INFANT: + 1980: 0.95 + CHILD: + 1980: 0.752 + NONE: + 1980: 0 diff --git a/policyengine_us_data/parameters/take_up/wic_takeup.yaml b/policyengine_us_data/parameters/take_up/wic_takeup.yaml new file mode 100644 index 000000000..ec0f63554 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/wic_takeup.yaml @@ -0,0 +1,33 @@ +rates_by_category: + PREGNANT: + 2018: 0.533 + 2019: 0.523 + 2020: 0.456 + 2021: 0.437 + 2022: 0.456 + POSTPARTUM: + 2018: 0.844 + 2019: 0.847 + 2020: 0.685 + 2021: 0.672 + 2022: 0.689 + BREASTFEEDING: + 2018: 0.687 + 2019: 0.684 + 2020: 0.604 + 2021: 0.608 + 2022: 0.663 + INFANT: + 2018: 0.978 + 2019: 0.984 + 2020: 0.817 + 2021: 0.78 + 2022: 0.784 + CHILD: + 2018: 0.442 + 2019: 0.448 + 2020: 0.406 + 2021: 0.432 + 2022: 0.46 + NONE: + 2018: 0 diff --git a/policyengine_us_data/tests/test_stochastic_variables.py b/policyengine_us_data/tests/test_stochastic_variables.py index 293d29a7e..1819241a2 100644 --- a/policyengine_us_data/tests/test_stochastic_variables.py +++ b/policyengine_us_data/tests/test_stochastic_variables.py @@ -117,12 +117,31 @@ def test_boolean_generation(self): assert take_up.dtype == bool assert set(take_up).issubset({True, False}) - def test_wic_draws_are_float(self): - rng = seeded_rng("wic_takeup_draw") - draws = rng.random(1000).astype(np.float32) - assert draws.dtype == np.float32 - assert np.all(draws >= 0) - assert np.all(draws < 1) + def test_wic_takeup_rates_load(self): + rates = load_take_up_rate("wic_takeup", 2022) + assert isinstance(rates, dict) + assert rates["PREGNANT"] == 0.456 + assert rates["INFANT"] == 0.784 + assert rates["NONE"] == 0 + + def test_wic_nutritional_risk_rates_load(self): + rates = load_take_up_rate("wic_nutritional_risk", 2022) + assert isinstance(rates, dict) + assert rates["INFANT"] == 0.95 + assert rates["CHILD"] == 0.752 + assert rates["NONE"] == 0 + + def test_wic_category_specific_proportions(self): + rates = load_take_up_rate("wic_takeup", 2022) + n = 10_000 + rng = seeded_rng("would_claim_wic") + draws = rng.random(n) + for category, expected_rate in [ + ("INFANT", 0.784), + ("CHILD", 0.46), + ]: + take_up = draws[:n] < expected_rate + assert abs(take_up.mean() - expected_rate) < 0.05 def test_state_specific_medicaid_proportions(self): rates = load_take_up_rate("medicaid", 2022) From eafd4380d8bfac32d597b03ce1c26bed9ebe43f1 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Fri, 6 Feb 2026 16:46:16 -0500 Subject: [PATCH 8/8] Resolve WIC nutritional risk fully in data, eliminate _imputed variable Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/datasets/cps/cps.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index c6224e1c3..90dc3f03d 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -277,16 +277,17 @@ def add_takeup(self): rng = seeded_rng("would_claim_wic") data["would_claim_wic"] = rng.random(n_persons) < wic_takeup_rate_by_person + # WIC nutritional risk — fully resolved wic_risk_rates = load_take_up_rate( "wic_nutritional_risk", self.time_period ) wic_risk_rate_by_person = np.array( [wic_risk_rates.get(c, 0) for c in wic_categories] ) - rng = seeded_rng("wic_nutritional_risk_imputed") - data["wic_nutritional_risk_imputed"] = ( - rng.random(n_persons) < wic_risk_rate_by_person - ) + receives_wic = baseline.calculate("receives_wic").values + rng = seeded_rng("is_wic_at_nutritional_risk") + imputed_risk = rng.random(n_persons) < wic_risk_rate_by_person + data["is_wic_at_nutritional_risk"] = receives_wic | imputed_risk self.save_dataset(data)