From 56c8b82eadab3ae2bc35ed8d3d3804860e9f9fba Mon Sep 17 00:00:00 2001
From: Anthony Volk
Date: Thu, 21 May 2026 19:52:55 +0200
Subject: [PATCH] Fix subsampling with formula-backed IDs

---
 changelog.d/subsample-structural-ids.fixed.md |  1 +
 policyengine_core/simulations/simulation.py   |  5 ++-
 .../core/test_subsample_invalidates_cache.py  | 42 +++++++++++++++++++
 3 files changed, 46 insertions(+), 2 deletions(-)
 create mode 100644 changelog.d/subsample-structural-ids.fixed.md

diff --git a/changelog.d/subsample-structural-ids.fixed.md b/changelog.d/subsample-structural-ids.fixed.md
new file mode 100644
index 00000000..dc24b531
--- /dev/null
+++ b/changelog.d/subsample-structural-ids.fixed.md
@@ -0,0 +1 @@
+Preserve computed structural dataset variables when subsampling simulations.
diff --git a/policyengine_core/simulations/simulation.py b/policyengine_core/simulations/simulation.py
index 92e54eda..c655106c 100644
--- a/policyengine_core/simulations/simulation.py
+++ b/policyengine_core/simulations/simulation.py
@@ -1753,8 +1753,9 @@ def subsample(
         if time_period is None:
             time_period = self.default_calculation_period
 
-        # Convert simulation inputs to DataFrame
-        df = self.to_input_dataframe()
+        # Subsampling rebuilds the complete dataset, so preserve computed
+        # structural variables such as formula-backed IDs.
+        df = self.to_input_dataframe(include_computed_variables=True)
 
         # Extract time period from DataFrame columns
         df_time_period = (
diff --git a/tests/core/test_subsample_invalidates_cache.py b/tests/core/test_subsample_invalidates_cache.py
index 07ae5dad..d2174c1b 100644
--- a/tests/core/test_subsample_invalidates_cache.py
+++ b/tests/core/test_subsample_invalidates_cache.py
@@ -18,10 +18,24 @@
 import pandas as pd
 
 from policyengine_core.country_template import Microsimulation
+from policyengine_core.country_template import Simulation as CountryTemplateSimulation
+from policyengine_core.country_template.entities import Person
 from policyengine_core.data import Dataset
+from policyengine_core.model_api import Variable
+from policyengine_core.periods import YEAR
 from policyengine_core.periods import period as make_period
 
 
+class person_id(Variable):
+    value_type = int
+    entity = Person
+    definition_period = YEAR
+    label = "Formula-backed person ID for subsample regression tests."
+
+    def formula(person, period):
+        return np.arange(person.count)
+
+
 def _build_mini_dataset() -> Dataset:
     """Build a 5-household / 10-person in-memory dataset for subsample tests."""
     df = pd.DataFrame(
@@ -40,6 +54,34 @@ def _build_mini_dataset() -> Dataset:
     return Dataset.from_dataframe(df, "2022")
 
 
+def _build_formula_backed_id_simulation(
+    isolated_tax_benefit_system,
+) -> CountryTemplateSimulation:
+    isolated_tax_benefit_system.replace_variable(person_id)
+    return CountryTemplateSimulation(
+        tax_benefit_system=isolated_tax_benefit_system,
+        dataset=_build_mini_dataset(),
+    )
+
+
+def test_subsample_preserves_formula_backed_structural_ids(
+    isolated_tax_benefit_system,
+) -> None:
+    """Subsampling needs IDs that safe public exports intentionally omit."""
+    sim = _build_formula_backed_id_simulation(isolated_tax_benefit_system)
+
+    safe_columns = sim.to_input_dataframe().columns
+    full_columns = sim.to_input_dataframe(include_computed_variables=True).columns
+
+    assert "person_id__2022" not in safe_columns
+    assert "person_id__2022" in full_columns
+
+    sim.subsample(n=1, seed="formula-backed-person-id")
+
+    assert sim.populations["household"].count == 1
+    assert sim.persons.count == 2
+
+
 def test_subsample_clears_stale_fast_cache_entries() -> None:
     """A pre-subsample entry in ``_fast_cache`` must not survive subsample.
 