Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/subsample-structural-ids.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Preserve computed structural dataset variables when subsampling simulations.
5 changes: 3 additions & 2 deletions policyengine_core/simulations/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1753,8 +1753,9 @@ def subsample(
if time_period is None:
time_period = self.default_calculation_period

# Convert simulation inputs to DataFrame
df = self.to_input_dataframe()
# Subsampling rebuilds the complete dataset, so preserve computed
# structural variables such as formula-backed IDs.
df = self.to_input_dataframe(include_computed_variables=True)

# Extract time period from DataFrame columns
df_time_period = (
Expand Down
42 changes: 42 additions & 0 deletions tests/core/test_subsample_invalidates_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,24 @@
import pandas as pd

from policyengine_core.country_template import Microsimulation
from policyengine_core.country_template import Simulation as CountryTemplateSimulation
from policyengine_core.country_template.entities import Person
from policyengine_core.data import Dataset
from policyengine_core.model_api import Variable
from policyengine_core.periods import YEAR
from policyengine_core.periods import period as make_period


class person_id(Variable):
    # Test-only variable: an integer, Person-level, yearly ``person_id``
    # whose values come from a formula rather than from dataset input.
    # It is installed on the tax-benefit system with ``replace_variable``
    # by the simulation-building helper in this module.
    value_type = int
    entity = Person
    definition_period = YEAR
    label = "Formula-backed person ID for subsample regression tests."

    def formula(person, period):
        """Return one sequential ID per person: ``0 .. person.count - 1``.

        ``period`` is accepted but not used by this formula.
        NOTE(review): ``np`` is presumably numpy, imported above the
        visible portion of this file -- confirm.
        """
        return np.arange(person.count)


def _build_mini_dataset() -> Dataset:
"""Build a 5-household / 10-person in-memory dataset for subsample tests."""
df = pd.DataFrame(
Expand All @@ -40,6 +54,34 @@ def _build_mini_dataset() -> Dataset:
return Dataset.from_dataframe(df, "2022")


def _build_formula_backed_id_simulation(
    isolated_tax_benefit_system,
) -> CountryTemplateSimulation:
    """Return a country-template simulation whose ``person_id`` is computed.

    The supplied tax-benefit system is modified in place: the module-level
    formula-backed ``person_id`` variable is installed via
    ``replace_variable`` before the simulation is constructed on top of the
    mini dataset.
    """
    # Install the formula-backed variable first so the simulation is built
    # against the modified system.
    isolated_tax_benefit_system.replace_variable(person_id)

    mini_dataset = _build_mini_dataset()
    simulation = CountryTemplateSimulation(
        tax_benefit_system=isolated_tax_benefit_system,
        dataset=mini_dataset,
    )
    return simulation


def test_subsample_preserves_formula_backed_structural_ids(
    isolated_tax_benefit_system,
) -> None:
    """Subsample must succeed when ``person_id`` is formula-backed.

    The default input export leaves the computed ID column out, while the
    export with ``include_computed_variables=True`` contains it. After
    subsampling to a single household, the simulation must hold exactly
    one household and its two persons.
    """
    id_column = "person_id__2022"
    sim = _build_formula_backed_id_simulation(isolated_tax_benefit_system)

    # Export both views before asserting, so both exports run pre-subsample.
    default_export_columns = sim.to_input_dataframe().columns
    computed_export_columns = sim.to_input_dataframe(
        include_computed_variables=True
    ).columns

    assert id_column not in default_export_columns
    assert id_column in computed_export_columns

    sim.subsample(n=1, seed="formula-backed-person-id")

    household_population = sim.populations["household"]
    assert household_population.count == 1
    assert sim.persons.count == 2


def test_subsample_clears_stale_fast_cache_entries() -> None:
"""A pre-subsample entry in ``_fast_cache`` must not survive subsample.

Expand Down
Loading