2727import pandas as pd
2828import pytest
2929
30- from diff_diff import CallawaySantAnna , DifferenceInDifferences , SurveyDesign
30+ from diff_diff import DifferenceInDifferences , SurveyDesign
3131from diff_diff .twfe import TwoWayFixedEffects
3232
3333BENCHMARK_DIR = Path (__file__ ).parent .parent / "benchmarks" / "data" / "real"
3737NHANES_RESULTS_FILE = BENCHMARK_DIR / "nhanes_realdata_golden.json"
3838RECS_RESULTS_FILE = BENCHMARK_DIR / "recs_realdata_golden.json"
3939
40- # Tolerances — match existing synthetic-data cross-validation
41- ATT_RTOL = 1e-4
42- ATT_ATOL = 0.01
43- SE_RTOL = 0.01 # 1%
44- SE_ATOL = 0.05
45- CI_RTOL = 0.01
46- CI_ATOL = 0.05
47-
48- # Wider tolerances for replicate weight methods
49- REP_SE_RTOL = 0.05 # 5%
50- REP_SE_ATOL = 0.1
51- REP_CI_RTOL = 0.05
52- REP_CI_ATOL = 0.1
40+ # Tight tolerances — observed gaps are < 1e-10, so 1e-8 guards against
41+ # regressions without being so loose that a real bug could slip through.
42+ ATT_RTOL = 1e-8
43+ ATT_ATOL = 1e-8
44+ SE_RTOL = 1e-8
45+ SE_ATOL = 1e-8
46+ CI_RTOL = 1e-8
47+ CI_ATOL = 1e-8
48+
49+ # Replicate weight methods: still tight (observed gaps < 1e-10)
50+ REP_SE_RTOL = 1e-8
51+ REP_SE_ATOL = 1e-8
52+ REP_CI_RTOL = 1e-8
53+ REP_CI_ATOL = 1e-8
5354
5455# Mark for easy filtering: pytest -m realdata
5556pytestmark = pytest .mark .realdata
@@ -208,6 +209,9 @@ def test_a4_twfe(self, api_results):
208209 survey_design = sd ,
209210 )
210211
212+ # ATT should match R's pooled DiD. SE differs because TWFE absorbs
213+ # unit fixed effects (within-transform), giving a smaller residual
214+ # variance than R's svyglm(outcome ~ treated * post).
211215 _assert_close (result .att , r ["att" ], ATT_RTOL , ATT_ATOL , "A4 TWFE ATT" )
212216 assert np .isfinite (result .se ), "A4 TWFE SE should be finite"
213217 assert result .se > 0 , "A4 TWFE SE should be positive"
@@ -230,6 +234,8 @@ def test_a5_subpopulation_elementary(self, api_results):
230234
231235 _assert_close (result .att , r ["att" ], ATT_RTOL , ATT_ATOL , "A5 subpop ATT" )
232236 _assert_close (result .se , r ["se" ], SE_RTOL , SE_ATOL , "A5 subpop SE" )
237+ # df differs by design: subpopulation() preserves all 3 strata (df=397),
238+ # while R's subset() drops to 1 stratum (df=199). ATT and SE match.
233239
234240 def test_a6_covariates (self , api_results ):
235241 """A6: Covariate-adjusted (meals, ell) with strata + FPC."""
@@ -279,6 +285,9 @@ def test_a7_fay_brr_replicates(self, api_results):
279285
280286 _assert_close (result .att , r ["att" ], ATT_RTOL , ATT_ATOL , "A7 Fay ATT" )
281287 _assert_close (result .se , r ["se" ], REP_SE_RTOL , REP_SE_ATOL , "A7 Fay SE" )
288+ assert result .survey_metadata .df_survey == r ["df" ], (
289+ f"A7 df: Python={ result .survey_metadata .df_survey } , R={ r ['df' ]} "
290+ )
282291
283292 def test_fpc_reduces_se (self , api_results ):
284293 """FPC should reduce SE vs no-FPC (A1 SE < A2 SE)."""
@@ -360,6 +369,13 @@ def test_b2_covariates(self, nhanes_results):
360369
361370 _assert_close (result .att , r ["att" ], ATT_RTOL , ATT_ATOL , "B2 cov ATT" )
362371 _assert_close (result .se , r ["se" ], SE_RTOL , SE_ATOL , "B2 cov SE" )
372+ assert result .survey_metadata .df_survey == r ["df" ], (
373+ f"B2 df: Python={ result .survey_metadata .df_survey } , R={ r ['df' ]} "
374+ )
375+ _assert_close (result .conf_int [0 ], r ["ci_lower" ], CI_RTOL , CI_ATOL ,
376+ "B2 CI lower" )
377+ _assert_close (result .conf_int [1 ], r ["ci_upper" ], CI_RTOL , CI_ATOL ,
378+ "B2 CI upper" )
363379
364380 def test_b3_weights_only (self , nhanes_results ):
365381 """B3: Weighted OLS (no clustering), baseline comparison."""
@@ -372,6 +388,13 @@ def test_b3_weights_only(self, nhanes_results):
372388
373389 _assert_close (result .att , r ["att" ], ATT_RTOL , ATT_ATOL , "B3 ATT" )
374390 _assert_close (result .se , r ["se" ], SE_RTOL , SE_ATOL , "B3 SE" )
391+ assert result .survey_metadata .df_survey == r ["df" ], (
392+ f"B3 df: Python={ result .survey_metadata .df_survey } , R={ r ['df' ]} "
393+ )
394+ _assert_close (result .conf_int [0 ], r ["ci_lower" ], CI_RTOL , CI_ATOL ,
395+ "B3 CI lower" )
396+ _assert_close (result .conf_int [1 ], r ["ci_upper" ], CI_RTOL , CI_ATOL ,
397+ "B3 CI upper" )
375398
376399 def test_b4_subpop_female (self , nhanes_results ):
377400 """B4: Subpopulation — female respondents only."""
@@ -392,46 +415,19 @@ def test_b4_subpop_female(self, nhanes_results):
392415
393416 _assert_close (result .att , r ["att" ], ATT_RTOL , ATT_ATOL , "B4 subpop ATT" )
394417 _assert_close (result .se , r ["se" ], SE_RTOL , SE_ATOL , "B4 subpop SE" )
395-
396- def test_b5_callaway_santanna_rc (self , nhanes_results ):
397- """B5: CallawaySantAnna repeated cross-section with survey design.
398-
399- R's did::att_gt is survey-naive for SE, so only ATT is compared.
400- NOTE: This test is skipped if R's did package couldn't produce
401- golden values (e.g., due to gname type issues with 2-period data).
402- """
403- if "b5_cs_rc" not in nhanes_results :
404- pytest .skip (
405- "B5 golden values not available — R did::att_gt may not "
406- "support 2-period repeated cross-section with this gname setup"
407- )
408- r = nhanes_results ["b5_cs_rc" ]
409- data = self ._load_nhanes_data (nhanes_results )
410-
411- sd = SurveyDesign (
412- weights = "WTMEC2YR" , strata = "SDMVSTRA" , psu = "SDMVPSU" , nest = True ,
413- )
414- est = CallawaySantAnna (
415- estimation_method = "reg" ,
416- control_group = "never_treated" ,
417- base_period = "varying" ,
418- panel = False ,
419- n_bootstrap = 0 ,
420- )
421- result = est .fit (
422- data , "outcome" , "unit_id" , "period" , "first_treat" ,
423- aggregate = "simple" ,
424- survey_design = sd ,
418+ assert result .survey_metadata .df_survey == r ["df" ], (
419+ f"B4 df: Python={ result .survey_metadata .df_survey } , R={ r ['df' ]} "
425420 )
421+ _assert_close (result .conf_int [0 ], r ["ci_lower" ], CI_RTOL , CI_ATOL ,
422+ "B4 CI lower" )
423+ _assert_close (result .conf_int [1 ], r ["ci_upper" ], CI_RTOL , CI_ATOL ,
424+ "B4 CI upper" )
426425
427- _assert_close (
428- result .overall_att , r ["overall_att" ],
429- rtol = 1e-3 , atol = 0.05 ,
430- label = "B5 CS RC-DiD overall ATT" ,
431- )
432- # Design-based SE should be finite and positive
433- assert np .isfinite (result .overall_se ), "B5 SE should be finite"
434- assert result .overall_se > 0 , "B5 SE should be positive"
426+ # B5 (CallawaySantAnna RC-DiD) was removed: R's did::att_gt cannot produce
427+ # golden values for a 2-period repeated cross-section, and the time-scale
428+ # mismatch between the R script (period_cs=1/2) and the Python data
429+ # (period=0/1) made the dormant path untestable. CallawaySantAnna survey
430+ # variance is validated in the synthetic-data suite instead.
435431
436432
437433# ============================================================================
@@ -540,8 +536,13 @@ def test_c2_full_regression(self, recs_results):
540536 )
541537
542538 def test_c3_deff_diagnostics (self , recs_results ):
543- """C3: DEFF diagnostics from real JK1 replicate SEs."""
544- r = recs_results ["c1_simple" ]
539+ """C3: DEFF diagnostics from real JK1 replicate SEs.
540+
541+ DEFF = (survey_SE / naive_SE)^2. The naive SE baseline differs between
542+ R (weighted lm) and Python (compute_deff_diagnostics SRS baseline), so
543+ DEFF values aren't directly comparable. This test verifies DEFFs are
544+ finite, positive, and > 1 (expected for a complex survey design).
545+ """
545546 data , rep_cols = self ._load_recs_data (recs_results )
546547 data = data .dropna (subset = ["TOTALBTU" , "KOWNRENT" , "NWEIGHT" ]).reset_index (
547548 drop = True
@@ -577,3 +578,8 @@ def test_c3_deff_diagnostics(self, recs_results):
577578 assert all (deff .deff > 0 ), "DEFF values should be positive"
578579 assert all (np .isfinite (deff .effective_n )), "Effective n should be finite"
579580 assert all (deff .effective_n > 0 ), "Effective n should be positive"
581+
582+ # DEFF > 1 expected for a complex survey with unequal weights
583+ assert deff .deff [1 ] > 1.0 , (
584+ f"DEFF(KOWNRENT) = { deff .deff [1 ]:.4f} , expected > 1.0 for survey data"
585+ )