 # Standard library
-import re
 from datetime import datetime
 from datetime import timezone
 from pathlib import Path
 from tidy_conf import merge_conferences
 from tidy_conf.deduplicate import deduplicate
 from tidy_conf.schema import get_schema
+from tidy_conf.titles import normalize_conference_name
 from tidy_conf.utils import fill_missing_required
 from tidy_conf.yaml import load_title_mappings
 from tidy_conf.yaml import write_df_yaml
 from .tidy_conf import merge_conferences
 from .tidy_conf.deduplicate import deduplicate
 from .tidy_conf.schema import get_schema
+from .tidy_conf.titles import normalize_conference_name
 from .tidy_conf.utils import fill_missing_required
 from .tidy_conf.yaml import load_title_mappings
 from .tidy_conf.yaml import write_df_yaml
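
normalize_conference_name itself is not shown in this diff. Judging from its call
sites below and from the inline logic it replaces (a year-stripping regex followed by
a known_mappings lookup), a minimal sketch could look like the following; only the
name and signature come from the diff, the body is an assumption.

    # Hypothetical sketch of tidy_conf/titles.py -- not the actual implementation.
    import re

    _YEAR_RE = re.compile(r"\b\s+(19|20)\d{2}\s*\b")  # regex taken from the removed code

    def normalize_conference_name(name: str, known_mappings: dict) -> str:
        """Strip a trailing year token, then apply any known title mapping."""
        stripped = _YEAR_RE.sub("", name).strip()
        return known_mappings.get(stripped, stripped)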
@@ -269,10 +270,11 @@ def main(year: int | None = None, base: str = "") -> None:
 
     # Load and apply the title mappings
     _, known_mappings = load_title_mappings(reverse=True)
-    df_csv_standardized["conference"] = (
-        df_csv_standardized["conference"]
-        .replace(re.compile(r"\b\s+(19|20)\d{2}\s*\b"), "", regex=True)
-        .replace(known_mappings)
+
+    # CRITICAL: Use normalize_conference_name for CONSISTENT normalization
+    # This ensures the same normalization is used here AND in mapping_dict later
+    df_csv_standardized["conference"] = df_csv_standardized["conference"].apply(
+        lambda x: normalize_conference_name(x, known_mappings),
     )
 
     # Store the new csv dataframe to cache (with original names)
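
With the sketch above, the .apply call pushes every title through a single code path,
so the merge frame and the reverse mapping built later cannot drift apart. A quick
illustration on invented data:

    import pandas as pd

    known = {"EuroSciPy": "EuroSciPy - European SciPy Conference"}  # invented mapping
    df = pd.DataFrame({"conference": ["PyCon US 2025", "EuroSciPy 2025"]})
    df["conference"] = df["conference"].apply(
        lambda x: normalize_conference_name(x, known),
    )
    print(df["conference"].tolist())
    # ['PyCon US', 'EuroSciPy - European SciPy Conference'] under the sketch above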
@@ -282,7 +284,9 @@ def main(year: int | None = None, base: str = "") -> None:
     # _ = pd.concat([df_csv_old, df_csv_raw]).drop_duplicates(keep=False)
 
     # Deduplicate the new dataframe (with standardized names for merging)
-    df_csv_for_merge = deduplicate(df_csv_standardized, "conference")
+    # CRITICAL: Must group by both conference AND year to avoid losing multi-year entries
+    # (e.g., "PyCon USA 2025" and "PyCon USA 2026" both normalize to "PyCon USA")
+    df_csv_for_merge = deduplicate(df_csv_standardized, ["conference", "year"])
 
     if df_csv_for_merge.empty:
         print("No new conferences found in Python organiser source.")
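
The two-column key matters because, once years are stripped from the titles, distinct
editions collide on the name alone. Assuming deduplicate keys on the given column(s)
roughly like pandas.DataFrame.drop_duplicates, the failure mode is easy to reproduce
with invented rows:

    import pandas as pd

    df = pd.DataFrame(
        {"conference": ["PyCon USA", "PyCon USA"], "year": [2025, 2026]},
    )
    print(len(df.drop_duplicates(subset=["conference"])))           # 1 -- the 2026 edition is lost
    print(len(df.drop_duplicates(subset=["conference", "year"])))   # 2 -- both editions kept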
@@ -345,23 +349,52 @@ def main(year: int | None = None, base: str = "") -> None:
     df_csv_output = df_csv_raw.copy()
 
     # Map from the standardized data back to original
+    # CRITICAL: Use the SAME normalization function as tidy_df_names to avoid data loss
     mapping_dict = {}
     for idx, row in df_csv_raw.iterrows():
-        standardized_conf = re.sub(r"\b\s+(19|20)\d{2}\s*\b", "", row["conference"])
-        if standardized_conf in known_mappings:
-            standardized_conf = known_mappings[standardized_conf]
+        # Use normalize_conference_name for consistent normalization
+        standardized_conf = normalize_conference_name(row["conference"], known_mappings)
         mapping_key = (standardized_conf, row["year"])
         mapping_dict[mapping_key] = idx
 
+    # Track entries that matched and those that didn't for debugging
+    matched_keys = set()
+    unmatched_entries = []
+
     # Update the CSV output with information from the merged data
     for _, row in df_new.iterrows():
         key = (row["conference"], row["year"])
         if key in mapping_dict:
             original_idx = mapping_dict[key]
+            matched_keys.add(key)
             # Update only fields that were potentially enriched during merge
             for col in ["start", "end", "cfp", "link", "cfp_link", "sponsor", "finaid"]:
                 if col in row and pd.notna(row[col]):
                     df_csv_output.at[original_idx, col] = row[col]
+        else:
+            # Track entries that didn't match for potential debugging
+            unmatched_entries.append(
+                {"conference": row["conference"], "year": row["year"]},
+            )
+
+    # Log any unmatched entries for debugging (these may be legitimately new)
+    if unmatched_entries:
+        logger.debug(
+            f"Found {len(unmatched_entries)} entries in df_new not in mapping_dict "
+            "(may be new conferences from YAML):",
+        )
+        for entry in unmatched_entries[:5]:  # Show first 5
+            logger.debug(f"  - {entry['conference']} ({entry['year']})")
+
+    # Verify no silent data loss: all CSV entries should be accounted for
+    csv_keys_in_mapping = set(mapping_dict.keys())
+    unmatched_csv = csv_keys_in_mapping - matched_keys
+    if unmatched_csv:
+        logger.warning(
+            f"Potential data loss: {len(unmatched_csv)} CSV entries were not updated:",
+        )
+        for key in list(unmatched_csv)[:5]:  # Show first 5
+            logger.warning(f"  - {key[0]} ({key[1]})")
 
     # Write the CSV with original names
     df_csv_output.loc[:, "Location"] = df_csv_output.place
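
The matched/unmatched bookkeeping turns what used to be silent row drops into visible
debug and warning output without changing the merge behaviour. One caveat: the new
logging calls assume a module-level logger that this diff does not add; if the script
does not already define one, the standard setup would be:

    import logging

    logger = logging.getLogger(__name__)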