Skip to content

Commit ad47c66

Browse files
Merge pull request #202 from JesperDramsch/claude/fix-csv-merge-loss-4WPng
fix(merge): prevent silent data loss in CSV merge pipeline
2 parents f54514d + 2a6642c commit ad47c66

File tree

8 files changed

+658
-43
lines changed

8 files changed

+658
-43
lines changed

tests/test_merge_no_data_loss.py

Lines changed: 513 additions & 0 deletions
Large diffs are not rendered by default.

utils/import_python_official.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,8 @@ def main(year=None, base="") -> bool:
270270
df_diff = pd.concat([df_ics_old, df_ics]).drop_duplicates(keep=False)
271271

272272
# Deduplicate the new dataframe
273-
df_ics = deduplicate(df_diff, "conference")
273+
# CRITICAL: Must group by both conference AND year to avoid losing multi-year entries
274+
df_ics = deduplicate(df_diff, ["conference", "year"])
274275

275276
if df_ics.empty:
276277
logger.info("No new conferences found in official Python source.")

utils/import_python_organizers.py

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Standard library
2-
import re
32
from datetime import datetime
43
from datetime import timezone
54
from pathlib import Path
@@ -16,6 +15,7 @@
1615
from tidy_conf import merge_conferences
1716
from tidy_conf.deduplicate import deduplicate
1817
from tidy_conf.schema import get_schema
18+
from tidy_conf.titles import normalize_conference_name
1919
from tidy_conf.utils import fill_missing_required
2020
from tidy_conf.yaml import load_title_mappings
2121
from tidy_conf.yaml import write_df_yaml
@@ -25,6 +25,7 @@
2525
from .tidy_conf import merge_conferences
2626
from .tidy_conf.deduplicate import deduplicate
2727
from .tidy_conf.schema import get_schema
28+
from .tidy_conf.titles import normalize_conference_name
2829
from .tidy_conf.utils import fill_missing_required
2930
from .tidy_conf.yaml import load_title_mappings
3031
from .tidy_conf.yaml import write_df_yaml
@@ -269,10 +270,11 @@ def main(year: int | None = None, base: str = "") -> None:
269270

270271
# Load and apply the title mappings
271272
_, known_mappings = load_title_mappings(reverse=True)
272-
df_csv_standardized["conference"] = (
273-
df_csv_standardized["conference"]
274-
.replace(re.compile(r"\b\s+(19|20)\d{2}\s*\b"), "", regex=True)
275-
.replace(known_mappings)
273+
274+
# CRITICAL: Use normalize_conference_name for CONSISTENT normalization
275+
# This ensures the same normalization is used here AND in mapping_dict later
276+
df_csv_standardized["conference"] = df_csv_standardized["conference"].apply(
277+
lambda x: normalize_conference_name(x, known_mappings),
276278
)
277279

278280
# Store the new csv dataframe to cache (with original names)
@@ -282,7 +284,9 @@ def main(year: int | None = None, base: str = "") -> None:
282284
# _ = pd.concat([df_csv_old, df_csv_raw]).drop_duplicates(keep=False)
283285

284286
# Deduplicate the new dataframe (with standardized names for merging)
285-
df_csv_for_merge = deduplicate(df_csv_standardized, "conference")
287+
# CRITICAL: Must group by both conference AND year to avoid losing multi-year entries
288+
# (e.g., "PyCon USA 2025" and "PyCon USA 2026" both normalize to "PyCon USA")
289+
df_csv_for_merge = deduplicate(df_csv_standardized, ["conference", "year"])
286290

287291
if df_csv_for_merge.empty:
288292
print("No new conferences found in Python organiser source.")
@@ -345,23 +349,52 @@ def main(year: int | None = None, base: str = "") -> None:
345349
df_csv_output = df_csv_raw.copy()
346350

347351
# Map from the standardized data back to original
352+
# CRITICAL: Use the SAME normalization function as tidy_df_names to avoid data loss
348353
mapping_dict = {}
349354
for idx, row in df_csv_raw.iterrows():
350-
standardized_conf = re.sub(r"\b\s+(19|20)\d{2}\s*\b", "", row["conference"])
351-
if standardized_conf in known_mappings:
352-
standardized_conf = known_mappings[standardized_conf]
355+
# Use normalize_conference_name for consistent normalization
356+
standardized_conf = normalize_conference_name(row["conference"], known_mappings)
353357
mapping_key = (standardized_conf, row["year"])
354358
mapping_dict[mapping_key] = idx
355359

360+
# Track entries that matched and those that didn't for debugging
361+
matched_keys = set()
362+
unmatched_entries = []
363+
356364
# Update the CSV output with information from the merged data
357365
for _, row in df_new.iterrows():
358366
key = (row["conference"], row["year"])
359367
if key in mapping_dict:
360368
original_idx = mapping_dict[key]
369+
matched_keys.add(key)
361370
# Update only fields that were potentially enriched during merge
362371
for col in ["start", "end", "cfp", "link", "cfp_link", "sponsor", "finaid"]:
363372
if col in row and pd.notna(row[col]):
364373
df_csv_output.at[original_idx, col] = row[col]
374+
else:
375+
# Track entries that didn't match for potential debugging
376+
unmatched_entries.append(
377+
{"conference": row["conference"], "year": row["year"]},
378+
)
379+
380+
# Log any unmatched entries for debugging (these may be legitimately new)
381+
if unmatched_entries:
382+
logger.debug(
383+
f"Found {len(unmatched_entries)} entries in df_new not in mapping_dict "
384+
"(may be new conferences from YAML):",
385+
)
386+
for entry in unmatched_entries[:5]: # Show first 5
387+
logger.debug(f" - {entry['conference']} ({entry['year']})")
388+
389+
# Verify no silent data loss: all CSV entries should be accounted for
390+
csv_keys_in_mapping = set(mapping_dict.keys())
391+
unmatched_csv = csv_keys_in_mapping - matched_keys
392+
if unmatched_csv:
393+
logger.warning(
394+
f"Potential data loss: {len(unmatched_csv)} CSV entries were not updated:",
395+
)
396+
for key in list(unmatched_csv)[:5]: # Show first 5
397+
logger.warning(f" - {key[0]} ({key[1]})")
365398

366399
# Write the CSV with original names
367400
df_csv_output.loc[:, "Location"] = df_csv_output.place

utils/tidy_conf/data/rejections.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,28 @@ alt_name:
1010
Python Austria:
1111
variations:
1212
- PyCon Australia
13+
PyCon India:
14+
variations:
15+
- PyCon Indonesia
16+
PyCon Indonesia:
17+
variations:
18+
- PyCon India
19+
Swiss Python Summit:
20+
variations:
21+
- Python Sul
22+
Python Sul:
23+
variations:
24+
- Swiss Python Summit
25+
PyCon Latam:
26+
variations:
27+
- PyCon Latin America
28+
PyCon Latin America:
29+
variations:
30+
- PyCon Latam
31+
Plone Conference:
32+
variations:
33+
- AfroPython Conference
34+
AfroPython Conference:
35+
variations:
36+
- Plone Conference
1337
spelling: []

utils/tidy_conf/data/titles.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ alt_name:
158158
variations:
159159
- PyLadies Conference
160160
- PyLadies Con
161+
ExistingConf:
162+
variations:
163+
- New Variation
161164
spelling:
162165
- DjangoCon
163166
- EuroPython

utils/tidy_conf/interactive_merge.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -564,20 +564,25 @@ def merge_conferences(
564564
elif " " not in rx and " " in ry:
565565
cfp_time_x = " " + ry.split(" ")[1]
566566

567+
# Check if the cfp_ext columns exist before accessing them
568+
# These columns may not exist if one dataframe doesn't have cfp_ext
569+
cfp_ext_x = row.get("cfp_ext_x") if "cfp_ext_x" in row.index else None
570+
cfp_ext_y = row.get("cfp_ext_y") if "cfp_ext_y" in row.index else None
571+
567572
# Check if the cfp_ext is the same and if so update the cfp
568-
if rx + cfp_time_x == row["cfp_ext_x"]:
573+
if cfp_ext_x is not None and rx + cfp_time_x == cfp_ext_x:
569574
df_new.loc[i, "cfp"] = ry + cfp_time_y
570575
df_new.loc[i, "cfp_ext"] = rx + cfp_time_x
571576
continue
572-
if ry + cfp_time_y == row["cfp_ext_y"]:
577+
if cfp_ext_y is not None and ry + cfp_time_y == cfp_ext_y:
573578
df_new.loc[i, "cfp"] = rx + cfp_time_x
574579
df_new.loc[i, "cfp_ext"] = ry + cfp_time_y
575580
continue
576-
if rx + cfp_time_x == row["cfp_ext_y"]:
581+
if cfp_ext_y is not None and rx + cfp_time_x == cfp_ext_y:
577582
df_new.loc[i, "cfp"] = ry + cfp_time_y
578583
df_new.loc[i, "cfp_ext"] = rx + cfp_time_x
579584
continue
580-
if ry + cfp_time_y == row["cfp_ext_x"]:
585+
if cfp_ext_x is not None and ry + cfp_time_y == cfp_ext_x:
581586
df_new.loc[i, "cfp"] = rx + cfp_time_x
582587
df_new.loc[i, "cfp_ext"] = ry + cfp_time_y
583588
continue

utils/tidy_conf/titles.py

Lines changed: 62 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -119,55 +119,88 @@ def expand_country_codes(name):
119119
return name
120120

121121

122-
def tidy_df_names(df):
123-
"""Tidy up the conference names in a consistent way.
124-
125-
Normalizes conference names by:
126-
1. Removing years from names
127-
2. Expanding country codes to full names (e.g., "PyCon PL" -> "PyCon Poland")
128-
3. Normalizing spacing and punctuation
129-
4. Applying known mappings from titles.yml
122+
def normalize_conference_name(name: str, known_mappings: dict | None = None) -> str:
123+
"""Normalize a single conference name to a canonical form.
124+
125+
This is the single source of truth for conference name normalization.
126+
Both DataFrame-level normalization (tidy_df_names) and individual lookups
127+
should use this function to ensure consistency.
128+
129+
Parameters
130+
----------
131+
name : str
132+
Conference name to normalize
133+
known_mappings : dict, optional
134+
Mapping of known variations to canonical names. If None, will be loaded.
135+
136+
Returns
137+
-------
138+
str
139+
Normalized conference name
130140
"""
131-
# Load known title mappings
132-
_, known_mappings = load_title_mappings(reverse=True)
141+
if not name or not isinstance(name, str):
142+
return name if isinstance(name, str) else ""
133143

134-
# Define regex patterns for matching years and conference names
135-
# Match years with or without leading space
144+
# Load known mappings if not provided
145+
if known_mappings is None:
146+
_, known_mappings = load_title_mappings(reverse=True)
147+
148+
# Define regex patterns (same as tidy_df_names)
136149
regex_year = re.compile(r"\b\s*(19|20)\d{2}\s*\b")
137150
regex_py = re.compile(r"\b(Python|PyCon)\b")
138151

139-
# Harmonize conference titles using known mappings and regex
140-
series = df["conference"].copy()
152+
result = name
141153

142-
# Remove years from conference names
143-
series = series.str.replace(regex_year, " ", regex=True)
154+
# Remove years from conference names (replace with space to avoid concatenation)
155+
result = regex_year.sub(" ", result)
144156

145157
# Add a space after Python or PyCon
146-
series = series.str.replace(regex_py, r" \1 ", regex=True)
158+
result = regex_py.sub(r" \1 ", result)
147159

148-
# Replace non-word characters
149-
series = series.str.replace(r"[\+]", " ", regex=True)
160+
# Replace non-word characters like +
161+
result = re.sub(r"[\+]", " ", result)
150162

151163
# Replace the word Conference
152-
series = series.str.replace(r"\bConf\b", "Conference", regex=True)
164+
result = re.sub(r"\bConf\b", "Conference", result)
153165

154166
# Remove extra spaces
155-
series = series.str.replace(r"\s+", " ", regex=True)
167+
result = re.sub(r"\s+", " ", result)
156168

157169
# Remove leading and trailing whitespace
158-
series = series.str.strip()
170+
result = result.strip()
159171

160-
# Expand country codes to full names BEFORE applying known mappings
161-
# This ensures "PyCon PL" becomes "PyCon Poland" which can then match
162-
series = series.apply(expand_country_codes)
172+
# Apply known mappings FIRST (mappings may contain unexpanded country codes)
173+
if result in known_mappings:
174+
result = known_mappings[result]
163175

164-
# Replace known mappings (from titles.yml)
165-
series = series.replace(known_mappings)
176+
# Expand country codes to full names AFTER mappings
177+
# This ensures idempotency: normalize(normalize(x)) == normalize(x)
178+
result = expand_country_codes(result)
166179

167180
# Final cleanup
168-
series = series.str.strip()
181+
result = result.strip()
182+
183+
return result
184+
185+
186+
def tidy_df_names(df):
187+
"""Tidy up the conference names in a consistent way.
188+
189+
Normalizes conference names by:
190+
1. Removing years from names
191+
2. Expanding country codes to full names (e.g., "PyCon PL" -> "PyCon Poland")
192+
3. Normalizing spacing and punctuation
193+
4. Applying known mappings from titles.yml
194+
195+
Uses normalize_conference_name() internally for consistency.
196+
"""
197+
# Load known title mappings once for efficiency
198+
_, known_mappings = load_title_mappings(reverse=True)
169199

200+
# Apply normalization using the shared function
170201
df = df.copy()
171-
df.loc[:, "conference"] = series
202+
df.loc[:, "conference"] = df["conference"].apply(
203+
lambda x: normalize_conference_name(x, known_mappings),
204+
)
172205

173206
return df

utils/tidy_conf/yaml.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ def update_title_mappings(data, path="utils/tidy_conf/data/titles.yml"):
192192
continue
193193
if key not in title_data["alt_name"]:
194194
title_data["alt_name"][key] = {"variations": []}
195+
# Ensure 'variations' key exists (may be missing in malformed data)
196+
if "variations" not in title_data["alt_name"][key]:
197+
title_data["alt_name"][key]["variations"] = []
195198
for value in values:
196199
if value not in title_data["alt_name"][key]["variations"]:
197200
title_data["alt_name"][key]["variations"].append(value)

0 commit comments

Comments (0)