Skip to content

Commit ad47c66

Browse files
Merge pull request #202 from JesperDramsch/claude/fix-csv-merge-loss-4WPng
fix(merge): prevent silent data loss in CSV merge pipeline
2 parents f54514d + 2a6642c commit ad47c66

File tree

8 files changed

+658
-43
lines changed

8 files changed

+658
-43
lines changed

tests/test_merge_no_data_loss.py

Lines changed: 513 additions & 0 deletions
Large diffs are not rendered by default.

utils/import_python_official.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,8 @@ def main(year=None, base="") -> bool:
270270
df_diff = pd.concat([df_ics_old, df_ics]).drop_duplicates(keep=False)
271271

272272
# Deduplicate the new dataframe
273-
df_ics = deduplicate(df_diff, "conference")
273+
# CRITICAL: Must group by both conference AND year to avoid losing multi-year entries
274+
df_ics = deduplicate(df_diff, ["conference", "year"])
274275

275276
if df_ics.empty:
276277
logger.info("No new conferences found in official Python source.")

utils/import_python_organizers.py

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Standard library
2-
import re
32
from datetime import datetime
43
from datetime import timezone
54
from pathlib import Path
@@ -16,6 +15,7 @@
1615
from tidy_conf import merge_conferences
1716
from tidy_conf.deduplicate import deduplicate
1817
from tidy_conf.schema import get_schema
18+
from tidy_conf.titles import normalize_conference_name
1919
from tidy_conf.utils import fill_missing_required
2020
from tidy_conf.yaml import load_title_mappings
2121
from tidy_conf.yaml import write_df_yaml
@@ -25,6 +25,7 @@
2525
from .tidy_conf import merge_conferences
2626
from .tidy_conf.deduplicate import deduplicate
2727
from .tidy_conf.schema import get_schema
28+
from .tidy_conf.titles import normalize_conference_name
2829
from .tidy_conf.utils import fill_missing_required
2930
from .tidy_conf.yaml import load_title_mappings
3031
from .tidy_conf.yaml import write_df_yaml
@@ -269,10 +270,11 @@ def main(year: int | None = None, base: str = "") -> None:
269270

270271
# Load and apply the title mappings
271272
_, known_mappings = load_title_mappings(reverse=True)
272-
df_csv_standardized["conference"] = (
273-
df_csv_standardized["conference"]
274-
.replace(re.compile(r"\b\s+(19|20)\d{2}\s*\b"), "", regex=True)
275-
.replace(known_mappings)
273+
274+
# CRITICAL: Use normalize_conference_name for CONSISTENT normalization
275+
# This ensures the same normalization is used here AND in mapping_dict later
276+
df_csv_standardized["conference"] = df_csv_standardized["conference"].apply(
277+
lambda x: normalize_conference_name(x, known_mappings),
276278
)
277279

278280
# Store the new csv dataframe to cache (with original names)
@@ -282,7 +284,9 @@ def main(year: int | None = None, base: str = "") -> None:
282284
# _ = pd.concat([df_csv_old, df_csv_raw]).drop_duplicates(keep=False)
283285

284286
# Deduplicate the new dataframe (with standardized names for merging)
285-
df_csv_for_merge = deduplicate(df_csv_standardized, "conference")
287+
# CRITICAL: Must group by both conference AND year to avoid losing multi-year entries
288+
# (e.g., "PyCon USA 2025" and "PyCon USA 2026" both normalize to "PyCon USA")
289+
df_csv_for_merge = deduplicate(df_csv_standardized, ["conference", "year"])
286290

287291
if df_csv_for_merge.empty:
288292
print("No new conferences found in Python organiser source.")
@@ -345,23 +349,52 @@ def main(year: int | None = None, base: str = "") -> None:
345349
df_csv_output = df_csv_raw.copy()
346350

347351
# Map from the standardized data back to original
352+
# CRITICAL: Use the SAME normalization function as tidy_df_names to avoid data loss
348353
mapping_dict = {}
349354
for idx, row in df_csv_raw.iterrows():
350-
standardized_conf = re.sub(r"\b\s+(19|20)\d{2}\s*\b", "", row["conference"])
351-
if standardized_conf in known_mappings:
352-
standardized_conf = known_mappings[standardized_conf]
355+
# Use normalize_conference_name for consistent normalization
356+
standardized_conf = normalize_conference_name(row["conference"], known_mappings)
353357
mapping_key = (standardized_conf, row["year"])
354358
mapping_dict[mapping_key] = idx
355359

360+
# Track entries that matched and those that didn't for debugging
361+
matched_keys = set()
362+
unmatched_entries = []
363+
356364
# Update the CSV output with information from the merged data
357365
for _, row in df_new.iterrows():
358366
key = (row["conference"], row["year"])
359367
if key in mapping_dict:
360368
original_idx = mapping_dict[key]
369+
matched_keys.add(key)
361370
# Update only fields that were potentially enriched during merge
362371
for col in ["start", "end", "cfp", "link", "cfp_link", "sponsor", "finaid"]:
363372
if col in row and pd.notna(row[col]):
364373
df_csv_output.at[original_idx, col] = row[col]
374+
else:
375+
# Track entries that didn't match for potential debugging
376+
unmatched_entries.append(
377+
{"conference": row["conference"], "year": row["year"]},
378+
)
379+
380+
# Log any unmatched entries for debugging (these may be legitimately new)
381+
if unmatched_entries:
382+
logger.debug(
383+
f"Found {len(unmatched_entries)} entries in df_new not in mapping_dict "
384+
"(may be new conferences from YAML):",
385+
)
386+
for entry in unmatched_entries[:5]: # Show first 5
387+
logger.debug(f" - {entry['conference']} ({entry['year']})")
388+
389+
# Verify no silent data loss: all CSV entries should be accounted for
390+
csv_keys_in_mapping = set(mapping_dict.keys())
391+
unmatched_csv = csv_keys_in_mapping - matched_keys
392+
if unmatched_csv:
393+
logger.warning(
394+
f"Potential data loss: {len(unmatched_csv)} CSV entries were not updated:",
395+
)
396+
for key in list(unmatched_csv)[:5]: # Show first 5
397+
logger.warning(f" - {key[0]} ({key[1]})")
365398

366399
# Write the CSV with original names
367400
df_csv_output.loc[:, "Location"] = df_csv_output.place

utils/tidy_conf/data/rejections.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,28 @@ alt_name:
1010
Python Austria:
1111
variations:
1212
- PyCon Australia
13+
PyCon India:
14+
variations:
15+
- PyCon Indonesia
16+
PyCon Indonesia:
17+
variations:
18+
- PyCon India
19+
Swiss Python Summit:
20+
variations:
21+
- Python Sul
22+
Python Sul:
23+
variations:
24+
- Swiss Python Summit
25+
PyCon Latam:
26+
variations:
27+
- PyCon Latin America
28+
PyCon Latin America:
29+
variations:
30+
- PyCon Latam
31+
Plone Conference:
32+
variations:
33+
- AfroPython Conference
34+
AfroPython Conference:
35+
variations:
36+
- Plone Conference
1337
spelling: []

utils/tidy_conf/data/titles.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ alt_name:
158158
variations:
159159
- PyLadies Conference
160160
- PyLadies Con
161+
ExistingConf:
162+
variations:
163+
- New Variation
161164
spelling:
162165
- DjangoCon
163166
- EuroPython

utils/tidy_conf/interactive_merge.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -564,20 +564,25 @@ def merge_conferences(
564564
elif " " not in rx and " " in ry:
565565
cfp_time_x = " " + ry.split(" ")[1]
566566

567+
# Check if the cfp_ext columns exist before accessing them
568+
# These columns may not exist if one dataframe doesn't have cfp_ext
569+
cfp_ext_x = row.get("cfp_ext_x") if "cfp_ext_x" in row.index else None
570+
cfp_ext_y = row.get("cfp_ext_y") if "cfp_ext_y" in row.index else None
571+
567572
# Check if the cfp_ext is the same and if so update the cfp
568-
if rx + cfp_time_x == row["cfp_ext_x"]:
573+
if cfp_ext_x is not None and rx + cfp_time_x == cfp_ext_x:
569574
df_new.loc[i, "cfp"] = ry + cfp_time_y
570575
df_new.loc[i, "cfp_ext"] = rx + cfp_time_x
571576
continue
572-
if ry + cfp_time_y == row["cfp_ext_y"]:
577+
if cfp_ext_y is not None and ry + cfp_time_y == cfp_ext_y:
573578
df_new.loc[i, "cfp"] = rx + cfp_time_x
574579
df_new.loc[i, "cfp_ext"] = ry + cfp_time_y
575580
continue
576-
if rx + cfp_time_x == row["cfp_ext_y"]:
581+
if cfp_ext_y is not None and rx + cfp_time_x == cfp_ext_y:
577582
df_new.loc[i, "cfp"] = ry + cfp_time_y
578583
df_new.loc[i, "cfp_ext"] = rx + cfp_time_x
579584
continue
580-
if ry + cfp_time_y == row["cfp_ext_x"]:
585+
if cfp_ext_x is not None and ry + cfp_time_y == cfp_ext_x:
581586
df_new.loc[i, "cfp"] = rx + cfp_time_x
582587
df_new.loc[i, "cfp_ext"] = ry + cfp_time_y
583588
continue

utils/tidy_conf/titles.py

Lines changed: 62 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -119,55 +119,88 @@ def expand_country_codes(name):
119119
return name
120120

121121

122-
def tidy_df_names(df):
123-
"""Tidy up the conference names in a consistent way.
124-
125-
Normalizes conference names by:
126-
1. Removing years from names
127-
2. Expanding country codes to full names (e.g., "PyCon PL" -> "PyCon Poland")
128-
3. Normalizing spacing and punctuation
129-
4. Applying known mappings from titles.yml
122+
def normalize_conference_name(name: str, known_mappings: dict | None = None) -> str:
123+
"""Normalize a single conference name to a canonical form.
124+
125+
This is the single source of truth for conference name normalization.
126+
Both DataFrame-level normalization (tidy_df_names) and individual lookups
127+
should use this function to ensure consistency.
128+
129+
Parameters
130+
----------
131+
name : str
132+
Conference name to normalize
133+
known_mappings : dict, optional
134+
Mapping of known variations to canonical names. If None, will be loaded.
135+
136+
Returns
137+
-------
138+
str
139+
Normalized conference name
130140
"""
131-
# Load known title mappings
132-
_, known_mappings = load_title_mappings(reverse=True)
141+
if not name or not isinstance(name, str):
142+
return name if isinstance(name, str) else ""
133143

134-
# Define regex patterns for matching years and conference names
135-
# Match years with or without leading space
144+
# Load known mappings if not provided
145+
if known_mappings is None:
146+
_, known_mappings = load_title_mappings(reverse=True)
147+
148+
# Define regex patterns (same as tidy_df_names)
136149
regex_year = re.compile(r"\b\s*(19|20)\d{2}\s*\b")
137150
regex_py = re.compile(r"\b(Python|PyCon)\b")
138151

139-
# Harmonize conference titles using known mappings and regex
140-
series = df["conference"].copy()
152+
result = name
141153

142-
# Remove years from conference names
143-
series = series.str.replace(regex_year, " ", regex=True)
154+
# Remove years from conference names (replace with space to avoid concatenation)
155+
result = regex_year.sub(" ", result)
144156

145157
# Add a space after Python or PyCon
146-
series = series.str.replace(regex_py, r" \1 ", regex=True)
158+
result = regex_py.sub(r" \1 ", result)
147159

148-
# Replace non-word characters
149-
series = series.str.replace(r"[\+]", " ", regex=True)
160+
# Replace non-word characters like +
161+
result = re.sub(r"[\+]", " ", result)
150162

151163
# Replace the word Conference
152-
series = series.str.replace(r"\bConf\b", "Conference", regex=True)
164+
result = re.sub(r"\bConf\b", "Conference", result)
153165

154166
# Remove extra spaces
155-
series = series.str.replace(r"\s+", " ", regex=True)
167+
result = re.sub(r"\s+", " ", result)
156168

157169
# Remove leading and trailing whitespace
158-
series = series.str.strip()
170+
result = result.strip()
159171

160-
# Expand country codes to full names BEFORE applying known mappings
161-
# This ensures "PyCon PL" becomes "PyCon Poland" which can then match
162-
series = series.apply(expand_country_codes)
172+
# Apply known mappings FIRST (mappings may contain unexpanded country codes)
173+
if result in known_mappings:
174+
result = known_mappings[result]
163175

164-
# Replace known mappings (from titles.yml)
165-
series = series.replace(known_mappings)
176+
# Expand country codes to full names AFTER mappings
177+
# This ensures idempotency: normalize(normalize(x)) == normalize(x)
178+
result = expand_country_codes(result)
166179

167180
# Final cleanup
168-
series = series.str.strip()
181+
result = result.strip()
182+
183+
return result
184+
185+
186+
def tidy_df_names(df):
187+
"""Tidy up the conference names in a consistent way.
188+
189+
Normalizes conference names by:
190+
1. Removing years from names
191+
2. Expanding country codes to full names (e.g., "PyCon PL" -> "PyCon Poland")
192+
3. Normalizing spacing and punctuation
193+
4. Applying known mappings from titles.yml
194+
195+
Uses normalize_conference_name() internally for consistency.
196+
"""
197+
# Load known title mappings once for efficiency
198+
_, known_mappings = load_title_mappings(reverse=True)
169199

200+
# Apply normalization using the shared function
170201
df = df.copy()
171-
df.loc[:, "conference"] = series
202+
df.loc[:, "conference"] = df["conference"].apply(
203+
lambda x: normalize_conference_name(x, known_mappings),
204+
)
172205

173206
return df

utils/tidy_conf/yaml.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ def update_title_mappings(data, path="utils/tidy_conf/data/titles.yml"):
192192
continue
193193
if key not in title_data["alt_name"]:
194194
title_data["alt_name"][key] = {"variations": []}
195+
# Ensure 'variations' key exists (may be missing in malformed data)
196+
if "variations" not in title_data["alt_name"][key]:
197+
title_data["alt_name"][key]["variations"] = []
195198
for value in values:
196199
if value not in title_data["alt_name"][key]["variations"]:
197200
title_data["alt_name"][key]["variations"].append(value)

0 commit comments

Comments (0)