From 43b97f1236bac9872805892c9f5a23e19e4af179 Mon Sep 17 00:00:00 2001
From: ftavakoli
Date: Wed, 4 Feb 2026 11:22:42 -0500
Subject: [PATCH 1/7] error fix: separated extraction and drop id

---
 .../configs/experiment_config_20k.yaml        | 119 ++++++++++++++++++
 .../attacks/ensemble/blending.py              |   2 +-
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 examples/ensemble_attack/configs/experiment_config_20k.yaml

diff --git a/examples/ensemble_attack/configs/experiment_config_20k.yaml b/examples/ensemble_attack/configs/experiment_config_20k.yaml
new file mode 100644
index 00000000..cc274644
--- /dev/null
+++ b/examples/ensemble_attack/configs/experiment_config_20k.yaml
@@ -0,0 +1,119 @@
+# Ensemble experiment configuration
+# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
+base_experiment_dir: /projects/midst-experiments/ensemble_attack/ablation/no_domias # Processed data and experiment artifacts will be stored under this directory.
+base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.
+
+# Training Pipeline Control
+pipeline:
+  run_data_processing: true # Set this to false if you have already saved the processed data
+  run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
+  run_metaclassifier_training: true
+
+target_model: # This is only used for testing the attack on a real target model.
+  target_model_directory: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/tabddpm_black_box/final
+  target_model_id: 61 # Will be overridden per SLURM array task
+  target_model_name: tabddpm_${target_model.target_model_id}
+  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/trans_synthetic.csv
+  challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
+  challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
+
+  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
+  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
+  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training the RMIA attack model in the testing phase.
+  # See select_challenge_data_for_training()'s docstring for more details.
+
+
+# Data paths
+data_paths:
+  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
+  population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
+  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
+  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)
+
+
+model_paths:
+  metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
+
+
+# Dataset specific information used for processing in this example
+data_processing_config:
+  midst_data_path: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/ # Used to collect the data (input)
+  # The following directories should exist and contain `tabddpm_{i} for i in folder_ranges`:
+  # population data: midst_data_path/population_attack_data_types_to_collect/population_splits
+  # challenge data: midst_data_path/challenge_attack_data_types_to_collect/challenge_splits
+  population_attack_data_types_to_collect:
+    [
+      "tabddpm_black_box",
+    ]
+  challenge_attack_data_types_to_collect:
+    [
+      "tabddpm_black_box",
+    ]
+  population_splits: ["train"] # Data splits to be collected for population data
+  challenge_splits: ["train", "final"] # Data splits to be collected for challenge points
+  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ # This is where the original attack's population data (800k) will be read from, mainly to be used by DOMIAS
+  # You can download this data from https://github.com/CRCHUM-CITADEL/ensemble-mia/blob/main/input/population/population_all_with_challenge.csv
+
+  # The column name in the data to be used for stratified splitting.
+  column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
+  folder_ranges: # Specify folder ranges for any of the mentioned splits.
+    train: [[1, 21]] # Folders to be used for train data collection in the experiments
+    final: [[61, 71], [101, 111]]
+  # File names in MIDST data directories.
+  single_table_train_data_file_name: "train_with_id.csv"
+  multi_table_train_data_file_name: "trans.csv"
+  challenge_data_file_name: "challenge_with_id.csv"
+  population_sample_size: 40000 # Population size is the total data that your attack has access to.
+  # In experiments, this is sampled out of all the collected training data in case the available data
+  # is more than this number. Note that half of this data is actually used for training; the other half
+  # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.
+  # TODO: make sure to consider this in experiments.
+
+# Training and data settings for shadow models (temporary, numbers subject to change)
+shadow_training:
+  # Data Config files path used for training a TabDDPM model
+  training_json_config_paths: # Config json files used for tabddpm training on the trans table
+    table_domain_file_path: ${base_data_config_dir}/trans_domain.json
+    dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json
+    tabddpm_training_config_path: ${base_data_config_dir}/trans.json
+  # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
+  # Also, training configs for each shadow model are created under shadow_models_data_path.
+ shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data + target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data + # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path) + # These paths are a result of running the shadow model training pipeline, specifically the + # train_three_sets_of_shadow_models in shadow_model_training.py + # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. + final_shadow_models_path: [ + "${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", + ] + target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv + # Path to final shadow target's synthetic data (relative to target_model_output_path) + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 # Original code: 200000 + fine_tune_classifier_iterations: 20000 # Original code: 20000 + pre_train_data_size: 60000 # Original code: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. + # Original code: 20000 + + +# Metaclassifier settings +metaclassifier: + # Data types json file is used for xgboost model training. + data_types_file_path: ${base_data_config_dir}/data_types.json + model_type: "xgb" + # Model training parameters + num_optuna_trials: 100 # Original code: 100 + num_kfolds: 5 + use_gpu: false + # Temporary. Might remove having an epoch parameter. + epochs: 1 + meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model + +attack_success_computation: + target_ids_to_test: [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110] # List of target model IDs to compute the attack success for. 
+ +# General settings +random_seed: 42 # Set to null for no seed, or an integer for a fixed seed diff --git a/src/midst_toolkit/attacks/ensemble/blending.py b/src/midst_toolkit/attacks/ensemble/blending.py index 2b7c194d..d6c221b7 100644 --- a/src/midst_toolkit/attacks/ensemble/blending.py +++ b/src/midst_toolkit/attacks/ensemble/blending.py @@ -251,6 +251,6 @@ def predict( score = None if y_test is not None: - score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predictions=probabilities, max_fpr=0.1) + score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predicted_membership=probabilities, fpr_threshold=0.1) return probabilities, score From a67a0cf7c3b44cba333c02252a3e6acde4b96922 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 12:19:44 -0500 Subject: [PATCH 2/7] minor fixes --- examples/ensemble_attack/test_attack_model.py | 31 +++++++++++-------- .../attacks/ensemble/blending.py | 4 ++- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index a4bd18cc..2d131346 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -18,6 +18,7 @@ from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds +from midst_toolkit.models.clavaddpm.train import get_df_without_id class RmiaTrainingDataChoice(Enum): @@ -51,13 +52,15 @@ def save_results( f.write(f"TPR at FPR=0.1: {pred_score:.4f}\n") -def extract_and_drop_id_column( +def extract_the_main_id_column( data_frame: pd.DataFrame, data_types_file_path: Path, -) -> tuple[pd.DataFrame, pd.Series]: +) -> pd.Series: """ - Extracts IDs from the dataframe and drops the ID column. ID column is identified based on + Extracts and returns the main IDs from the dataframe. The main ID column is identified based on the data types JSON file with "id_column_name" key. + Main IDs are not repeated in the dataset. + For example, in the Berka dataset, "trans_id" is the main ID column, and "account_id" is not the main ID column. Args: data_frame: Input dataframe. @@ -74,14 +77,11 @@ def extract_and_drop_id_column( assert "id_column_name" in column_types, f"{data_types_file_path} must contain 'id_column_name' key." id_column_name = column_types["id_column_name"] - + # Make sure we have one main id column + assert isinstance(id_column_name, str), "Only one main id column should be identified." 
assert id_column_name in data_frame.columns, f"Dataframe must have {id_column_name} column" - data_trans_ids = data_frame[id_column_name] - - # Drop ID column from data - data_frame = data_frame.drop(columns=id_column_name) - return data_frame, data_trans_ids + return data_frame[id_column_name] def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> list[dict[str, list[Any]]]: @@ -183,7 +183,7 @@ def collect_challenge_and_train_data( midst_data_input_dir=targets_data_path, attack_types=challenge_attack_types, # For ensemble experiments, change to ``test`` for 10k, and change to ``final`` for 20k - split_folders=["test"], + split_folders=["final"], dataset="challenge", data_processing_config=data_processing_config, ) @@ -266,7 +266,7 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list df_challenge_experiment, df_master_train = collect_challenge_and_train_data( config.data_processing_config, processed_attack_data_path=Path(config.data_paths.processed_attack_data_path), - targets_data_path=Path(config.data_paths.midst_data_path), + targets_data_path=Path(config.data_processing_config.midst_data_path), ) # Load the challenge dataframe for training RMIA shadow models. rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice) @@ -357,8 +357,13 @@ def run_metaclassifier_testing( else: log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...") - # Extract and drop id columns from the test data - test_data, test_trans_ids = extract_and_drop_id_column(test_data, Path(config.metaclassifier.data_types_file_path)) + # Extract the main ID column's values from the test data + test_trans_ids = extract_the_main_id_column( + data_frame=test_data, + data_types_file_path=Path(config.metaclassifier.data_types_file_path), + ) + # Drop id columns from the test data. Berka has two id columns: "trans_id" and "account_id". + test_data = get_df_without_id(test_data) # 4) Initialize the attacker object, and assign the loaded metaclassifier to it. blending_attacker = BlendingPlusPlus( diff --git a/src/midst_toolkit/attacks/ensemble/blending.py b/src/midst_toolkit/attacks/ensemble/blending.py index d6c221b7..24104cc1 100644 --- a/src/midst_toolkit/attacks/ensemble/blending.py +++ b/src/midst_toolkit/attacks/ensemble/blending.py @@ -251,6 +251,8 @@ def predict( score = None if y_test is not None: - score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predicted_membership=probabilities, fpr_threshold=0.1) + score = TprAtFpr.get_tpr_at_fpr( + true_membership=y_test, predicted_membership=probabilities, fpr_threshold=0.1 + ) return probabilities, score From 0ed74e934d58fb5d042cec2d01a37fc55a52aa50 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 12:21:41 -0500 Subject: [PATCH 3/7] deleted extra config --- .../configs/experiment_config_20k.yaml | 119 ------------------ 1 file changed, 119 deletions(-) delete mode 100644 examples/ensemble_attack/configs/experiment_config_20k.yaml diff --git a/examples/ensemble_attack/configs/experiment_config_20k.yaml b/examples/ensemble_attack/configs/experiment_config_20k.yaml deleted file mode 100644 index cc274644..00000000 --- a/examples/ensemble_attack/configs/experiment_config_20k.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# Ensemble experiment configuration -# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``). 
-base_experiment_dir: /projects/midst-experiments/ensemble_attack/ablation/no_domias # Processed data and experiment artifacts will be stored under this directory.
-base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.
-
-# Training Pipeline Control
-pipeline:
-  run_data_processing: true # Set this to false if you have already saved the processed data
-  run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
-  run_metaclassifier_training: true
-
-target_model: # This is only used for testing the attack on a real target model.
-  target_model_directory: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/tabddpm_black_box/final
-  target_model_id: 61 # Will be overridden per SLURM array task
-  target_model_name: tabddpm_${target_model.target_model_id}
-  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/trans_synthetic.csv
-  challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
-  challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
-
-  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
-  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
-  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training the RMIA attack model in the testing phase.
-  # See select_challenge_data_for_training()'s docstring for more details.
-
-
-# Data paths
-data_paths:
-  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
-  population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
-  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
-  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)
-
-
-model_paths:
-  metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
-
-
-# Dataset specific information used for processing in this example
-data_processing_config:
-  midst_data_path: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/ # Used to collect the data (input)
-  # The following directories should exist and contain `tabddpm_{i} for i in folder_ranges`:
-  # population data: midst_data_path/population_attack_data_types_to_collect/population_splits
-  # challenge data: midst_data_path/challenge_attack_data_types_to_collect/challenge_splits
-  population_attack_data_types_to_collect:
-    [
-      "tabddpm_black_box",
-    ]
-  challenge_attack_data_types_to_collect:
-    [
-      "tabddpm_black_box",
-    ]
-  population_splits: ["train"] # Data splits to be collected for population data
-  challenge_splits: ["train", "final"] # Data splits to be collected for challenge points
-  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ # This is where the original attack's population data (800k) will be read from, mainly to be used by DOMIAS
-  # You can download this data from https://github.com/CRCHUM-CITADEL/ensemble-mia/blob/main/input/population/population_all_with_challenge.csv
-
-  # The column name in the data to be used for stratified splitting.
-  column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
-  folder_ranges: # Specify folder ranges for any of the mentioned splits.
-    train: [[1, 21]] # Folders to be used for train data collection in the experiments
-    final: [[61, 71], [101, 111]]
-  # File names in MIDST data directories.
-  single_table_train_data_file_name: "train_with_id.csv"
-  multi_table_train_data_file_name: "trans.csv"
-  challenge_data_file_name: "challenge_with_id.csv"
-  population_sample_size: 40000 # Population size is the total data that your attack has access to.
-  # In experiments, this is sampled out of all the collected training data in case the available data
-  # is more than this number. Note that half of this data is actually used for training; the other half
-  # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.
-  # TODO: make sure to consider this in experiments.
-
-# Training and data settings for shadow models (temporary, numbers subject to change)
-shadow_training:
-  # Data Config files path used for training a TabDDPM model
-  training_json_config_paths: # Config json files used for tabddpm training on the trans table
-    table_domain_file_path: ${base_data_config_dir}/trans_domain.json
-    dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json
-    tabddpm_training_config_path: ${base_data_config_dir}/trans.json
-  # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
-  # Also, training configs for each shadow model are created under shadow_models_data_path.
- shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data - target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data - # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path) - # These paths are a result of running the shadow model training pipeline, specifically the - # train_three_sets_of_shadow_models in shadow_model_training.py - # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. - final_shadow_models_path: [ - "${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", - "${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", - "${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", - ] - target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv - # Path to final shadow target's synthetic data (relative to target_model_output_path) - fine_tuning_config: - fine_tune_diffusion_iterations: 200000 # Original code: 200000 - fine_tune_classifier_iterations: 20000 # Original code: 20000 - pre_train_data_size: 60000 # Original code: 60000 - number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. - # Original code: 20000 - - -# Metaclassifier settings -metaclassifier: - # Data types json file is used for xgboost model training. - data_types_file_path: ${base_data_config_dir}/data_types.json - model_type: "xgb" - # Model training parameters - num_optuna_trials: 100 # Original code: 100 - num_kfolds: 5 - use_gpu: false - # Temporary. Might remove having an epoch parameter. - epochs: 1 - meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model - -attack_success_computation: - target_ids_to_test: [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110] # List of target model IDs to compute the attack success for. - -# General settings -random_seed: 42 # Set to null for no seed, or an integer for a fixed seed From 007e33bdca100bf40fb611ecec0c3df3c3410206 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 12:25:31 -0500 Subject: [PATCH 4/7] midst_data_path goes under data_processing_config --- examples/ensemble_attack/configs/experiment_config.yaml | 6 +++++- examples/ensemble_attack/run_attack.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml index fc50bec6..4216715c 100644 --- a/examples/ensemble_attack/configs/experiment_config.yaml +++ b/examples/ensemble_attack/configs/experiment_config.yaml @@ -25,7 +25,7 @@ target_model: # This is only used for testing the attack on a real target model. # Data paths data_paths: - midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input) as defined in data_processing_config + processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase). 
   population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
   processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
@@ -38,6 +38,10 @@ model_paths:
 
 # Dataset specific information used for processing in this example
 data_processing_config:
+  midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input)
+  # The following directories should exist and contain `tabddpm_{i} for i in folder_ranges`:
+  # population data: midst_data_path/population_attack_data_types_to_collect/population_splits
+  # challenge data: midst_data_path/challenge_attack_data_types_to_collect/challenge_splits
   population_attack_data_types_to_collect:
     [
       "tabddpm_trained_with_10k",
diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py
index c53a86dc..4e67fa50 100644
--- a/examples/ensemble_attack/run_attack.py
+++ b/examples/ensemble_attack/run_attack.py
@@ -38,7 +38,7 @@ def run_data_processing(config: DictConfig) -> None:
     log(INFO, "Running data processing pipeline...")
     # Collect the real data from the MIDST challenge resources.
     population_data = collect_population_data_ensemble(
-        midst_data_input_dir=Path(config.data_paths.midst_data_path),
+        midst_data_input_dir=Path(config.data_processing_config.midst_data_path),
         data_processing_config=config.data_processing_config,
         save_dir=Path(config.data_paths.population_path),
         base_population=original_population_data,

From 927eb4739a060f71f5f3a23017af9e93c1df8778 Mon Sep 17 00:00:00 2001
From: ftavakoli
Date: Wed, 4 Feb 2026 12:53:33 -0500
Subject: [PATCH 5/7] fixed a unit test

---
 tests/unit/attacks/ensemble/test_meta_classifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/attacks/ensemble/test_meta_classifier.py b/tests/unit/attacks/ensemble/test_meta_classifier.py
index c4dc0fe4..5e8154ec 100644
--- a/tests/unit/attacks/ensemble/test_meta_classifier.py
+++ b/tests/unit/attacks/ensemble/test_meta_classifier.py
@@ -359,7 +359,7 @@ def test_predict_flow(
 
     call_args = mock_get_tpr.call_args
     np.testing.assert_array_equal(call_args.kwargs["true_membership"], sample_dataframes["y_test"])
-    np.testing.assert_array_almost_equal(call_args.kwargs["predictions"], expected_probabilities)
-    np.testing.assert_equal(call_args.kwargs["max_fpr"], 0.1)
+    np.testing.assert_array_almost_equal(call_args.kwargs["predicted_membership"], expected_probabilities)
+    np.testing.assert_equal(call_args.kwargs["fpr_threshold"], 0.1)
 
     assert score == 0.99

From 697a592636bff7b41922119be78c1b158be1f18a Mon Sep 17 00:00:00 2001
From: ftavakoli
Date: Wed, 4 Feb 2026 13:19:47 -0500
Subject: [PATCH 6/7] coderabbitai comment: fix return type in docstring

---
 examples/ensemble_attack/test_attack_model.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py
index 2d131346..8b90df66 100644
--- a/examples/ensemble_attack/test_attack_model.py
+++ b/examples/ensemble_attack/test_attack_model.py
@@ -67,9 +67,7 @@ def extract_the_main_id_column(
         data_types_file_path: Path to the data types JSON file.
 
     Returns:
-        A tuple containing:
-        - The modified dataframe with ID columns dropped.
-        - A Series containing the extracted data of ID columns.
+        A Series containing the extracted data of the main ID column.
""" # Extract ID column from the dataframe with open(data_types_file_path, "r") as f: From d2351cda84dfa34c6ed12ce75810dbf9f5e46bd0 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 14:41:59 -0500 Subject: [PATCH 7/7] A better name for the function --- examples/ensemble_attack/test_attack_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 8b90df66..910189bc 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -52,15 +52,15 @@ def save_results( f.write(f"TPR at FPR=0.1: {pred_score:.4f}\n") -def extract_the_main_id_column( +def extract_primary_id_column( data_frame: pd.DataFrame, data_types_file_path: Path, ) -> pd.Series: """ - Extracts and returns the main IDs from the dataframe. The main ID column is identified based on + Extracts and returns the primary IDs from the dataframe. The primary ID column is identified based on the data types JSON file with "id_column_name" key. - Main IDs are not repeated in the dataset. - For example, in the Berka dataset, "trans_id" is the main ID column, and "account_id" is not the main ID column. + primary IDs are unique keys in the dataset. + For example, in the Berka dataset, "trans_id" is the primary ID column, while "account_id" is not. Args: data_frame: Input dataframe. @@ -356,7 +356,7 @@ def run_metaclassifier_testing( log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...") # Extract the main ID column's values from the test data - test_trans_ids = extract_the_main_id_column( + test_trans_ids = extract_primary_id_column( data_frame=test_data, data_types_file_path=Path(config.metaclassifier.data_types_file_path), )