From 43b97f1236bac9872805892c9f5a23e19e4af179 Mon Sep 17 00:00:00 2001
From: ftavakoli
Date: Wed, 4 Feb 2026 11:22:42 -0500
Subject: [PATCH 1/7] error fix: separated extraction and drop id

---
 .../configs/experiment_config_20k.yaml        | 119 ++++++++++++++++++
 .../attacks/ensemble/blending.py              |   2 +-
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 examples/ensemble_attack/configs/experiment_config_20k.yaml

diff --git a/examples/ensemble_attack/configs/experiment_config_20k.yaml b/examples/ensemble_attack/configs/experiment_config_20k.yaml
new file mode 100644
index 00000000..cc274644
--- /dev/null
+++ b/examples/ensemble_attack/configs/experiment_config_20k.yaml
@@ -0,0 +1,119 @@
+# Ensemble experiment configuration
+# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``).
+base_experiment_dir: /projects/midst-experiments/ensemble_attack/ablation/no_domias # Processed data and experiment artifacts will be stored under this directory.
+base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.
+
+# Training Pipeline Control
+pipeline:
+  run_data_processing: true # Set this to false if you have already saved the processed data
+  run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
+  run_metaclassifier_training: true
+
+target_model: # This is only used for testing the attack on a real target model.
+  target_model_directory: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/tabddpm_black_box/final
+  target_model_id: 61 # Will be overridden per SLURM array task
+  target_model_name: tabddpm_${target_model.target_model_id}
+  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/trans_synthetic.csv
+  challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
+  challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
+
+  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
+  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
+  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training the RMIA attack model in the testing phase.
+  # See select_challenge_data_for_training()'s docstring for more details.
+
+
+# Data paths
+data_paths:
+  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
+  population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
+  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
+  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)
+
+
+model_paths:
+  metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
+
+
+# Dataset specific information used for processing in this example
+data_processing_config:
+  midst_data_path: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/ # Used to collect the data (input)
+  # The following directories should exist and contain `tabddpm_{i} for i in folder_ranges`:
+  # population data: midst_data_path/population_attack_data_types_to_collect/population_splits
+  # challenge data: midst_data_path/challenge_attack_data_types_to_collect/challenge_splits
+  population_attack_data_types_to_collect:
+    [
+      "tabddpm_black_box",
+    ]
+  challenge_attack_data_types_to_collect:
+    [
+      "tabddpm_black_box",
+    ]
+  population_splits: ["train"] # Data splits to be collected for population data
+  challenge_splits: ["train", "final"] # Data splits to be collected for challenge points
+  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ # This is where the original attack's population data (800k) will be read from, mainly to be used by DOMIAS
+  # You can download this data from https://github.com/CRCHUM-CITADEL/ensemble-mia/blob/main/input/population/population_all_with_challenge.csv
+
+  # The column name in the data to be used for stratified splitting.
+  column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
+  folder_ranges: # Specify folder ranges for any of the mentioned splits.
+    train: [[1, 21]] # Folders to be used for train data collection in the experiments
+    final: [[61, 71], [101, 111]]
+  # File names in MIDST data directories.
+  single_table_train_data_file_name: "train_with_id.csv"
+  multi_table_train_data_file_name: "trans.csv"
+  challenge_data_file_name: "challenge_with_id.csv"
+  population_sample_size: 40000 # Population size is the total data that your attack has access to.
+  # In experiments, this is sampled out of all the collected training data in case the available data
+  # is more than this number. Note that half of this data is actually used for training; the other half
+  # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.
+  # TODO: make sure to consider this in experiments.
+
+# Training and data settings for shadow models (temporary, numbers subject to change)
+shadow_training:
+  # Data Config files path used for training a TabDDPM model
+  training_json_config_paths: # Config json files used for tabddpm training on the trans table
+    table_domain_file_path: ${base_data_config_dir}/trans_domain.json
+    dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json
+    tabddpm_training_config_path: ${base_data_config_dir}/trans.json
+  # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
+  # Also, training configs for each shadow model are created under shadow_models_data_path.
+ shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data + target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data + # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path) + # These paths are a result of running the shadow model training pipeline, specifically the + # train_three_sets_of_shadow_models in shadow_model_training.py + # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. + final_shadow_models_path: [ + "${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", + "${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", + ] + target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv + # Path to final shadow target's synthetic data (relative to target_model_output_path) + fine_tuning_config: + fine_tune_diffusion_iterations: 200000 # Original code: 200000 + fine_tune_classifier_iterations: 20000 # Original code: 20000 + pre_train_data_size: 60000 # Original code: 60000 + number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. + # Original code: 20000 + + +# Metaclassifier settings +metaclassifier: + # Data types json file is used for xgboost model training. + data_types_file_path: ${base_data_config_dir}/data_types.json + model_type: "xgb" + # Model training parameters + num_optuna_trials: 100 # Original code: 100 + num_kfolds: 5 + use_gpu: false + # Temporary. Might remove having an epoch parameter. + epochs: 1 + meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model + +attack_success_computation: + target_ids_to_test: [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110] # List of target model IDs to compute the attack success for. 
+ +# General settings +random_seed: 42 # Set to null for no seed, or an integer for a fixed seed diff --git a/src/midst_toolkit/attacks/ensemble/blending.py b/src/midst_toolkit/attacks/ensemble/blending.py index 2b7c194d..d6c221b7 100644 --- a/src/midst_toolkit/attacks/ensemble/blending.py +++ b/src/midst_toolkit/attacks/ensemble/blending.py @@ -251,6 +251,6 @@ def predict( score = None if y_test is not None: - score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predictions=probabilities, max_fpr=0.1) + score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predicted_membership=probabilities, fpr_threshold=0.1) return probabilities, score From a67a0cf7c3b44cba333c02252a3e6acde4b96922 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 12:19:44 -0500 Subject: [PATCH 2/7] minor fixes --- examples/ensemble_attack/test_attack_model.py | 31 +++++++++++-------- .../attacks/ensemble/blending.py | 4 ++- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index a4bd18cc..2d131346 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -18,6 +18,7 @@ from midst_toolkit.attacks.ensemble.data_utils import load_dataframe from midst_toolkit.common.logger import log from midst_toolkit.common.random import set_all_random_seeds +from midst_toolkit.models.clavaddpm.train import get_df_without_id class RmiaTrainingDataChoice(Enum): @@ -51,13 +52,15 @@ def save_results( f.write(f"TPR at FPR=0.1: {pred_score:.4f}\n") -def extract_and_drop_id_column( +def extract_the_main_id_column( data_frame: pd.DataFrame, data_types_file_path: Path, -) -> tuple[pd.DataFrame, pd.Series]: +) -> pd.Series: """ - Extracts IDs from the dataframe and drops the ID column. ID column is identified based on + Extracts and returns the main IDs from the dataframe. The main ID column is identified based on the data types JSON file with "id_column_name" key. + Main IDs are not repeated in the dataset. + For example, in the Berka dataset, "trans_id" is the main ID column, and "account_id" is not the main ID column. Args: data_frame: Input dataframe. @@ -74,14 +77,11 @@ def extract_and_drop_id_column( assert "id_column_name" in column_types, f"{data_types_file_path} must contain 'id_column_name' key." id_column_name = column_types["id_column_name"] - + # Make sure we have one main id column + assert isinstance(id_column_name, str), "Only one main id column should be identified." 
assert id_column_name in data_frame.columns, f"Dataframe must have {id_column_name} column" - data_trans_ids = data_frame[id_column_name] - - # Drop ID column from data - data_frame = data_frame.drop(columns=id_column_name) - return data_frame, data_trans_ids + return data_frame[id_column_name] def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> list[dict[str, list[Any]]]: @@ -183,7 +183,7 @@ def collect_challenge_and_train_data( midst_data_input_dir=targets_data_path, attack_types=challenge_attack_types, # For ensemble experiments, change to ``test`` for 10k, and change to ``final`` for 20k - split_folders=["test"], + split_folders=["final"], dataset="challenge", data_processing_config=data_processing_config, ) @@ -266,7 +266,7 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list df_challenge_experiment, df_master_train = collect_challenge_and_train_data( config.data_processing_config, processed_attack_data_path=Path(config.data_paths.processed_attack_data_path), - targets_data_path=Path(config.data_paths.midst_data_path), + targets_data_path=Path(config.data_processing_config.midst_data_path), ) # Load the challenge dataframe for training RMIA shadow models. rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice) @@ -357,8 +357,13 @@ def run_metaclassifier_testing( else: log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...") - # Extract and drop id columns from the test data - test_data, test_trans_ids = extract_and_drop_id_column(test_data, Path(config.metaclassifier.data_types_file_path)) + # Extract the main ID column's values from the test data + test_trans_ids = extract_the_main_id_column( + data_frame=test_data, + data_types_file_path=Path(config.metaclassifier.data_types_file_path), + ) + # Drop id columns from the test data. Berka has two id columns: "trans_id" and "account_id". + test_data = get_df_without_id(test_data) # 4) Initialize the attacker object, and assign the loaded metaclassifier to it. blending_attacker = BlendingPlusPlus( diff --git a/src/midst_toolkit/attacks/ensemble/blending.py b/src/midst_toolkit/attacks/ensemble/blending.py index d6c221b7..24104cc1 100644 --- a/src/midst_toolkit/attacks/ensemble/blending.py +++ b/src/midst_toolkit/attacks/ensemble/blending.py @@ -251,6 +251,8 @@ def predict( score = None if y_test is not None: - score = TprAtFpr.get_tpr_at_fpr(true_membership=y_test, predicted_membership=probabilities, fpr_threshold=0.1) + score = TprAtFpr.get_tpr_at_fpr( + true_membership=y_test, predicted_membership=probabilities, fpr_threshold=0.1 + ) return probabilities, score From 0ed74e934d58fb5d042cec2d01a37fc55a52aa50 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 12:21:41 -0500 Subject: [PATCH 3/7] deleted extra config --- .../configs/experiment_config_20k.yaml | 119 ------------------ 1 file changed, 119 deletions(-) delete mode 100644 examples/ensemble_attack/configs/experiment_config_20k.yaml diff --git a/examples/ensemble_attack/configs/experiment_config_20k.yaml b/examples/ensemble_attack/configs/experiment_config_20k.yaml deleted file mode 100644 index cc274644..00000000 --- a/examples/ensemble_attack/configs/experiment_config_20k.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# Ensemble experiment configuration -# This config can be used to run both the Ensemble attack training (``run_attack.py``) and testing phases (``test_attack_model.py``). 
-base_experiment_dir: /projects/midst-experiments/ensemble_attack/ablation/no_domias # Processed data and experiment artifacts will be stored under this directory.
-base_data_config_dir: examples/ensemble_attack/data_configs # Training and data type configs are saved under this directory.
-
-# Training Pipeline Control
-pipeline:
-  run_data_processing: true # Set this to false if you have already saved the processed data
-  run_shadow_model_training: true # Set this to false if shadow models are already trained and saved
-  run_metaclassifier_training: true
-
-target_model: # This is only used for testing the attack on a real target model.
-  target_model_directory: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/tabddpm_black_box/final
-  target_model_id: 61 # Will be overridden per SLURM array task
-  target_model_name: tabddpm_${target_model.target_model_id}
-  target_synthetic_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/trans_synthetic.csv
-  challenge_data_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_with_id.csv
-  challenge_label_path: ${target_model.target_model_directory}/${target_model.target_model_name}/challenge_label.csv
-
-  target_shadow_models_output_path: ${base_experiment_dir}/test_all_targets # Sub-directory to store test shadows and results
-  attack_probabilities_result_path: ${target_model.target_shadow_models_output_path}/test_probabilities/attack_model_${target_model.target_model_id}_proba
-  attack_rmia_shadow_training_data_choice: "combined" # Options: "combined", "only_challenge", "only_train". This determines which data to use for training the RMIA attack model in the testing phase.
-  # See select_challenge_data_for_training()'s docstring for more details.
-
-
-# Data paths
-data_paths:
-  processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase).
-  population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
-  processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
-  attack_evaluation_result_path: ${base_experiment_dir}/evaluation_results # Path where the attack (train phase) evaluation results will be stored (output)
-
-
-model_paths:
-  metaclassifier_model_path: ${base_experiment_dir}/trained_models # Path where the trained metaclassifier model will be saved
-
-
-# Dataset specific information used for processing in this example
-data_processing_config:
-  midst_data_path: /projects/midst-experiments/ensemble_attack/midst_data_all_attacks/ # Used to collect the data (input)
-  # The following directories should exist and contain `tabddpm_{i} for i in folder_ranges`:
-  # population data: midst_data_path/population_attack_data_types_to_collect/population_splits
-  # challenge data: midst_data_path/challenge_attack_data_types_to_collect/challenge_splits
-  population_attack_data_types_to_collect:
-    [
-      "tabddpm_black_box",
-    ]
-  challenge_attack_data_types_to_collect:
-    [
-      "tabddpm_black_box",
-    ]
-  population_splits: ["train"] # Data splits to be collected for population data
-  challenge_splits: ["train", "final"] # Data splits to be collected for challenge points
-  original_population_data_path: /projects/midst-experiments/ensemble_attack/competition/population_data/ # This is where the original attack's population data (800k) will be read from, mainly to be used by DOMIAS
-  # You can download this data from https://github.com/CRCHUM-CITADEL/ensemble-mia/blob/main/input/population/population_all_with_challenge.csv
-
-  # The column name in the data to be used for stratified splitting.
-  column_to_stratify: "trans_type" # Attention: This value is not documented in the original codebase.
-  folder_ranges: # Specify folder ranges for any of the mentioned splits.
-    train: [[1, 21]] # Folders to be used for train data collection in the experiments
-    final: [[61, 71], [101, 111]]
-  # File names in MIDST data directories.
-  single_table_train_data_file_name: "train_with_id.csv"
-  multi_table_train_data_file_name: "trans.csv"
-  challenge_data_file_name: "challenge_with_id.csv"
-  population_sample_size: 40000 # Population size is the total data that your attack has access to.
-  # In experiments, this is sampled out of all the collected training data in case the available data
-  # is more than this number. Note that half of this data is actually used for training; the other half
-  # is used for evaluation. For example, with 40k population size, only 20k is used for training the attack model.
-  # TODO: make sure to consider this in experiments.
-
-# Training and data settings for shadow models (temporary, numbers subject to change)
-shadow_training:
-  # Data Config files path used for training a TabDDPM model
-  training_json_config_paths: # Config json files used for tabddpm training on the trans table
-    table_domain_file_path: ${base_data_config_dir}/trans_domain.json
-    dataset_meta_file_path: ${base_data_config_dir}/dataset_meta.json
-    tabddpm_training_config_path: ${base_data_config_dir}/trans.json
-  # Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
-  # Also, training configs for each shadow model are created under shadow_models_data_path.
- shadow_models_output_path: ${base_experiment_dir}/shadow_models_and_data - target_model_output_path: ${base_experiment_dir}/shadow_target_model_and_data - # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path) - # These paths are a result of running the shadow model training pipeline, specifically the - # train_three_sets_of_shadow_models in shadow_model_training.py - # Each .pkl file contains the training data, trained model and training results for all shadow models in a list. - final_shadow_models_path: [ - "${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl", - "${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl", - "${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl", - ] - target_synthetic_data_path: ${shadow_training.target_model_output_path}/target_synthetic_data.csv - # Path to final shadow target's synthetic data (relative to target_model_output_path) - fine_tuning_config: - fine_tune_diffusion_iterations: 200000 # Original code: 200000 - fine_tune_classifier_iterations: 20000 # Original code: 20000 - pre_train_data_size: 60000 # Original code: 60000 - number_of_points_to_synthesize: 20000 # Number of synthetic data samples to be generated by shadow models. - # Original code: 20000 - - -# Metaclassifier settings -metaclassifier: - # Data types json file is used for xgboost model training. - data_types_file_path: ${base_data_config_dir}/data_types.json - model_type: "xgb" - # Model training parameters - num_optuna_trials: 100 # Original code: 100 - num_kfolds: 5 - use_gpu: false - # Temporary. Might remove having an epoch parameter. - epochs: 1 - meta_classifier_model_name: ${metaclassifier.model_type}_metaclassifier_model - -attack_success_computation: - target_ids_to_test: [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110] # List of target model IDs to compute the attack success for. - -# General settings -random_seed: 42 # Set to null for no seed, or an integer for a fixed seed From 007e33bdca100bf40fb611ecec0c3df3c3410206 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 12:25:31 -0500 Subject: [PATCH 4/7] midst_data_path goes under data_processing_config --- examples/ensemble_attack/configs/experiment_config.yaml | 6 +++++- examples/ensemble_attack/run_attack.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/ensemble_attack/configs/experiment_config.yaml b/examples/ensemble_attack/configs/experiment_config.yaml index fc50bec6..4216715c 100644 --- a/examples/ensemble_attack/configs/experiment_config.yaml +++ b/examples/ensemble_attack/configs/experiment_config.yaml @@ -25,7 +25,7 @@ target_model: # This is only used for testing the attack on a real target model. # Data paths data_paths: - midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input) as defined in data_processing_config + processed_base_data_dir: ${base_experiment_dir} # To save new processed data for training, or read from previously collected and processed data (testing phase). 
   population_path: ${data_paths.processed_base_data_dir}/population_data # Path where the collected population data will be stored (output/input)
   processed_attack_data_path: ${data_paths.processed_base_data_dir}/attack_data # Path where the processed attack real train and evaluation data is stored (output/input)
@@ -38,6 +38,10 @@ model_paths:
 
 # Dataset specific information used for processing in this example
 data_processing_config:
+  midst_data_path: /projects/midst-experiments/all_tabddpms/ # Used to collect the data (input)
+  # The following directories should exist and contain `tabddpm_{i} for i in folder_ranges`:
+  # population data: midst_data_path/population_attack_data_types_to_collect/population_splits
+  # challenge data: midst_data_path/challenge_attack_data_types_to_collect/challenge_splits
   population_attack_data_types_to_collect:
     [
       "tabddpm_trained_with_10k",
diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py
index c53a86dc..4e67fa50 100644
--- a/examples/ensemble_attack/run_attack.py
+++ b/examples/ensemble_attack/run_attack.py
@@ -38,7 +38,7 @@ def run_data_processing(config: DictConfig) -> None:
     log(INFO, "Running data processing pipeline...")
     # Collect the real data from the MIDST challenge resources.
     population_data = collect_population_data_ensemble(
-        midst_data_input_dir=Path(config.data_paths.midst_data_path),
+        midst_data_input_dir=Path(config.data_processing_config.midst_data_path),
         data_processing_config=config.data_processing_config,
         save_dir=Path(config.data_paths.population_path),
         base_population=original_population_data,

From 927eb4739a060f71f5f3a23017af9e93c1df8778 Mon Sep 17 00:00:00 2001
From: ftavakoli
Date: Wed, 4 Feb 2026 12:53:33 -0500
Subject: [PATCH 5/7] fixed a unit test

---
 tests/unit/attacks/ensemble/test_meta_classifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/attacks/ensemble/test_meta_classifier.py b/tests/unit/attacks/ensemble/test_meta_classifier.py
index c4dc0fe4..5e8154ec 100644
--- a/tests/unit/attacks/ensemble/test_meta_classifier.py
+++ b/tests/unit/attacks/ensemble/test_meta_classifier.py
@@ -359,7 +359,7 @@ def test_predict_flow(
 
     call_args = mock_get_tpr.call_args
     np.testing.assert_array_equal(call_args.kwargs["true_membership"], sample_dataframes["y_test"])
-    np.testing.assert_array_almost_equal(call_args.kwargs["predictions"], expected_probabilities)
-    np.testing.assert_equal(call_args.kwargs["max_fpr"], 0.1)
+    np.testing.assert_array_almost_equal(call_args.kwargs["predicted_membership"], expected_probabilities)
+    np.testing.assert_equal(call_args.kwargs["fpr_threshold"], 0.1)
 
     assert score == 0.99

From 697a592636bff7b41922119be78c1b158be1f18a Mon Sep 17 00:00:00 2001
From: ftavakoli
Date: Wed, 4 Feb 2026 13:19:47 -0500
Subject: [PATCH 6/7] coderabbitai comment: fix return type in docstring

---
 examples/ensemble_attack/test_attack_model.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py
index 2d131346..8b90df66 100644
--- a/examples/ensemble_attack/test_attack_model.py
+++ b/examples/ensemble_attack/test_attack_model.py
@@ -67,9 +67,7 @@ def extract_the_main_id_column(
         data_types_file_path: Path to the data types JSON file.
 
     Returns:
-        A tuple containing:
-        - The modified dataframe with ID columns dropped.
-        - A Series containing the extracted data of ID columns.
+        A Series containing the extracted data of the main ID column.
""" # Extract ID column from the dataframe with open(data_types_file_path, "r") as f: From d2351cda84dfa34c6ed12ce75810dbf9f5e46bd0 Mon Sep 17 00:00:00 2001 From: ftavakoli Date: Wed, 4 Feb 2026 14:41:59 -0500 Subject: [PATCH 7/7] A better name for the function --- examples/ensemble_attack/test_attack_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py index 8b90df66..910189bc 100644 --- a/examples/ensemble_attack/test_attack_model.py +++ b/examples/ensemble_attack/test_attack_model.py @@ -52,15 +52,15 @@ def save_results( f.write(f"TPR at FPR=0.1: {pred_score:.4f}\n") -def extract_the_main_id_column( +def extract_primary_id_column( data_frame: pd.DataFrame, data_types_file_path: Path, ) -> pd.Series: """ - Extracts and returns the main IDs from the dataframe. The main ID column is identified based on + Extracts and returns the primary IDs from the dataframe. The primary ID column is identified based on the data types JSON file with "id_column_name" key. - Main IDs are not repeated in the dataset. - For example, in the Berka dataset, "trans_id" is the main ID column, and "account_id" is not the main ID column. + primary IDs are unique keys in the dataset. + For example, in the Berka dataset, "trans_id" is the primary ID column, while "account_id" is not. Args: data_frame: Input dataframe. @@ -356,7 +356,7 @@ def run_metaclassifier_testing( log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...") # Extract the main ID column's values from the test data - test_trans_ids = extract_the_main_id_column( + test_trans_ids = extract_primary_id_column( data_frame=test_data, data_types_file_path=Path(config.metaclassifier.data_types_file_path), )