From 026e190f5593dd4f882c5d518add7e3b9e1b4862 Mon Sep 17 00:00:00 2001 From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com> Date: Fri, 3 Apr 2026 13:59:05 +0200 Subject: [PATCH 1/4] New major version for adding mods by user --- ideeplc/__init__.py | 2 +- ideeplc/__main__.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ideeplc/__init__.py b/ideeplc/__init__.py index ae57bfe..9f2f44f 100644 --- a/ideeplc/__init__.py +++ b/ideeplc/__init__.py @@ -1,3 +1,3 @@ """iDeepLC: A deep Learning-based retention time predictor for unseen modified peptides with a novel encoding system""" -__version__ = "1.3.1" +__version__ = "1.4.0" diff --git a/ideeplc/__main__.py b/ideeplc/__main__.py index ca8db40..f9edb87 100644 --- a/ideeplc/__main__.py +++ b/ideeplc/__main__.py @@ -82,6 +82,15 @@ def _argument_parser() -> argparse.ArgumentParser: action="store_true", help="Flag to enable calibration of the model predictions.", ) + parser.add_argument( + "--mod-features", + type=str, + required=False, + help=( + "Optional CSV file with either raw modification rows (name, aa, smiles) " + "or standardized features (name, MolLogP_rdkit)." + ), + ) return parser From a76f0705141a568a558eef2c630f3d723cafe1cb Mon Sep 17 00:00:00 2001 From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com> Date: Fri, 3 Apr 2026 13:59:42 +0200 Subject: [PATCH 2/4] Create mod_features.py Build standardized modifications --- ideeplc/mod_features.py | 46 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 ideeplc/mod_features.py diff --git a/ideeplc/mod_features.py b/ideeplc/mod_features.py new file mode 100644 index 0000000..d700b61 --- /dev/null +++ b/ideeplc/mod_features.py @@ -0,0 +1,46 @@ +"""Build standardized modification features from a user CSV.""" + +import argparse +import logging + +from ideeplc.utilities import build_user_mod_feature_table + + +LOGGER = logging.getLogger(__name__) + + +def _argument_parser() -> argparse.ArgumentParser: + """Create the argument parser for the feature builder.""" + parser = argparse.ArgumentParser( + description=( + "Convert a CSV with columns name, aa, smiles into standardized modification features." + ) + ) + parser.add_argument( + "-i", + "--input", + type=str, + required=True, + help="Path to the CSV file containing user modifications.", + ) + parser.add_argument( + "-o", + "--output", + type=str, + default="user_mod_features_standardized.csv", + help="Output path for the standardized feature table.", + ) + return parser + + +def main(argv=None): + """Build standardized modification features from a raw user CSV.""" + parser = _argument_parser() + args = parser.parse_args(argv) + + feature_table = build_user_mod_feature_table(args.input, args.output) + LOGGER.info("Wrote %d modification feature rows to %s", len(feature_table), args.output) + + +if __name__ == "__main__": + main() \ No newline at end of file From 977bcf8f4b21125ad5877aed53698e2b4ff4bf04 Mon Sep 17 00:00:00 2001 From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com> Date: Fri, 3 Apr 2026 13:59:54 +0200 Subject: [PATCH 3/4] New test and readme --- README.md | 8 ++++++ pyproject.toml | 1 + tests/test_mod_features.py | 51 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 tests/test_mod_features.py diff --git a/README.md b/README.md index 57de61d..9f1944a 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,14 @@ ideeplc --input --save --calibrate ideeplc --input ./data/example_input/Hela_deeprt --save --finetune --calibrate ``` +#### Custom modification features +If you have new modification entries with columns `name`, `aa`, and `smiles`, you can generate a standardized feature table and then use it during prediction: + +```sh +ideeplc-mod-features --input user_mods.csv --output user_mod_features_standardized.csv +ideeplc --input peptide_file.csv --mod-features user_mod_features_standardized.csv +``` + For more detailed CLI usage, you can run: ```sh ideeplc --help diff --git a/pyproject.toml b/pyproject.toml index 409d94c..87cd0c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ version = {attr = "ideeplc.__version__"} [project.scripts] ideeplc = "ideeplc.__main__:main" +ideeplc-mod-features = "ideeplc.mod_features:main" [project.urls] GitHub = "https://github.com/CompOmics/iDeepLC" diff --git a/tests/test_mod_features.py b/tests/test_mod_features.py new file mode 100644 index 0000000..a6f9417 --- /dev/null +++ b/tests/test_mod_features.py @@ -0,0 +1,51 @@ +# Python +import pandas as pd + +from ideeplc.utilities import ( + MEAN_MOLLOGP, + STD_MOLLOGP, + build_user_mod_feature_table, + mod_chemical_features, +) + + +def test_build_user_mod_feature_table(tmp_path): + """Test building standardized modification features from a raw CSV.""" + input_csv = tmp_path / "user_mods.csv" + output_csv = tmp_path / "user_mod_features_standardized.csv" + + pd.DataFrame( + { + "name": ["CustomMod"], + "aa": ["K"], + "smiles": ["CCO"], + } + ).to_csv(input_csv, index=False) + + feature_table = build_user_mod_feature_table( + str(input_csv), + str(output_csv), + compute_mollogp_fn=lambda smiles: 1.0, + ) + + assert output_csv.exists() + assert list(feature_table["name"]) == ["CustomMod#K"] + expected = (1.0 - MEAN_MOLLOGP) / STD_MOLLOGP + assert feature_table.iloc[0]["MolLogP_rdkit"] == expected + + +def test_mod_chemical_features_merges_user_table(tmp_path): + """Test that a user feature table is merged into the built-in dictionary.""" + user_feature_csv = tmp_path / "custom_features.csv" + pd.DataFrame( + { + "name": ["CustomMod#K"], + "MolLogP_rdkit": [1.23], + } + ).to_csv(user_feature_csv, index=False) + + mod_dict = mod_chemical_features(user_mods_csv=str(user_feature_csv)) + + assert "CustomMod" in mod_dict + assert "K" in mod_dict["CustomMod"] + assert mod_dict["CustomMod"]["K"]["MolLogP_rdkit"] == 1.23 From 4b153822d27db489acad54ccd03f1a9f8b3a00bc Mon Sep 17 00:00:00 2001 From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com> Date: Fri, 3 Apr 2026 14:00:19 +0200 Subject: [PATCH 4/4] Modified files to add the new feature of added mods by user --- ideeplc/data_initialize.py | 96 ++++++++++++++++++++++++- ideeplc/ideeplc_core.py | 18 ++++- ideeplc/predict.py | 121 ++++++++++++++++++++++++++++++-- ideeplc/utilities.py | 140 ++++++++++++++++++++++++++++++++++--- 4 files changed, 357 insertions(+), 18 deletions(-) diff --git a/ideeplc/data_initialize.py b/ideeplc/data_initialize.py index 02d578a..11cbb9e 100644 --- a/ideeplc/data_initialize.py +++ b/ideeplc/data_initialize.py @@ -64,8 +64,11 @@ def data_initialize( f"Loaded and reformed {len(reformed_peptides)} peptides sequences from the file." ) try: - # Convert sequences to matrix format - sequences, tr, errors = df_to_matrix(reformed_peptides, df) + sequences, tr, errors = df_to_matrix( + reformed_peptides, + df, + mod_features_csv=kwargs.get("mod_features_csv"), + ) except Exception as e: LOGGER.error(f"Error converting sequences to matrix format: {e}") raise @@ -82,3 +85,92 @@ def data_initialize( break LOGGER.info(f"Dataset initialized with data shape {x_shape}.") return prediction_dataset, x_shape + + +def data_initialize_chunked( + csv_path: str, chunk_size: int = 10000, **kwargs +) -> Iterator[Tuple[pd.DataFrame, MyDataset, np.ndarray]]: + """ + Initialize peptide matrices from a CSV file in chunks. + + :param csv_path: Path to the CSV file containing raw peptide sequences. + :param chunk_size: Number of rows to load per chunk. + :return: Iterator yielding dataframe chunk, dataset chunk, and x_shape. + """ + LOGGER.info(f"Loading peptides from {csv_path} in chunks of {chunk_size}") + + try: + chunk_iter = pd.read_csv(csv_path, chunksize=chunk_size) + except FileNotFoundError: + LOGGER.error(f"File {csv_path} not found.") + raise + except pd.errors.EmptyDataError: + LOGGER.error(f"File {csv_path} is empty.") + raise + except Exception as e: + LOGGER.error(f"Error reading {csv_path}: {e}") + raise + + for chunk_idx, df in enumerate(chunk_iter, start=1): + if "seq" not in df.columns: + LOGGER.error("CSV file must contain a 'seq' column with peptide sequences.") + raise ValueError("Missing 'seq' column in the CSV file.") + if "modifications" not in df.columns: + LOGGER.error( + "CSV file must contain a 'modifications' column with peptide modifications." + ) + raise ValueError("Missing 'modifications' column in the CSV file.") + if "tr" not in df.columns: + LOGGER.error("CSV file must contain a 'tr' column with retention times.") + raise ValueError("Missing 'tr' column in the CSV file.") + + reformed_peptides = [ + reform_seq(seq, mod) for seq, mod in zip(df["seq"], df["modifications"]) + ] + LOGGER.info( + f"Chunk {chunk_idx}: loaded and reformed {len(reformed_peptides)} peptides sequences." + ) + + try: + sequences, tr, errors = df_to_matrix( + reformed_peptides, + df, + mod_features_csv=kwargs.get("mod_features_csv"), + ) + except Exception as e: + LOGGER.error( + f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}" + ) + raise + + if errors: + LOGGER.warning(f"Errors encountered during conversion in chunk {chunk_idx}: {errors}") + + prediction_dataset = MyDataset(sequences, tr) + + if len(prediction_dataset) == 0: + LOGGER.warning(f"Chunk {chunk_idx} contains no valid peptide entries.") + continue + + # Keep historical x_shape contract expected by model/tests: (batch, channels, length) + x_shape = (1,) + prediction_dataset[0][0].shape + LOGGER.info(f"Chunk {chunk_idx} initialized with data shape {x_shape}.") + yield df, prediction_dataset, x_shape + + +def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000, **kwargs): + """ + Get the input shape from the first valid chunk of a CSV file. + + :param csv_path: Path to the CSV file containing raw peptide sequences. + :param chunk_size: Number of rows to load per chunk. + :return: x_shape for model initialization. + """ + for _, dataset_chunk, x_shape in data_initialize_chunked( + csv_path=csv_path, chunk_size=chunk_size, **kwargs + ): + LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}") + return x_shape + + LOGGER.error("No valid chunks found in the input file.") + raise ValueError("No valid chunks found in the input file.") \ No newline at end of file diff --git a/ideeplc/ideeplc_core.py b/ideeplc/ideeplc_core.py index 958f2b2..62fd45d 100644 --- a/ideeplc/ideeplc_core.py +++ b/ideeplc/ideeplc_core.py @@ -74,7 +74,14 @@ def main(args): # Initialize data LOGGER.info(f"Loading data from {args.input}") - matrix_input, x_shape = data_initialize(csv_path=args.input) + + # For model initialization, only inspect the first valid chunk + x_shape = get_input_shape_from_first_chunk( + csv_path=args.input, + chunk_size=chunk_size, + mod_features_csv=getattr(args, "mod_features", None), + ) + # Initialize model LOGGER.info("Initializing model") model = MyNet(x_shape=x_shape, config=config).to(device) @@ -96,6 +103,12 @@ def main(args): if args.finetune: LOGGER.info("Fine-tuning the model") + + matrix_input, _ = data_initialize( + csv_path=args.input, + mod_features_csv=getattr(args, "mod_features", None), + ) + fine_tuner = iDeepLCFineTuner( model=model, train_data=matrix_input, @@ -123,6 +136,9 @@ def main(args): calibrate=args.calibrate, input_file=args.input, save_results=args.save, + batch_size=batch_size, + chunk_size=chunk_size, + mod_features_csv=getattr(args, "mod_features", None), ) LOGGER.info("Prediction completed.") # Generate Figures diff --git a/ideeplc/predict.py b/ideeplc/predict.py index bc90455..b5bc788 100644 --- a/ideeplc/predict.py +++ b/ideeplc/predict.py @@ -58,6 +58,10 @@ def predict( input_file: str, calibrate: bool, save_results: bool, + batch_size: int = None, + chunk_size: int = 10000, + dataloader_input: DataLoader = None, + mod_features_csv: str = None, ): """ Load a trained model and evaluate it on test datasets. @@ -71,13 +75,120 @@ def predict( :param save_results: If True, saves the evaluation results. :return: Loss, correlation, predictions, and ground truth values. """ - LOGGER.info("Starting prediction process.") + LOGGER.info( + f"Starting prediction process with batch size {batch_size} and chunk size {chunk_size}." + ) + + all_predictions = [] + all_ground_truth = [] + total_loss = 0.0 + total_samples = 0 + + calibrated_preds = None + + timestamp = datetime.datetime.now().strftime("%Y%m%d") + input_file_name = os.path.splitext(os.path.basename(input_file))[0] + output_path = ( + Path("ideeplc_output") / f"{input_file_name}_predictions_{timestamp}.csv" + ) try: - # Validate on the primary test set - loss, correlation, predictions, ground_truth = validate( - model, dataloader_input, loss_fn, device - ) + if dataloader_input is not None: + LOGGER.info("Using provided dataloader_input for prediction.") + loss, correlation, all_predictions, all_ground_truth = validate( + model=model, + dataloader=dataloader_input, + loss_fn=loss_fn, + device=device, + ) + + if calibrate: + LOGGER.info("Fitting calibration model.") + calibration_model = SplineTransformerCalibration() + calibration_model.fit(all_ground_truth, all_predictions) + calibrated_preds = calibration_model.transform(all_predictions) + + if len(calibrated_preds) > 1 and len(all_ground_truth) > 1: + correlation = np.corrcoef(calibrated_preds, all_ground_truth)[0, 1] + else: + correlation = np.nan + + loss_calibrated = loss_fn( + torch.tensor(calibrated_preds).float().view(-1, 1), + torch.tensor(all_ground_truth).float().view(-1, 1), + ) + loss = loss_calibrated.item() + return loss, correlation, calibrated_preds, all_ground_truth + + return loss, correlation, all_predictions, all_ground_truth + + if batch_size is None: + raise ValueError("batch_size must be provided when dataloader_input is not used.") + + if save_results: + output_path.parent.mkdir(parents=True, exist_ok=True) + if output_path.exists(): + output_path.unlink() + + for chunk_idx, (df_chunk, dataset_chunk, x_shape) in enumerate( + data_initialize_chunked( + csv_path=input_file, + chunk_size=chunk_size, + mod_features_csv=mod_features_csv, + ), + start=1, + ): + LOGGER.info( + f"Processing chunk {chunk_idx} with {len(dataset_chunk)} entries and shape {x_shape}." + ) + + dataloader_input = DataLoader( + dataset_chunk, + batch_size=batch_size, + shuffle=False, + ) + + chunk_loss, _, chunk_predictions, chunk_ground_truth = validate( + model=model, + dataloader=dataloader_input, + loss_fn=loss_fn, + device=device, + ) + + n_chunk = len(dataset_chunk) + total_loss += chunk_loss * n_chunk + total_samples += n_chunk + + all_predictions.extend(chunk_predictions) + all_ground_truth.extend(chunk_ground_truth) + + if save_results: + result_data = { + "sequences": df_chunk.get("seq", None), + "modifications": df_chunk.get("modifications", None), + "ground_truth": chunk_ground_truth, + "predictions": chunk_predictions, + } + + result_df = pd.DataFrame(result_data) + result_df.to_csv( + output_path, + mode="a", + index=False, + header=not output_path.exists(), + ) + LOGGER.info(f"Chunk {chunk_idx} results appended to {output_path}") + + if total_samples == 0: + LOGGER.error("No valid samples were processed during prediction.") + raise ValueError("No valid samples were processed during prediction.") + + loss = total_loss / total_samples + + if len(all_predictions) > 1 and len(all_ground_truth) > 1: + correlation = np.corrcoef(all_predictions, all_ground_truth)[0, 1] + else: + correlation = np.nan if calibrate: LOGGER.info("Fitting calibration model.") diff --git a/ideeplc/utilities.py b/ideeplc/utilities.py index aecc7ea..6c9307a 100644 --- a/ideeplc/utilities.py +++ b/ideeplc/utilities.py @@ -1,5 +1,7 @@ import io -from typing import List, Tuple, Dict, Union, Optional, Any +import logging +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -9,6 +11,12 @@ import ideeplc.structure_feature +LOGGER = logging.getLogger(__name__) + +MEAN_MOLLOGP = -0.5262240476190472 +STD_MOLLOGP = 0.7546071397979358 + + class Config: """ Configuration class for the encoding of peptides. @@ -76,22 +84,132 @@ def aa_chemical_feature() -> Dict[str, np.ndarray]: return features_arrays -def mod_chemical_features() -> Dict[str, Dict[str, Dict[str, float]]]: - """Get modification features.""" - content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes() - df = pd.read_csv(io.BytesIO(content)) - # Convert the dataframe to a dictionary and transpose it +def standardize(value: float, mean: float, std: float) -> float: + """Apply standardization.""" + return (value - mean) / std + + +def compute_mollogp(smiles: str) -> Optional[float]: + """Compute RDKit MolLogP from a SMILES string.""" + try: + from rdkit import Chem + from rdkit.Chem import Crippen + except ImportError as exc: + raise ImportError( + "rdkit is required to build modification features from raw SMILES input." + ) from exc + + mol = Chem.MolFromSmiles(smiles) + if mol is None: + return None + return Crippen.MolLogP(mol) + + +def build_user_mod_feature_table( + input_csv: str, + output_csv: Optional[str] = None, + compute_mollogp_fn: Optional[Callable[[str], Optional[float]]] = None, +) -> pd.DataFrame: + """Build a standardized modification feature table from a raw user CSV.""" + df = pd.read_csv(input_csv) + required_cols = {"name", "aa", "smiles"} + if not required_cols.issubset(df.columns): + raise ValueError(f"Input file must contain columns: {required_cols}") + + compute_fn = compute_mollogp_fn or compute_mollogp + results = [] + + for _, row in df.iterrows(): + name = f"{row['name']}#{row['aa']}" + smiles = row["smiles"] + + mollogp = compute_fn(smiles) + if mollogp is None: + LOGGER.warning("Skipping invalid SMILES for %s", name) + continue + + results.append( + { + "name": name, + "MolLogP_rdkit": standardize(mollogp, MEAN_MOLLOGP, STD_MOLLOGP), + } + ) + + if not results: + raise ValueError("No valid modification rows were found in the input file.") + + df_out = pd.DataFrame(results) + if output_csv: + Path(output_csv).parent.mkdir(parents=True, exist_ok=True) + df_out.to_csv(output_csv, index=False) + + return df_out + + +def _load_mod_feature_table(csv_path: str) -> pd.DataFrame: + """Load either a standardized feature table or a raw user-mod table.""" + df = pd.read_csv(csv_path) + + if {"name", "MolLogP_rdkit"}.issubset(df.columns): + feature_df = df.loc[:, ["name", "MolLogP_rdkit"]].copy() + elif {"name", "aa", "smiles"}.issubset(df.columns): + feature_df = build_user_mod_feature_table(csv_path) + else: + raise ValueError( + "Modification CSV must contain either ['name', 'MolLogP_rdkit'] or ['name', 'aa', 'smiles'] columns." + ) + + feature_df = feature_df.dropna(subset=["name", "MolLogP_rdkit"]) + feature_df["name"] = feature_df["name"].astype(str) + feature_df["MolLogP_rdkit"] = feature_df["MolLogP_rdkit"].astype(float) + feature_df = feature_df.drop_duplicates(subset=["name"], keep="last") + return feature_df + + +def _merge_mod_feature_tables( + base_df: pd.DataFrame, extra_df: pd.DataFrame +) -> pd.DataFrame: + """Merge the built-in and user-provided modification feature tables.""" + combined = base_df.copy() + combined.update(extra_df) + + new_rows = extra_df.loc[~extra_df.index.isin(combined.index)] + if not new_rows.empty: + combined = pd.concat([combined, new_rows]) + + return combined + + +def _mod_feature_table_to_dict( + df: pd.DataFrame, +) -> Dict[str, Dict[str, Dict[str, float]]]: + """Convert a feature table to the nested modification dictionary format.""" df = df.set_index("name").T - # Convert the DataFrame to a dictionary of modifications with their chemical features modified = df.to_dict("list") dic = {} for key, values in modified.items(): main_key, sub_key = key.split("#") - # Create a nested dictionary with the modification name and the amino acid dic.setdefault(main_key, {})[sub_key] = dict(zip(df.index, values)) return dic +def mod_chemical_features( + user_mods_csv: Optional[str] = None, +) -> Dict[str, Dict[str, Dict[str, float]]]: + """Get modification features, optionally merged with user-provided modifications.""" + content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes() + base_df = pd.read_csv(io.BytesIO(content)) + base_df = base_df.loc[:, ["name", "MolLogP_rdkit"]].copy() + + if user_mods_csv: + extra_df = _load_mod_feature_table(user_mods_csv) + base_df = _merge_mod_feature_tables( + base_df.set_index("name"), extra_df.set_index("name") + ).reset_index() + + return _mod_feature_table_to_dict(base_df) + + def peptide_parser(peptide: str) -> Tuple: """Parse the peptide sequence and modifications.""" modifications = [] @@ -270,7 +388,9 @@ def encode_sequence_one_hot(sequence: str) -> np.ndarray: def df_to_matrix( - seqs: Union[str, List[str]], df: Optional[pd.DataFrame] = None + seqs: Union[str, List[str]], + df: Optional[pd.DataFrame] = None, + mod_features_csv: Optional[str] = None, ) -> ( tuple[ndarray, list[Any], list[list[str | list[str] | int | Exception]]] | ndarray @@ -311,7 +431,7 @@ def df_to_matrix( seqs_encoded = [] tr = [] errors = [] - modifications_dict = mod_chemical_features() + modifications_dict = mod_chemical_features(user_mods_csv=mod_features_csv) aa_to_feature = aa_chemical_feature() amino_acids_atoms = aa_atomic_composition_array()