diff --git a/README.md b/README.md
index 57de61d..9f1944a 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,14 @@
 ideeplc --input --save --calibrate
 ideeplc --input ./data/example_input/Hela_deeprt --save --finetune --calibrate
 ```
+#### Custom modification features
+If you have new modification entries with columns `name`, `aa`, and `smiles`, you can generate a standardized feature table and then use it during prediction:
+
+```sh
+ideeplc-mod-features --input user_mods.csv --output user_mod_features_standardized.csv
+ideeplc --input peptide_file.csv --mod-features user_mod_features_standardized.csv
+```
+
 For more detailed CLI usage, you can run:
 ```sh
 ideeplc --help
diff --git a/ideeplc/__init__.py b/ideeplc/__init__.py
index 7f86cee..b34a390 100644
--- a/ideeplc/__init__.py
+++ b/ideeplc/__init__.py
@@ -1,3 +1,3 @@
 """iDeepLC: A deep Learning-based retention time predictor for unseen modified peptides with a novel encoding system"""
 
-__version__ = "1.3.2"
+__version__ = "1.4.0"
diff --git a/ideeplc/__main__.py b/ideeplc/__main__.py
index ca8db40..f9edb87 100644
--- a/ideeplc/__main__.py
+++ b/ideeplc/__main__.py
@@ -82,6 +82,15 @@ def _argument_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Flag to enable calibration of the model predictions.",
     )
+    parser.add_argument(
+        "--mod-features",
+        type=str,
+        required=False,
+        help=(
+            "Optional CSV file with either raw modification rows (name, aa, smiles) "
+            "or standardized features (name, MolLogP_rdkit)."
+        ),
+    )
     return parser
 
 
diff --git a/ideeplc/data_initialize.py b/ideeplc/data_initialize.py
index 80b7713..5b29466 100644
--- a/ideeplc/data_initialize.py
+++ b/ideeplc/data_initialize.py
@@ -63,7 +63,12 @@
     )
 
     try:
-        sequences, tr, errors = df_to_matrix(reformed_peptides, df)
+        sequences, tr, errors = df_to_matrix(
+            reformed_peptides,
+            df,
+            mod_features_csv=kwargs.get("mod_features_csv"),
+        )
+
     except Exception as e:
         LOGGER.error(f"Error converting sequences to matrix format: {e}")
         raise
@@ -128,7 +133,12 @@
     )
 
     try:
-        sequences, tr, errors = df_to_matrix(reformed_peptides, df)
+        sequences, tr, errors = df_to_matrix(
+            reformed_peptides,
+            df,
+            mod_features_csv=kwargs.get("mod_features_csv"),
+        )
+
     except Exception as e:
         LOGGER.error(
             f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}"
@@ -150,7 +160,9 @@
         yield df, prediction_dataset, x_shape
 
 
-def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000):
+
+def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000, **kwargs):
+
     """
     Get the input shape from the first valid chunk of a CSV file.
 
@@ -159,7 +171,9 @@
     :return: x_shape for model initialization.
     """
     for _, dataset_chunk, x_shape in data_initialize_chunked(
-        csv_path=csv_path, chunk_size=chunk_size
+
+        csv_path=csv_path, chunk_size=chunk_size, **kwargs
+
     ):
         LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}")
         return x_shape
diff --git a/ideeplc/ideeplc_core.py b/ideeplc/ideeplc_core.py
index 4639183..237354f 100644
--- a/ideeplc/ideeplc_core.py
+++ b/ideeplc/ideeplc_core.py
@@ -75,7 +75,11 @@
 
     # For model initialization, only inspect the first valid chunk
     x_shape = get_input_shape_from_first_chunk(
-        csv_path=args.input, chunk_size=chunk_size
+
+        csv_path=args.input,
+        chunk_size=chunk_size,
+        mod_features_csv=getattr(args, "mod_features", None),
+
     )
 
     # Initialize model
@@ -101,7 +105,12 @@
 
     if args.finetune:
         LOGGER.info("Fine-tuning the model")
-        matrix_input, _ = data_initialize(csv_path=args.input)
+
+        matrix_input, _ = data_initialize(
+            csv_path=args.input,
+            mod_features_csv=getattr(args, "mod_features", None),
+        )
+
 
         fine_tuner = iDeepLCFineTuner(
             model=model,
@@ -129,6 +138,8 @@
         save_results=args.save,
         batch_size=batch_size,
         chunk_size=chunk_size,
+        mod_features_csv=getattr(args, "mod_features", None),
+
     )
 
     LOGGER.info("Prediction completed.")
diff --git a/ideeplc/mod_features.py b/ideeplc/mod_features.py
new file mode 100644
index 0000000..d700b61
--- /dev/null
+++ b/ideeplc/mod_features.py
@@ -0,0 +1,46 @@
+"""Build standardized modification features from a user CSV."""
+
+import argparse
+import logging
+
+from ideeplc.utilities import build_user_mod_feature_table
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _argument_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for the feature builder."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Convert a CSV with columns name, aa, smiles into standardized modification features."
+        )
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        type=str,
+        required=True,
+        help="Path to the CSV file containing user modifications.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default="user_mod_features_standardized.csv",
+        help="Output path for the standardized feature table.",
+    )
+    return parser
+
+
+def main(argv=None):
+    """Build standardized modification features from a raw user CSV."""
+    parser = _argument_parser()
+    args = parser.parse_args(argv)
+
+    feature_table = build_user_mod_feature_table(args.input, args.output)
+    LOGGER.info("Wrote %d modification feature rows to %s", len(feature_table), args.output)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ideeplc/predict.py b/ideeplc/predict.py
index 6ca01bb..799225c 100644
--- a/ideeplc/predict.py
+++ b/ideeplc/predict.py
@@ -67,6 +67,8 @@ def predict(
     batch_size: int = None,
     chunk_size: int = 10000,
     dataloader_input: DataLoader = None,
+    mod_features_csv: str = None,
+
 ):
     """
     Load a trained model and evaluate it on test datasets in chunks.
@@ -138,8 +140,12 @@
             output_path.unlink()
 
     for chunk_idx, (df_chunk, dataset_chunk, x_shape) in enumerate(
-        data_initialize_chunked(csv_path=input_file, chunk_size=chunk_size),
-        start=1,
+        data_initialize_chunked(
+            csv_path=input_file,
+            chunk_size=chunk_size,
+            mod_features_csv=mod_features_csv,
+        ),
+        start=1,
     ):
         LOGGER.info(
             f"Processing chunk {chunk_idx} with {len(dataset_chunk)} entries and shape {x_shape}."
diff --git a/ideeplc/utilities.py b/ideeplc/utilities.py
index 3f7e08a..2166d3d 100644
--- a/ideeplc/utilities.py
+++ b/ideeplc/utilities.py
@@ -1,5 +1,7 @@
 import io
-from typing import List, Tuple, Dict, Union, Optional, Any
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -9,6 +11,12 @@
 import ideeplc.structure_feature
 
 
+LOGGER = logging.getLogger(__name__)
+
+MEAN_MOLLOGP = -0.5262240476190472
+STD_MOLLOGP = 0.7546071397979358
+
+
 class Config:
     """
     Configuration class for the encoding of peptides.
@@ -76,22 +84,132 @@ def aa_chemical_feature() -> Dict[str, np.ndarray]:
     return features_arrays
 
 
-def mod_chemical_features() -> Dict[str, Dict[str, Dict[str, float]]]:
-    """Get modification features."""
-    content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
-    df = pd.read_csv(io.BytesIO(content))
-    # Convert the dataframe to a dictionary and transpose it
+def standardize(value: float, mean: float, std: float) -> float:
+    """Apply standardization."""
+    return (value - mean) / std
+
+
+def compute_mollogp(smiles: str) -> Optional[float]:
+    """Compute RDKit MolLogP from a SMILES string."""
+    try:
+        from rdkit import Chem
+        from rdkit.Chem import Crippen
+    except ImportError as exc:
+        raise ImportError(
+            "rdkit is required to build modification features from raw SMILES input."
+        ) from exc
+
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+    return Crippen.MolLogP(mol)
+
+
+def build_user_mod_feature_table(
+    input_csv: str,
+    output_csv: Optional[str] = None,
+    compute_mollogp_fn: Optional[Callable[[str], Optional[float]]] = None,
+) -> pd.DataFrame:
+    """Build a standardized modification feature table from a raw user CSV."""
+    df = pd.read_csv(input_csv)
+    required_cols = {"name", "aa", "smiles"}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Input file must contain columns: {required_cols}")
+
+    compute_fn = compute_mollogp_fn or compute_mollogp
+    results = []
+
+    for _, row in df.iterrows():
+        name = f"{row['name']}#{row['aa']}"
+        smiles = row["smiles"]
+
+        mollogp = compute_fn(smiles)
+        if mollogp is None:
+            LOGGER.warning("Skipping invalid SMILES for %s", name)
+            continue
+
+        results.append(
+            {
+                "name": name,
+                "MolLogP_rdkit": standardize(mollogp, MEAN_MOLLOGP, STD_MOLLOGP),
+            }
+        )
+
+    if not results:
+        raise ValueError("No valid modification rows were found in the input file.")
+
+    df_out = pd.DataFrame(results)
+    if output_csv:
+        Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
+        df_out.to_csv(output_csv, index=False)
+
+    return df_out
+
+
+def _load_mod_feature_table(csv_path: str) -> pd.DataFrame:
+    """Load either a standardized feature table or a raw user-mod table."""
+    df = pd.read_csv(csv_path)
+
+    if {"name", "MolLogP_rdkit"}.issubset(df.columns):
+        feature_df = df.loc[:, ["name", "MolLogP_rdkit"]].copy()
+    elif {"name", "aa", "smiles"}.issubset(df.columns):
+        feature_df = build_user_mod_feature_table(csv_path)
+    else:
+        raise ValueError(
+            "Modification CSV must contain either ['name', 'MolLogP_rdkit'] or ['name', 'aa', 'smiles'] columns."
+        )
+
+    feature_df = feature_df.dropna(subset=["name", "MolLogP_rdkit"])
+    feature_df["name"] = feature_df["name"].astype(str)
+    feature_df["MolLogP_rdkit"] = feature_df["MolLogP_rdkit"].astype(float)
+    feature_df = feature_df.drop_duplicates(subset=["name"], keep="last")
+    return feature_df
+
+
+def _merge_mod_feature_tables(
+    base_df: pd.DataFrame, extra_df: pd.DataFrame
+) -> pd.DataFrame:
+    """Merge the built-in and user-provided modification feature tables."""
+    combined = base_df.copy()
+    combined.update(extra_df)
+
+    new_rows = extra_df.loc[~extra_df.index.isin(combined.index)]
+    if not new_rows.empty:
+        combined = pd.concat([combined, new_rows])
+
+    return combined
+
+
+def _mod_feature_table_to_dict(
+    df: pd.DataFrame,
+) -> Dict[str, Dict[str, Dict[str, float]]]:
+    """Convert a feature table to the nested modification dictionary format."""
     df = df.set_index("name").T
-    # Convert the DataFrame to a dictionary of modifications with their chemical features
     modified = df.to_dict("list")
     dic = {}
     for key, values in modified.items():
         main_key, sub_key = key.split("#")
-        # Create a nested dictionary with the modification name and the amino acid
         dic.setdefault(main_key, {})[sub_key] = dict(zip(df.index, values))
     return dic
 
 
+def mod_chemical_features(
+    user_mods_csv: Optional[str] = None,
+) -> Dict[str, Dict[str, Dict[str, float]]]:
+    """Get modification features, optionally merged with user-provided modifications."""
+    content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
+    base_df = pd.read_csv(io.BytesIO(content))
+    base_df = base_df.loc[:, ["name", "MolLogP_rdkit"]].copy()
+
+    if user_mods_csv:
+        extra_df = _load_mod_feature_table(user_mods_csv)
+        base_df = _merge_mod_feature_tables(
+            base_df.set_index("name"), extra_df.set_index("name")
+        ).reset_index()
+
+    return _mod_feature_table_to_dict(base_df)
+
+
 def peptide_parser(peptide: str) -> Tuple:
     """Parse the peptide sequence and modifications."""
     modifications = []
@@ -302,7 +420,9 @@ def encode_sequence_one_hot(sequence: str) -> np.ndarray:
 
 
 def df_to_matrix(
-    seqs: Union[str, List[str]], df: Optional[pd.DataFrame] = None
+    seqs: Union[str, List[str]],
+    df: Optional[pd.DataFrame] = None,
+    mod_features_csv: Optional[str] = None,
 ) -> (
     tuple[ndarray, list[Any], list[list[str | list[str] | int | str | Exception]]]
     | ndarray
@@ -327,7 +447,7 @@ def df_to_matrix(
     seqs_encoded = []
     tr = []
     errors = []
-    modifications_dict = mod_chemical_features()
+    modifications_dict = mod_chemical_features(user_mods_csv=mod_features_csv)
     aa_to_feature = aa_chemical_feature()
     amino_acids_atoms = aa_atomic_composition_array()
 
diff --git a/pyproject.toml b/pyproject.toml
index 409d94c..87cd0c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,7 @@ version = {attr = "ideeplc.__version__"}
 
 [project.scripts]
 ideeplc = "ideeplc.__main__:main"
+ideeplc-mod-features = "ideeplc.mod_features:main"
 
 [project.urls]
 GitHub = "https://github.com/CompOmics/iDeepLC"
diff --git a/tests/test_mod_features.py b/tests/test_mod_features.py
new file mode 100644
index 0000000..a6f9417
--- /dev/null
+++ b/tests/test_mod_features.py
@@ -0,0 +1,51 @@
+# Python
+import pandas as pd
+
+from ideeplc.utilities import (
+    MEAN_MOLLOGP,
+    STD_MOLLOGP,
+    build_user_mod_feature_table,
+    mod_chemical_features,
+)
+
+
+def test_build_user_mod_feature_table(tmp_path):
+    """Test building standardized modification features from a raw CSV."""
+    input_csv = tmp_path / "user_mods.csv"
+    output_csv = tmp_path / "user_mod_features_standardized.csv"
+
+    pd.DataFrame(
+        {
+            "name": ["CustomMod"],
+            "aa": ["K"],
+            "smiles": ["CCO"],
+        }
+    ).to_csv(input_csv, index=False)
+
+    feature_table = build_user_mod_feature_table(
+        str(input_csv),
+        str(output_csv),
+        compute_mollogp_fn=lambda smiles: 1.0,
+    )
+
+    assert output_csv.exists()
+    assert list(feature_table["name"]) == ["CustomMod#K"]
+    expected = (1.0 - MEAN_MOLLOGP) / STD_MOLLOGP
+    assert feature_table.iloc[0]["MolLogP_rdkit"] == expected
+
+
+def test_mod_chemical_features_merges_user_table(tmp_path):
+    """Test that a user feature table is merged into the built-in dictionary."""
+    user_feature_csv = tmp_path / "custom_features.csv"
+    pd.DataFrame(
+        {
+            "name": ["CustomMod#K"],
+            "MolLogP_rdkit": [1.23],
+        }
+    ).to_csv(user_feature_csv, index=False)
+
+    mod_dict = mod_chemical_features(user_mods_csv=str(user_feature_csv))
+
+    assert "CustomMod" in mod_dict
+    assert "K" in mod_dict["CustomMod"]
+    assert mod_dict["CustomMod"]["K"]["MolLogP_rdkit"] == 1.23