Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ ideeplc --input <path/to/peptide_file.csv> --save --calibrate
ideeplc --input ./data/example_input/Hela_deeprt --save --finetune --calibrate
```

#### Custom modification features
If you have new modification entries with columns `name`, `aa`, and `smiles`, you can generate a standardized feature table and then use it during prediction:

```sh
ideeplc-mod-features --input user_mods.csv --output user_mod_features_standardized.csv
ideeplc --input peptide_file.csv --mod-features user_mod_features_standardized.csv
```

For more detailed CLI usage, you can run:
```sh
ideeplc --help
Expand Down
2 changes: 1 addition & 1 deletion ideeplc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""iDeepLC: A deep Learning-based retention time predictor for unseen modified peptides with a novel encoding system"""
__version__ = "1.4.0"

__version__ = "1.3.2"
9 changes: 9 additions & 0 deletions ideeplc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ def _argument_parser() -> argparse.ArgumentParser:
action="store_true",
help="Flag to enable calibration of the model predictions.",
)
parser.add_argument(
"--mod-features",
type=str,
required=False,
help=(
"Optional CSV file with either raw modification rows (name, aa, smiles) "
"or standardized features (name, MolLogP_rdkit)."
),
)
return parser


Expand Down
22 changes: 18 additions & 4 deletions ideeplc/data_initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,12 @@ def data_initialize(
)

try:
sequences, tr, errors = df_to_matrix(reformed_peptides, df)
sequences, tr, errors = df_to_matrix(
reformed_peptides,
df,
mod_features_csv=kwargs.get("mod_features_csv"),
)

except Exception as e:
LOGGER.error(f"Error converting sequences to matrix format: {e}")
raise
Expand Down Expand Up @@ -128,7 +133,12 @@ def data_initialize_chunked(
)

try:
sequences, tr, errors = df_to_matrix(reformed_peptides, df)
sequences, tr, errors = df_to_matrix(
reformed_peptides,
df,
mod_features_csv=kwargs.get("mod_features_csv"),
)

except Exception as e:
LOGGER.error(
f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}"
Expand All @@ -150,7 +160,9 @@ def data_initialize_chunked(
yield df, prediction_dataset, x_shape


def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000):

def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000, **kwargs):

"""
Get the input shape from the first valid chunk of a CSV file.

Expand All @@ -159,7 +171,9 @@ def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000):
:return: x_shape for model initialization.
"""
for _, dataset_chunk, x_shape in data_initialize_chunked(
csv_path=csv_path, chunk_size=chunk_size

csv_path=csv_path, chunk_size=chunk_size, **kwargs

):
LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}")
return x_shape
Expand Down
15 changes: 13 additions & 2 deletions ideeplc/ideeplc_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ def main(args):

# For model initialization, only inspect the first valid chunk
x_shape = get_input_shape_from_first_chunk(
csv_path=args.input, chunk_size=chunk_size

csv_path=args.input,
chunk_size=chunk_size,
mod_features_csv=getattr(args, "mod_features", None),

)

# Initialize model
Expand All @@ -101,7 +105,12 @@ def main(args):
if args.finetune:
LOGGER.info("Fine-tuning the model")

matrix_input, _ = data_initialize(csv_path=args.input)

matrix_input, _ = data_initialize(
csv_path=args.input,
mod_features_csv=getattr(args, "mod_features", None),
)


fine_tuner = iDeepLCFineTuner(
model=model,
Expand Down Expand Up @@ -129,6 +138,8 @@ def main(args):
save_results=args.save,
batch_size=batch_size,
chunk_size=chunk_size,
mod_features_csv=getattr(args, "mod_features", None),

)
LOGGER.info("Prediction completed.")

Expand Down
46 changes: 46 additions & 0 deletions ideeplc/mod_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Build standardized modification features from a user CSV."""

import argparse
import logging

from ideeplc.utilities import build_user_mod_feature_table


LOGGER = logging.getLogger(__name__)


def _argument_parser() -> argparse.ArgumentParser:
"""Create the argument parser for the feature builder."""
parser = argparse.ArgumentParser(
description=(
"Convert a CSV with columns name, aa, smiles into standardized modification features."
)
)
parser.add_argument(
"-i",
"--input",
type=str,
required=True,
help="Path to the CSV file containing user modifications.",
)
parser.add_argument(
"-o",
"--output",
type=str,
default="user_mod_features_standardized.csv",
help="Output path for the standardized feature table.",
)
return parser


def main(argv=None):
    """Build standardized modification features from a raw user CSV.

    :param argv: Optional argument list (mainly for testing); when ``None``,
        arguments are read from ``sys.argv`` as usual.
    """
    # Configure logging for CLI use: without a handler, the INFO summary below
    # is dropped by logging's handler of last resort (which only emits WARNING+),
    # leaving the user with no feedback on success.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    parser = _argument_parser()
    args = parser.parse_args(argv)

    feature_table = build_user_mod_feature_table(args.input, args.output)
    LOGGER.info(
        "Wrote %d modification feature rows to %s", len(feature_table), args.output
    )


if __name__ == "__main__":
main()
10 changes: 8 additions & 2 deletions ideeplc/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def predict(
batch_size: int = None,
chunk_size: int = 10000,
dataloader_input: DataLoader = None,
mod_features_csv: str = None,

):
"""
Load a trained model and evaluate it on test datasets in chunks.
Expand Down Expand Up @@ -138,8 +140,12 @@ def predict(
output_path.unlink()

for chunk_idx, (df_chunk, dataset_chunk, x_shape) in enumerate(
data_initialize_chunked(csv_path=input_file, chunk_size=chunk_size),
start=1,
data_initialize_chunked(
csv_path=input_file,
chunk_size=chunk_size,
mod_features_csv=mod_features_csv,
),

):
LOGGER.info(
f"Processing chunk {chunk_idx} with {len(dataset_chunk)} entries and shape {x_shape}."
Expand Down
140 changes: 130 additions & 10 deletions ideeplc/utilities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import io
from typing import List, Tuple, Dict, Union, Optional, Any
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -9,6 +11,12 @@
import ideeplc.structure_feature


LOGGER = logging.getLogger(__name__)

MEAN_MOLLOGP = -0.5262240476190472
STD_MOLLOGP = 0.7546071397979358


class Config:
"""
Configuration class for the encoding of peptides.
Expand Down Expand Up @@ -76,22 +84,132 @@ def aa_chemical_feature() -> Dict[str, np.ndarray]:
return features_arrays


def mod_chemical_features() -> Dict[str, Dict[str, Dict[str, float]]]:
"""Get modification features."""
content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
df = pd.read_csv(io.BytesIO(content))
# Convert the dataframe to a dictionary and transpose it
def standardize(value: float, mean: float, std: float) -> float:
    """Return *value* as a z-score: centered on *mean*, scaled by *std*."""
    centered = value - mean
    return centered / std


def compute_mollogp(smiles: str) -> Optional[float]:
    """Compute the RDKit Crippen MolLogP for a SMILES string.

    :param smiles: SMILES representation of the modification.
    :return: MolLogP value, or ``None`` when the SMILES cannot be parsed.
    :raises ImportError: when rdkit is not installed.
    """
    # rdkit is imported lazily so the rest of the package works without it.
    try:
        from rdkit import Chem
        from rdkit.Chem import Crippen
    except ImportError as exc:
        raise ImportError(
            "rdkit is required to build modification features from raw SMILES input."
        ) from exc

    molecule = Chem.MolFromSmiles(smiles)
    return None if molecule is None else Crippen.MolLogP(molecule)


def build_user_mod_feature_table(
    input_csv: str,
    output_csv: Optional[str] = None,
    compute_mollogp_fn: Optional[Callable[[str], Optional[float]]] = None,
) -> pd.DataFrame:
    """Build a standardized modification feature table from a raw user CSV.

    :param input_csv: CSV with columns ``name``, ``aa``, ``smiles``.
    :param output_csv: Optional path; when given, the table is also written there.
    :param compute_mollogp_fn: Optional override for the MolLogP computation
        (mainly for testing); defaults to :func:`compute_mollogp`.
    :return: DataFrame with columns ``name`` (``"<mod>#<aa>"``) and
        ``MolLogP_rdkit`` (standardized).
    :raises ValueError: when required columns are missing or no row is valid.
    """
    raw = pd.read_csv(input_csv)
    required_cols = {"name", "aa", "smiles"}
    if not required_cols.issubset(raw.columns):
        raise ValueError(f"Input file must contain columns: {required_cols}")

    logp_of = compute_mollogp_fn if compute_mollogp_fn is not None else compute_mollogp

    rows = []
    for _, record in raw.iterrows():
        # Downstream code keys features as "<modification>#<amino acid>".
        key = f"{record['name']}#{record['aa']}"
        logp = logp_of(record["smiles"])
        if logp is None:
            LOGGER.warning("Skipping invalid SMILES for %s", key)
            continue
        rows.append(
            {
                "name": key,
                "MolLogP_rdkit": standardize(logp, MEAN_MOLLOGP, STD_MOLLOGP),
            }
        )

    if not rows:
        raise ValueError("No valid modification rows were found in the input file.")

    table = pd.DataFrame(rows)
    if output_csv:
        Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
        table.to_csv(output_csv, index=False)

    return table


def _load_mod_feature_table(csv_path: str) -> pd.DataFrame:
    """Load either a standardized feature table or a raw user-mod table.

    :param csv_path: CSV with either ``name``/``MolLogP_rdkit`` (already
        standardized) or ``name``/``aa``/``smiles`` (raw, built on the fly).
    :return: Cleaned feature table with unique ``name`` keys (last one wins).
    :raises ValueError: when neither column set is present.
    """
    raw = pd.read_csv(csv_path)

    if {"name", "MolLogP_rdkit"}.issubset(raw.columns):
        table = raw[["name", "MolLogP_rdkit"]].copy()
    elif {"name", "aa", "smiles"}.issubset(raw.columns):
        # Raw rows still need MolLogP computation and standardization.
        table = build_user_mod_feature_table(csv_path)
    else:
        raise ValueError(
            "Modification CSV must contain either ['name', 'MolLogP_rdkit'] or ['name', 'aa', 'smiles'] columns."
        )

    table = table.dropna(subset=["name", "MolLogP_rdkit"])
    table["name"] = table["name"].astype(str)
    table["MolLogP_rdkit"] = table["MolLogP_rdkit"].astype(float)
    # keep="last" lets later duplicates override earlier entries.
    return table.drop_duplicates(subset=["name"], keep="last")


def _merge_mod_feature_tables(
base_df: pd.DataFrame, extra_df: pd.DataFrame
) -> pd.DataFrame:
"""Merge the built-in and user-provided modification feature tables."""
combined = base_df.copy()
combined.update(extra_df)

new_rows = extra_df.loc[~extra_df.index.isin(combined.index)]
if not new_rows.empty:
combined = pd.concat([combined, new_rows])

return combined


def _mod_feature_table_to_dict(
df: pd.DataFrame,
) -> Dict[str, Dict[str, Dict[str, float]]]:
"""Convert a feature table to the nested modification dictionary format."""
df = df.set_index("name").T
# Convert the DataFrame to a dictionary of modifications with their chemical features
modified = df.to_dict("list")
dic = {}
for key, values in modified.items():
main_key, sub_key = key.split("#")
# Create a nested dictionary with the modification name and the amino acid
dic.setdefault(main_key, {})[sub_key] = dict(zip(df.index, values))
return dic


def mod_chemical_features(
    user_mods_csv: Optional[str] = None,
) -> Dict[str, Dict[str, Dict[str, float]]]:
    """Get modification features, optionally merged with user-provided modifications.

    :param user_mods_csv: Optional CSV of user modifications; overlapping names
        override the packaged ``ptm_stan.csv`` entries.
    :return: Nested mapping ``{modification: {amino_acid: {feature: value}}}``.
    """
    packaged = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
    table = pd.read_csv(io.BytesIO(packaged))[["name", "MolLogP_rdkit"]].copy()

    if user_mods_csv:
        user_table = _load_mod_feature_table(user_mods_csv)
        table = _merge_mod_feature_tables(
            table.set_index("name"), user_table.set_index("name")
        ).reset_index()

    return _mod_feature_table_to_dict(table)


def peptide_parser(peptide: str) -> Tuple:
"""Parse the peptide sequence and modifications."""
modifications = []
Expand Down Expand Up @@ -302,7 +420,9 @@ def encode_sequence_one_hot(sequence: str) -> np.ndarray:


def df_to_matrix(
seqs: Union[str, List[str]], df: Optional[pd.DataFrame] = None
seqs: Union[str, List[str]],
df: Optional[pd.DataFrame] = None,
mod_features_csv: Optional[str] = None,
) -> (
tuple[ndarray, list[Any], list[list[str | list[str] | int | str | Exception]]]
| ndarray
Expand All @@ -327,7 +447,7 @@ def df_to_matrix(
seqs_encoded = []
tr = []
errors = []
modifications_dict = mod_chemical_features()
modifications_dict = mod_chemical_features(user_mods_csv=mod_features_csv)
aa_to_feature = aa_chemical_feature()
amino_acids_atoms = aa_atomic_composition_array()

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ version = {attr = "ideeplc.__version__"}

[project.scripts]
ideeplc = "ideeplc.__main__:main"
ideeplc-mod-features = "ideeplc.mod_features:main"

[project.urls]
GitHub = "https://github.com/CompOmics/iDeepLC"
Expand Down
Loading
Loading