Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ ideeplc --input <path/to/peptide_file.csv> --save --calibrate
ideeplc --input ./data/example_input/Hela_deeprt --save --finetune --calibrate
```

#### Custom modification features
If you have new modification entries with columns `name`, `aa`, and `smiles`, you can generate a standardized feature table and then use it during prediction:

```sh
ideeplc-mod-features --input user_mods.csv --output user_mod_features_standardized.csv
ideeplc --input peptide_file.csv --mod-features user_mod_features_standardized.csv
```

For more detailed CLI usage, you can run:
```sh
ideeplc --help
Expand Down
2 changes: 1 addition & 1 deletion ideeplc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""iDeepLC: A deep Learning-based retention time predictor for unseen modified peptides with a novel encoding system"""
__version__ = "1.4.0"

__version__ = "1.3.2"
9 changes: 9 additions & 0 deletions ideeplc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ def _argument_parser() -> argparse.ArgumentParser:
action="store_true",
help="Flag to enable calibration of the model predictions.",
)
parser.add_argument(
"--mod-features",
type=str,
required=False,
help=(
"Optional CSV file with either raw modification rows (name, aa, smiles) "
"or standardized features (name, MolLogP_rdkit)."
),
)
return parser


Expand Down
22 changes: 18 additions & 4 deletions ideeplc/data_initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,12 @@ def data_initialize(
)

try:
sequences, tr, errors = df_to_matrix(reformed_peptides, df)
sequences, tr, errors = df_to_matrix(
reformed_peptides,
df,
mod_features_csv=kwargs.get("mod_features_csv"),
)

except Exception as e:
LOGGER.error(f"Error converting sequences to matrix format: {e}")
raise
Expand Down Expand Up @@ -128,7 +133,12 @@ def data_initialize_chunked(
)

try:
sequences, tr, errors = df_to_matrix(reformed_peptides, df)
sequences, tr, errors = df_to_matrix(
reformed_peptides,
df,
mod_features_csv=kwargs.get("mod_features_csv"),
)

except Exception as e:
LOGGER.error(
f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}"
Expand All @@ -150,7 +160,9 @@ def data_initialize_chunked(
yield df, prediction_dataset, x_shape


def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000):

def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000, **kwargs):

"""
Get the input shape from the first valid chunk of a CSV file.

Expand All @@ -159,7 +171,9 @@ def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000):
:return: x_shape for model initialization.
"""
for _, dataset_chunk, x_shape in data_initialize_chunked(
csv_path=csv_path, chunk_size=chunk_size

csv_path=csv_path, chunk_size=chunk_size, **kwargs

):
LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}")
return x_shape
Expand Down
15 changes: 13 additions & 2 deletions ideeplc/ideeplc_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ def main(args):

# For model initialization, only inspect the first valid chunk
x_shape = get_input_shape_from_first_chunk(
csv_path=args.input, chunk_size=chunk_size

csv_path=args.input,
chunk_size=chunk_size,
mod_features_csv=getattr(args, "mod_features", None),

)

# Initialize model
Expand All @@ -101,7 +105,12 @@ def main(args):
if args.finetune:
LOGGER.info("Fine-tuning the model")

matrix_input, _ = data_initialize(csv_path=args.input)

matrix_input, _ = data_initialize(
csv_path=args.input,
mod_features_csv=getattr(args, "mod_features", None),
)


fine_tuner = iDeepLCFineTuner(
model=model,
Expand Down Expand Up @@ -129,6 +138,8 @@ def main(args):
save_results=args.save,
batch_size=batch_size,
chunk_size=chunk_size,
mod_features_csv=getattr(args, "mod_features", None),

)
LOGGER.info("Prediction completed.")

Expand Down
46 changes: 46 additions & 0 deletions ideeplc/mod_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Build standardized modification features from a user CSV."""

import argparse
import logging

from ideeplc.utilities import build_user_mod_feature_table


LOGGER = logging.getLogger(__name__)


def _argument_parser() -> argparse.ArgumentParser:
"""Create the argument parser for the feature builder."""
parser = argparse.ArgumentParser(
description=(
"Convert a CSV with columns name, aa, smiles into standardized modification features."
)
)
parser.add_argument(
"-i",
"--input",
type=str,
required=True,
help="Path to the CSV file containing user modifications.",
)
parser.add_argument(
"-o",
"--output",
type=str,
default="user_mod_features_standardized.csv",
help="Output path for the standardized feature table.",
)
return parser


def main(argv=None):
    """Build standardized modification features from a raw user CSV.

    :param argv: Optional argument list (mainly for testing); when ``None``,
        arguments are read from ``sys.argv`` as usual.
    """
    # Configure logging for CLI use: without a handler, the INFO summary below
    # is dropped by logging's handler of last resort (which only emits WARNING+),
    # leaving the user with no feedback on success.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    parser = _argument_parser()
    args = parser.parse_args(argv)

    feature_table = build_user_mod_feature_table(args.input, args.output)
    LOGGER.info(
        "Wrote %d modification feature rows to %s", len(feature_table), args.output
    )


if __name__ == "__main__":
main()
10 changes: 8 additions & 2 deletions ideeplc/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def predict(
batch_size: int = None,
chunk_size: int = 10000,
dataloader_input: DataLoader = None,
mod_features_csv: str = None,

):
"""
Load a trained model and evaluate it on test datasets in chunks.
Expand Down Expand Up @@ -138,8 +140,12 @@ def predict(
output_path.unlink()

for chunk_idx, (df_chunk, dataset_chunk, x_shape) in enumerate(
data_initialize_chunked(csv_path=input_file, chunk_size=chunk_size),
start=1,
data_initialize_chunked(
csv_path=input_file,
chunk_size=chunk_size,
mod_features_csv=mod_features_csv,
),

):
LOGGER.info(
f"Processing chunk {chunk_idx} with {len(dataset_chunk)} entries and shape {x_shape}."
Expand Down
140 changes: 130 additions & 10 deletions ideeplc/utilities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import io
from typing import List, Tuple, Dict, Union, Optional, Any
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
Expand All @@ -9,6 +11,12 @@
import ideeplc.structure_feature


LOGGER = logging.getLogger(__name__)

MEAN_MOLLOGP = -0.5262240476190472
STD_MOLLOGP = 0.7546071397979358


class Config:
"""
Configuration class for the encoding of peptides.
Expand Down Expand Up @@ -76,22 +84,132 @@ def aa_chemical_feature() -> Dict[str, np.ndarray]:
return features_arrays


def mod_chemical_features() -> Dict[str, Dict[str, Dict[str, float]]]:
"""Get modification features."""
content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
df = pd.read_csv(io.BytesIO(content))
# Convert the dataframe to a dictionary and transpose it
def standardize(value: float, mean: float, std: float) -> float:
    """Return *value* as a z-score: centered on *mean*, scaled by *std*."""
    centered = value - mean
    return centered / std


def compute_mollogp(smiles: str) -> Optional[float]:
    """Compute the RDKit Crippen MolLogP for a SMILES string.

    :param smiles: SMILES representation of the modification.
    :return: MolLogP value, or ``None`` when the SMILES cannot be parsed.
    :raises ImportError: when rdkit is not installed.
    """
    # rdkit is imported lazily so the rest of the package works without it.
    try:
        from rdkit import Chem
        from rdkit.Chem import Crippen
    except ImportError as exc:
        raise ImportError(
            "rdkit is required to build modification features from raw SMILES input."
        ) from exc

    molecule = Chem.MolFromSmiles(smiles)
    return None if molecule is None else Crippen.MolLogP(molecule)


def build_user_mod_feature_table(
    input_csv: str,
    output_csv: Optional[str] = None,
    compute_mollogp_fn: Optional[Callable[[str], Optional[float]]] = None,
) -> pd.DataFrame:
    """Build a standardized modification feature table from a raw user CSV.

    :param input_csv: CSV with columns ``name``, ``aa``, ``smiles``.
    :param output_csv: Optional path; when given, the table is also written there.
    :param compute_mollogp_fn: Optional override for the MolLogP computation
        (mainly for testing); defaults to :func:`compute_mollogp`.
    :return: DataFrame with columns ``name`` (``"<mod>#<aa>"``) and
        ``MolLogP_rdkit`` (standardized).
    :raises ValueError: when required columns are missing or no row is valid.
    """
    raw = pd.read_csv(input_csv)
    required_cols = {"name", "aa", "smiles"}
    if not required_cols.issubset(raw.columns):
        raise ValueError(f"Input file must contain columns: {required_cols}")

    logp_of = compute_mollogp_fn if compute_mollogp_fn is not None else compute_mollogp

    rows = []
    for _, record in raw.iterrows():
        # Downstream code keys features as "<modification>#<amino acid>".
        key = f"{record['name']}#{record['aa']}"
        logp = logp_of(record["smiles"])
        if logp is None:
            LOGGER.warning("Skipping invalid SMILES for %s", key)
            continue
        rows.append(
            {
                "name": key,
                "MolLogP_rdkit": standardize(logp, MEAN_MOLLOGP, STD_MOLLOGP),
            }
        )

    if not rows:
        raise ValueError("No valid modification rows were found in the input file.")

    table = pd.DataFrame(rows)
    if output_csv:
        Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
        table.to_csv(output_csv, index=False)

    return table


def _load_mod_feature_table(csv_path: str) -> pd.DataFrame:
    """Load either a standardized feature table or a raw user-mod table.

    :param csv_path: CSV with either ``name``/``MolLogP_rdkit`` (already
        standardized) or ``name``/``aa``/``smiles`` (raw, built on the fly).
    :return: Cleaned feature table with unique ``name`` keys (last one wins).
    :raises ValueError: when neither column set is present.
    """
    raw = pd.read_csv(csv_path)

    if {"name", "MolLogP_rdkit"}.issubset(raw.columns):
        table = raw[["name", "MolLogP_rdkit"]].copy()
    elif {"name", "aa", "smiles"}.issubset(raw.columns):
        # Raw rows still need MolLogP computation and standardization.
        table = build_user_mod_feature_table(csv_path)
    else:
        raise ValueError(
            "Modification CSV must contain either ['name', 'MolLogP_rdkit'] or ['name', 'aa', 'smiles'] columns."
        )

    table = table.dropna(subset=["name", "MolLogP_rdkit"])
    table["name"] = table["name"].astype(str)
    table["MolLogP_rdkit"] = table["MolLogP_rdkit"].astype(float)
    # keep="last" lets later duplicates override earlier entries.
    return table.drop_duplicates(subset=["name"], keep="last")


def _merge_mod_feature_tables(
base_df: pd.DataFrame, extra_df: pd.DataFrame
) -> pd.DataFrame:
"""Merge the built-in and user-provided modification feature tables."""
combined = base_df.copy()
combined.update(extra_df)

new_rows = extra_df.loc[~extra_df.index.isin(combined.index)]
if not new_rows.empty:
combined = pd.concat([combined, new_rows])

return combined


def _mod_feature_table_to_dict(
df: pd.DataFrame,
) -> Dict[str, Dict[str, Dict[str, float]]]:
"""Convert a feature table to the nested modification dictionary format."""
df = df.set_index("name").T
# Convert the DataFrame to a dictionary of modifications with their chemical features
modified = df.to_dict("list")
dic = {}
for key, values in modified.items():
main_key, sub_key = key.split("#")
# Create a nested dictionary with the modification name and the amino acid
dic.setdefault(main_key, {})[sub_key] = dict(zip(df.index, values))
return dic


def mod_chemical_features(
    user_mods_csv: Optional[str] = None,
) -> Dict[str, Dict[str, Dict[str, float]]]:
    """Get modification features, optionally merged with user-provided modifications.

    :param user_mods_csv: Optional CSV of user modifications; overlapping names
        override the packaged ``ptm_stan.csv`` entries.
    :return: Nested mapping ``{modification: {amino_acid: {feature: value}}}``.
    """
    packaged = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
    table = pd.read_csv(io.BytesIO(packaged))[["name", "MolLogP_rdkit"]].copy()

    if user_mods_csv:
        user_table = _load_mod_feature_table(user_mods_csv)
        table = _merge_mod_feature_tables(
            table.set_index("name"), user_table.set_index("name")
        ).reset_index()

    return _mod_feature_table_to_dict(table)


def peptide_parser(peptide: str) -> Tuple:
"""Parse the peptide sequence and modifications."""
modifications = []
Expand Down Expand Up @@ -302,7 +420,9 @@ def encode_sequence_one_hot(sequence: str) -> np.ndarray:


def df_to_matrix(
seqs: Union[str, List[str]], df: Optional[pd.DataFrame] = None
seqs: Union[str, List[str]],
df: Optional[pd.DataFrame] = None,
mod_features_csv: Optional[str] = None,
) -> (
tuple[ndarray, list[Any], list[list[str | list[str] | int | str | Exception]]]
| ndarray
Expand All @@ -327,7 +447,7 @@ def df_to_matrix(
seqs_encoded = []
tr = []
errors = []
modifications_dict = mod_chemical_features()
modifications_dict = mod_chemical_features(user_mods_csv=mod_features_csv)
aa_to_feature = aa_chemical_feature()
amino_acids_atoms = aa_atomic_composition_array()

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ version = {attr = "ideeplc.__version__"}

[project.scripts]
ideeplc = "ideeplc.__main__:main"
ideeplc-mod-features = "ideeplc.mod_features:main"

[project.urls]
GitHub = "https://github.com/CompOmics/iDeepLC"
Expand Down
Loading
Loading