From 026e190f5593dd4f882c5d518add7e3b9e1b4862 Mon Sep 17 00:00:00 2001
From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com>
Date: Fri, 3 Apr 2026 13:59:05 +0200
Subject: [PATCH 1/4] Bump version to 1.4.0 and add --mod-features CLI option

---
 ideeplc/__init__.py | 2 +-
 ideeplc/__main__.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/ideeplc/__init__.py b/ideeplc/__init__.py
index ae57bfe..9f2f44f 100644
--- a/ideeplc/__init__.py
+++ b/ideeplc/__init__.py
@@ -1,3 +1,3 @@
 """iDeepLC: A deep Learning-based retention time predictor for unseen modified peptides with a novel encoding system"""
 
-__version__ = "1.3.1"
+__version__ = "1.4.0"
diff --git a/ideeplc/__main__.py b/ideeplc/__main__.py
index ca8db40..f9edb87 100644
--- a/ideeplc/__main__.py
+++ b/ideeplc/__main__.py
@@ -82,6 +82,15 @@ def _argument_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Flag to enable calibration of the model predictions.",
     )
+    parser.add_argument(
+        "--mod-features",
+        type=str,
+        required=False,
+        help=(
+            "Optional CSV file with either raw modification rows (name, aa, smiles) "
+            "or standardized features (name, MolLogP_rdkit)."
+        ),
+    )
     return parser
 
 

From a76f0705141a568a558eef2c630f3d723cafe1cb Mon Sep 17 00:00:00 2001
From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com>
Date: Fri, 3 Apr 2026 13:59:42 +0200
Subject: [PATCH 2/4] Create mod_features.py

Build standardized modifications
---
 ideeplc/mod_features.py | 46 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 ideeplc/mod_features.py

diff --git a/ideeplc/mod_features.py b/ideeplc/mod_features.py
new file mode 100644
index 0000000..d700b61
--- /dev/null
+++ b/ideeplc/mod_features.py
@@ -0,0 +1,46 @@
+"""Build standardized modification features from a user CSV."""
+
+import argparse
+import logging
+
+from ideeplc.utilities import build_user_mod_feature_table
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _argument_parser() -> argparse.ArgumentParser:
+    """Build the CLI parser: a required -i/--input CSV path and an optional -o/--output path."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Convert a CSV with columns name, aa, smiles into standardized modification features."
+        )
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        type=str,
+        required=True,
+        help="Path to the CSV file containing user modifications.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default="user_mod_features_standardized.csv",
+        help="Output path for the standardized feature table.",
+    )
+    return parser
+
+
+def main(argv=None):
+    """Parse CLI arguments (``argv``, or the process arguments when None), build the feature table and write it to ``--output``."""
+    parser = _argument_parser()
+    args = parser.parse_args(argv)
+    logging.basicConfig(level=logging.INFO)  # without a configured handler the INFO summary below is silently dropped
+    feature_table = build_user_mod_feature_table(args.input, args.output)
+    LOGGER.info("Wrote %d modification feature rows to %s", len(feature_table), args.output)
+
+
+if __name__ == "__main__":
+    main()

From 977bcf8f4b21125ad5877aed53698e2b4ff4bf04 Mon Sep 17 00:00:00 2001
From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com>
Date: Fri, 3 Apr 2026 13:59:54 +0200
Subject: [PATCH 3/4] Add tests, README section and CLI entry point for mod features

---
 README.md                  |  8 ++++++
 pyproject.toml             |  1 +
 tests/test_mod_features.py | 51 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)
 create mode 100644 tests/test_mod_features.py

diff --git a/README.md b/README.md
index 57de61d..9f1944a 100644
--- a/README.md
+++ b/README.md
@@ -66,6 +66,14 @@ ideeplc --input <path/to/peptide_file.csv> --save --calibrate
 ideeplc --input ./data/example_input/Hela_deeprt --save --finetune --calibrate
 ```
 
+#### Custom modification features
+If you have a CSV file of new modifications with the columns `name`, `aa`, and `smiles`, you can generate a standardized feature table with `ideeplc-mod-features` and then pass it to `ideeplc` via `--mod-features`:
+
+```sh
+ideeplc-mod-features --input user_mods.csv --output user_mod_features_standardized.csv
+ideeplc --input peptide_file.csv --mod-features user_mod_features_standardized.csv
+```
+
 For more detailed CLI usage, you can run:
 ```sh
 ideeplc --help
diff --git a/pyproject.toml b/pyproject.toml
index 409d94c..87cd0c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,7 @@ version = {attr = "ideeplc.__version__"}
 
 [project.scripts]
 ideeplc = "ideeplc.__main__:main"
+ideeplc-mod-features = "ideeplc.mod_features:main"
 
 [project.urls]
 GitHub = "https://github.com/CompOmics/iDeepLC"
diff --git a/tests/test_mod_features.py b/tests/test_mod_features.py
new file mode 100644
index 0000000..a6f9417
--- /dev/null
+++ b/tests/test_mod_features.py
@@ -0,0 +1,51 @@
+# Python
+import pandas as pd
+
+from ideeplc.utilities import (
+    MEAN_MOLLOGP,
+    STD_MOLLOGP,
+    build_user_mod_feature_table,
+    mod_chemical_features,
+)
+
+
+def test_build_user_mod_feature_table(tmp_path):
+    """A raw (name, aa, smiles) row becomes a '<name>#<aa>' entry with standardized MolLogP, and the output CSV is written."""
+    input_csv = tmp_path / "user_mods.csv"
+    output_csv = tmp_path / "user_mod_features_standardized.csv"
+
+    pd.DataFrame(
+        {
+            "name": ["CustomMod"],
+            "aa": ["K"],
+            "smiles": ["CCO"],
+        }
+    ).to_csv(input_csv, index=False)
+
+    feature_table = build_user_mod_feature_table(
+        str(input_csv),
+        str(output_csv),
+        compute_mollogp_fn=lambda smiles: 1.0,  # stub so the test does not need rdkit
+    )
+
+    assert output_csv.exists()
+    assert list(feature_table["name"]) == ["CustomMod#K"]
+    expected = (1.0 - MEAN_MOLLOGP) / STD_MOLLOGP
+    assert feature_table.iloc[0]["MolLogP_rdkit"] == expected
+
+
+def test_mod_chemical_features_merges_user_table(tmp_path):
+    """A standardized user table passed via ``user_mods_csv`` shows up in the nested modification dictionary."""
+    user_feature_csv = tmp_path / "custom_features.csv"
+    pd.DataFrame(
+        {
+            "name": ["CustomMod#K"],
+            "MolLogP_rdkit": [1.23],
+        }
+    ).to_csv(user_feature_csv, index=False)
+
+    mod_dict = mod_chemical_features(user_mods_csv=str(user_feature_csv))
+
+    assert "CustomMod" in mod_dict
+    assert "K" in mod_dict["CustomMod"]
+    assert mod_dict["CustomMod"]["K"]["MolLogP_rdkit"] == 1.23

From 4b153822d27db489acad54ccd03f1a9f8b3a00bc Mon Sep 17 00:00:00 2001
From: Alireza Nameni <64960208+Alirezak2n@users.noreply.github.com>
Date: Fri, 3 Apr 2026 14:00:19 +0200
Subject: [PATCH 4/4] Add user-provided modification features and chunked
 prediction

---
 ideeplc/data_initialize.py |  96 ++++++++++++++++++++++++-
 ideeplc/ideeplc_core.py    |  18 ++++-
 ideeplc/predict.py         | 121 ++++++++++++++++++++++++++++++--
 ideeplc/utilities.py       | 140 ++++++++++++++++++++++++++++++++++---
 4 files changed, 357 insertions(+), 18 deletions(-)

diff --git a/ideeplc/data_initialize.py b/ideeplc/data_initialize.py
index 02d578a..11cbb9e 100644
--- a/ideeplc/data_initialize.py
+++ b/ideeplc/data_initialize.py
@@ -64,8 +64,11 @@ def data_initialize(
         f"Loaded and reformed {len(reformed_peptides)} peptides sequences from the file."
     )
     try:
-        # Convert sequences to matrix format
-        sequences, tr, errors = df_to_matrix(reformed_peptides, df)
+        sequences, tr, errors = df_to_matrix(
+            reformed_peptides,
+            df,
+            mod_features_csv=kwargs.get("mod_features_csv"),
+        )
     except Exception as e:
         LOGGER.error(f"Error converting sequences to matrix format: {e}")
         raise
@@ -82,3 +85,92 @@ def data_initialize(
         break
     LOGGER.info(f"Dataset initialized with data shape {x_shape}.")
     return prediction_dataset, x_shape
+
+
+def data_initialize_chunked(
+    csv_path: str, chunk_size: int = 10000, **kwargs
+) -> Iterator[Tuple[pd.DataFrame, MyDataset, Tuple[int, ...]]]:
+    """
+    Initialize peptide matrices from a CSV file in chunks.
+
+    :param csv_path: Path to the CSV file of raw peptides; each chunk must have 'seq', 'modifications' and 'tr' columns.
+    :param chunk_size: Number of rows to load per chunk.
+    :return: Iterator yielding dataframe chunk, dataset chunk, and x_shape; chunks with no valid entries are skipped.
+    """
+    LOGGER.info(f"Loading peptides from {csv_path} in chunks of {chunk_size}")
+
+    try:
+        chunk_iter = pd.read_csv(csv_path, chunksize=chunk_size)
+    except FileNotFoundError:
+        LOGGER.error(f"File {csv_path} not found.")
+        raise
+    except pd.errors.EmptyDataError:
+        LOGGER.error(f"File {csv_path} is empty.")
+        raise
+    except Exception as e:
+        LOGGER.error(f"Error reading {csv_path}: {e}")
+        raise
+
+    for chunk_idx, df in enumerate(chunk_iter, start=1):
+        if "seq" not in df.columns:
+            LOGGER.error("CSV file must contain a 'seq' column with peptide sequences.")
+            raise ValueError("Missing 'seq' column in the CSV file.")
+        if "modifications" not in df.columns:
+            LOGGER.error(
+                "CSV file must contain a 'modifications' column with peptide modifications."
+            )
+            raise ValueError("Missing 'modifications' column in the CSV file.")
+        if "tr" not in df.columns:
+            LOGGER.error("CSV file must contain a 'tr' column with retention times.")
+            raise ValueError("Missing 'tr' column in the CSV file.")
+
+        reformed_peptides = [
+            reform_seq(seq, mod) for seq, mod in zip(df["seq"], df["modifications"])
+        ]
+        LOGGER.info(
+            f"Chunk {chunk_idx}: loaded and reformed {len(reformed_peptides)} peptides sequences."
+        )
+
+        try:
+            sequences, tr, errors = df_to_matrix(
+                reformed_peptides,
+                df,
+                mod_features_csv=kwargs.get("mod_features_csv"),  # optional user modification table; None when not supplied
+            )
+        except Exception as e:
+            LOGGER.error(
+                f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}"
+            )
+            raise
+
+        if errors:
+            LOGGER.warning(f"Errors encountered during conversion in chunk {chunk_idx}: {errors}")
+
+        prediction_dataset = MyDataset(sequences, tr)
+
+        if len(prediction_dataset) == 0:
+            LOGGER.warning(f"Chunk {chunk_idx} contains no valid peptide entries.")
+            continue
+
+        # Keep historical x_shape contract expected by model/tests: (batch, channels, length)
+        x_shape = (1,) + prediction_dataset[0][0].shape
+        LOGGER.info(f"Chunk {chunk_idx} initialized with data shape {x_shape}.")
+        yield df, prediction_dataset, x_shape
+
+
+def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000, **kwargs):
+    """
+    Get the input shape from the first valid chunk of a CSV file.
+
+    :param csv_path: Path to the CSV file containing raw peptide sequences.
+    :param chunk_size: Number of rows to load per chunk.
+    :return: x_shape for model initialization.
+    :raises ValueError: If the file yields no valid chunk.
+    """
+    for _, _, x_shape in data_initialize_chunked(
+        csv_path=csv_path, chunk_size=chunk_size, **kwargs
+    ):
+        LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}")
+        return x_shape  # stop after the first valid chunk; later chunks are not read here
+    LOGGER.error("No valid chunks found in the input file.")
+    raise ValueError("No valid chunks found in the input file.")
diff --git a/ideeplc/ideeplc_core.py b/ideeplc/ideeplc_core.py
index 958f2b2..62fd45d 100644
--- a/ideeplc/ideeplc_core.py
+++ b/ideeplc/ideeplc_core.py
@@ -74,7 +74,14 @@ def main(args):
 
         # Initialize data
         LOGGER.info(f"Loading data from {args.input}")
-        matrix_input, x_shape = data_initialize(csv_path=args.input)
+
+        # For model initialization, only inspect the first valid chunk
+        x_shape = get_input_shape_from_first_chunk(
+            csv_path=args.input,
+            chunk_size=chunk_size,
+            mod_features_csv=getattr(args, "mod_features", None),
+        )
+
         # Initialize model
         LOGGER.info("Initializing model")
         model = MyNet(x_shape=x_shape, config=config).to(device)
@@ -96,6 +103,12 @@ def main(args):
 
         if args.finetune:
             LOGGER.info("Fine-tuning the model")
+
+            matrix_input, _ = data_initialize(
+                csv_path=args.input,
+                mod_features_csv=getattr(args, "mod_features", None),
+            )
+
             fine_tuner = iDeepLCFineTuner(
                 model=model,
                 train_data=matrix_input,
@@ -123,6 +136,9 @@ def main(args):
             calibrate=args.calibrate,
             input_file=args.input,
             save_results=args.save,
+            batch_size=batch_size,
+            chunk_size=chunk_size,
+            mod_features_csv=getattr(args, "mod_features", None),
         )
         LOGGER.info("Prediction completed.")
         # Generate Figures
diff --git a/ideeplc/predict.py b/ideeplc/predict.py
index bc90455..b5bc788 100644
--- a/ideeplc/predict.py
+++ b/ideeplc/predict.py
@@ -58,6 +58,10 @@ def predict(
     input_file: str,
     calibrate: bool,
     save_results: bool,
+    batch_size: int = None,
+    chunk_size: int = 10000,
+    dataloader_input: DataLoader = None,
+    mod_features_csv: str = None,
 ):
     """
     Load a trained model and evaluate it on test datasets.
@@ -71,13 +75,120 @@ def predict(
     :param save_results: If True, saves the evaluation results.
     :return: Loss, correlation, predictions, and ground truth values.
     """
-    LOGGER.info("Starting prediction process.")
+    LOGGER.info(
+        f"Starting prediction process with batch size {batch_size} and chunk size {chunk_size}."
+    )
+
+    all_predictions = []
+    all_ground_truth = []
+    total_loss = 0.0
+    total_samples = 0
+
+    calibrated_preds = None
+
+    timestamp = datetime.datetime.now().strftime("%Y%m%d")
+    input_file_name = os.path.splitext(os.path.basename(input_file))[0]
+    output_path = (
+        Path("ideeplc_output") / f"{input_file_name}_predictions_{timestamp}.csv"
+    )
 
     try:
-        # Validate on the primary test set
-        loss, correlation, predictions, ground_truth = validate(
-            model, dataloader_input, loss_fn, device
-        )
+        if dataloader_input is not None:
+            LOGGER.info("Using provided dataloader_input for prediction.")
+            loss, correlation, all_predictions, all_ground_truth = validate(
+                model=model,
+                dataloader=dataloader_input,
+                loss_fn=loss_fn,
+                device=device,
+            )
+
+            if calibrate:
+                LOGGER.info("Fitting calibration model.")
+                calibration_model = SplineTransformerCalibration()
+                calibration_model.fit(all_ground_truth, all_predictions)
+                calibrated_preds = calibration_model.transform(all_predictions)
+
+                if len(calibrated_preds) > 1 and len(all_ground_truth) > 1:
+                    correlation = np.corrcoef(calibrated_preds, all_ground_truth)[0, 1]
+                else:
+                    correlation = np.nan
+
+                loss_calibrated = loss_fn(
+                    torch.tensor(calibrated_preds).float().view(-1, 1),
+                    torch.tensor(all_ground_truth).float().view(-1, 1),
+                )
+                loss = loss_calibrated.item()
+                return loss, correlation, calibrated_preds, all_ground_truth
+
+            return loss, correlation, all_predictions, all_ground_truth
+
+        if batch_size is None:
+            raise ValueError("batch_size must be provided when dataloader_input is not used.")
+
+        if save_results:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            if output_path.exists():
+                output_path.unlink()
+
+        for chunk_idx, (df_chunk, dataset_chunk, x_shape) in enumerate(
+            data_initialize_chunked(
+                csv_path=input_file,
+                chunk_size=chunk_size,
+                mod_features_csv=mod_features_csv,
+            ),
+            start=1,
+        ):
+            LOGGER.info(
+                f"Processing chunk {chunk_idx} with {len(dataset_chunk)} entries and shape {x_shape}."
+            )
+
+            dataloader_input = DataLoader(
+                dataset_chunk,
+                batch_size=batch_size,
+                shuffle=False,
+            )
+
+            chunk_loss, _, chunk_predictions, chunk_ground_truth = validate(
+                model=model,
+                dataloader=dataloader_input,
+                loss_fn=loss_fn,
+                device=device,
+            )
+
+            n_chunk = len(dataset_chunk)
+            total_loss += chunk_loss * n_chunk
+            total_samples += n_chunk
+
+            all_predictions.extend(chunk_predictions)
+            all_ground_truth.extend(chunk_ground_truth)
+
+            if save_results:
+                result_data = {
+                    "sequences": df_chunk.get("seq", None),
+                    "modifications": df_chunk.get("modifications", None),
+                    "ground_truth": chunk_ground_truth,
+                    "predictions": chunk_predictions,
+                }
+
+                result_df = pd.DataFrame(result_data)
+                result_df.to_csv(
+                    output_path,
+                    mode="a",
+                    index=False,
+                    header=not output_path.exists(),
+                )
+                LOGGER.info(f"Chunk {chunk_idx} results appended to {output_path}")
+
+        if total_samples == 0:
+            LOGGER.error("No valid samples were processed during prediction.")
+            raise ValueError("No valid samples were processed during prediction.")
+
+        loss = total_loss / total_samples
+
+        if len(all_predictions) > 1 and len(all_ground_truth) > 1:
+            correlation = np.corrcoef(all_predictions, all_ground_truth)[0, 1]
+        else:
+            correlation = np.nan
 
         if calibrate:
             LOGGER.info("Fitting calibration model.")
diff --git a/ideeplc/utilities.py b/ideeplc/utilities.py
index aecc7ea..6c9307a 100644
--- a/ideeplc/utilities.py
+++ b/ideeplc/utilities.py
@@ -1,5 +1,7 @@
 import io
-from typing import List, Tuple, Dict, Union, Optional, Any
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -9,6 +11,12 @@
 import ideeplc.structure_feature
 
 
+LOGGER = logging.getLogger(__name__)
+
+MEAN_MOLLOGP = -0.5262240476190472
+STD_MOLLOGP = 0.7546071397979358
+
+
 class Config:
     """
     Configuration class for the encoding of peptides.
@@ -76,22 +84,132 @@ def aa_chemical_feature() -> Dict[str, np.ndarray]:
     return features_arrays
 
 
-def mod_chemical_features() -> Dict[str, Dict[str, Dict[str, float]]]:
-    """Get modification features."""
-    content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
-    df = pd.read_csv(io.BytesIO(content))
-    # Convert the dataframe to a dictionary and transpose it
+def standardize(value: float, mean: float, std: float) -> float:
+    """Return the z-score of ``value``: ``(value - mean) / std``."""
+    return (value - mean) / std
+
+
+def compute_mollogp(smiles: str) -> Optional[float]:
+    """Return RDKit's Crippen MolLogP for a SMILES string, or None if RDKit cannot parse it; raises ImportError without rdkit."""
+    try:
+        from rdkit import Chem
+        from rdkit.Chem import Crippen
+    except ImportError as exc:
+        raise ImportError(
+            "rdkit is required to build modification features from raw SMILES input."
+        ) from exc
+
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+    return Crippen.MolLogP(mol)
+
+
+def build_user_mod_feature_table(
+    input_csv: str,
+    output_csv: Optional[str] = None,
+    compute_mollogp_fn: Optional[Callable[[str], Optional[float]]] = None,
+) -> pd.DataFrame:
+    """Build a standardized modification feature table from a raw user CSV.
+
+    Each row becomes a ``<name>#<aa>`` entry with standardized MolLogP; rows without a usable SMILES are skipped.
+    Writes ``output_csv`` when given. Raises ValueError on missing columns or when no row is usable.
+    """
+    df = pd.read_csv(input_csv)
+    required_cols = {"name", "aa", "smiles"}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Input file must contain columns: {required_cols}")
+    compute_fn = compute_mollogp_fn or compute_mollogp
+    results = []
+    for _, row in df.iterrows():
+        name = f"{row['name']}#{row['aa']}"
+        smiles = row["smiles"]
+        # Empty CSV cells are read as float NaN; treat any non-string as invalid instead of passing it to the SMILES parser.
+        mollogp = compute_fn(smiles) if isinstance(smiles, str) else None
+        if mollogp is None:
+            LOGGER.warning("Skipping invalid SMILES for %s", name)
+            continue
+        results.append(
+            {
+                "name": name,
+                "MolLogP_rdkit": standardize(mollogp, MEAN_MOLLOGP, STD_MOLLOGP),
+            }
+        )
+    if not results:
+        raise ValueError("No valid modification rows were found in the input file.")
+
+    df_out = pd.DataFrame(results)
+    if output_csv:
+        Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
+        df_out.to_csv(output_csv, index=False)
+
+    return df_out
+
+
+def _load_mod_feature_table(csv_path: str) -> pd.DataFrame:
+    """Load a user CSV as a de-duplicated (name, MolLogP_rdkit) table; raw (name, aa, smiles) input is standardized first."""
+    df = pd.read_csv(csv_path)
+
+    if {"name", "MolLogP_rdkit"}.issubset(df.columns):
+        feature_df = df.loc[:, ["name", "MolLogP_rdkit"]].copy()
+    elif {"name", "aa", "smiles"}.issubset(df.columns):
+        feature_df = build_user_mod_feature_table(csv_path)
+    else:
+        raise ValueError(
+            "Modification CSV must contain either ['name', 'MolLogP_rdkit'] or ['name', 'aa', 'smiles'] columns."
+        )
+
+    feature_df = feature_df.dropna(subset=["name", "MolLogP_rdkit"])
+    feature_df["name"] = feature_df["name"].astype(str)
+    feature_df["MolLogP_rdkit"] = feature_df["MolLogP_rdkit"].astype(float)
+    feature_df = feature_df.drop_duplicates(subset=["name"], keep="last")  # a later row for the same name wins
+    return feature_df
+
+
+def _merge_mod_feature_tables(
+    base_df: pd.DataFrame, extra_df: pd.DataFrame
+) -> pd.DataFrame:
+    """Overlay ``extra_df`` on a copy of ``base_df`` by index label: shared labels are updated, new labels are appended."""
+    combined = base_df.copy()
+    combined.update(extra_df)
+
+    new_rows = extra_df.loc[~extra_df.index.isin(combined.index)]
+    if not new_rows.empty:
+        combined = pd.concat([combined, new_rows])
+
+    return combined
+
+
+def _mod_feature_table_to_dict(
+    df: pd.DataFrame,
+) -> Dict[str, Dict[str, Dict[str, float]]]:
+    """Convert a feature table to the nested modification dictionary format."""
     df = df.set_index("name").T
-    # Convert the DataFrame to a dictionary of modifications with their chemical features
     modified = df.to_dict("list")
     dic = {}
     for key, values in modified.items():
         main_key, sub_key = key.split("#")
-        # Create a nested dictionary with the modification name and the amino acid
         dic.setdefault(main_key, {})[sub_key] = dict(zip(df.index, values))
     return dic
 
 
+def mod_chemical_features(
+    user_mods_csv: Optional[str] = None,
+) -> Dict[str, Dict[str, Dict[str, float]]]:
+    """Return the nested modification feature dict from the bundled ptm_stan.csv (name and MolLogP_rdkit columns only), optionally merged with a user CSV."""
+    content = files(ideeplc.structure_feature).joinpath("ptm_stan.csv").read_bytes()
+    base_df = pd.read_csv(io.BytesIO(content))
+    base_df = base_df.loc[:, ["name", "MolLogP_rdkit"]].copy()
+
+    if user_mods_csv:
+        extra_df = _load_mod_feature_table(user_mods_csv)
+        base_df = _merge_mod_feature_tables(
+            base_df.set_index("name"), extra_df.set_index("name")
+        ).reset_index()
+
+    return _mod_feature_table_to_dict(base_df)
+
+
 def peptide_parser(peptide: str) -> Tuple:
     """Parse the peptide sequence and modifications."""
     modifications = []
@@ -270,7 +388,9 @@ def encode_sequence_one_hot(sequence: str) -> np.ndarray:
 
 
 def df_to_matrix(
-    seqs: Union[str, List[str]], df: Optional[pd.DataFrame] = None
+    seqs: Union[str, List[str]],
+    df: Optional[pd.DataFrame] = None,
+    mod_features_csv: Optional[str] = None,
 ) -> (
     tuple[ndarray, list[Any], list[list[str | list[str] | int | Exception]]]
     | ndarray
@@ -311,7 +431,7 @@ def df_to_matrix(
     seqs_encoded = []
     tr = []
     errors = []
-    modifications_dict = mod_chemical_features()
+    modifications_dict = mod_chemical_features(user_mods_csv=mod_features_csv)
     aa_to_feature = aa_chemical_feature()
     amino_acids_atoms = aa_atomic_composition_array()