diff --git a/ideeplc/__init__.py b/ideeplc/__init__.py index ae57bfe..7f86cee 100644 --- a/ideeplc/__init__.py +++ b/ideeplc/__init__.py @@ -1,3 +1,3 @@ """iDeepLC: A deep Learning-based retention time predictor for unseen modified peptides with a novel encoding system""" -__version__ = "1.3.1" +__version__ = "1.3.2" diff --git a/ideeplc/data_initialize.py b/ideeplc/data_initialize.py index 02d578a..80b7713 100644 --- a/ideeplc/data_initialize.py +++ b/ideeplc/data_initialize.py @@ -1,14 +1,13 @@ import logging -from typing import Tuple, Union +from typing import Tuple, Union, Iterator import pandas as pd import numpy as np -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import Dataset from ideeplc.utilities import df_to_matrix, reform_seq LOGGER = logging.getLogger(__name__) -# Making the pytorch dataset class MyDataset(Dataset): def __init__(self, sequences: np.ndarray, retention: np.ndarray) -> None: self.sequences = sequences @@ -25,15 +24,14 @@ def data_initialize( csv_path: str, **kwargs ) -> Union[Tuple[MyDataset, np.ndarray], Tuple[MyDataset, np.ndarray]]: """ - Initialize peptides matrices based on a CSV file containing raw peptide sequences. + Initialize peptide matrices based on a CSV file containing raw peptide sequences. :param csv_path: Path to the CSV file containing raw peptide sequences. - :return: DataLoader for prediction. + :return: Dataset for prediction or fine-tuning and x_shape. """ - LOGGER.info(f"Loading peptides from {csv_path}") + try: - # Load peptides from CSV file df = pd.read_csv(csv_path) except FileNotFoundError: LOGGER.error(f"File {csv_path} not found.") @@ -63,22 +61,108 @@ def data_initialize( LOGGER.info( f"Loaded and reformed {len(reformed_peptides)} peptides sequences from the file." 
) + try: - # Convert sequences to matrix format sequences, tr, errors = df_to_matrix(reformed_peptides, df) except Exception as e: LOGGER.error(f"Error converting sequences to matrix format: {e}") raise + if errors: LOGGER.warning(f"Errors encountered during conversion: {errors}") prediction_dataset = MyDataset(sequences, tr) - # Create DataLoader objects - dataloader_pred = DataLoader(prediction_dataset) - # passing the training X shape - for batch in dataloader_pred: - x_shape = batch[0].shape - break + if len(prediction_dataset) == 0: + LOGGER.error("No valid peptide entries were found in the input file.") + raise ValueError("No valid peptide entries were found in the input file.") + + # Keep historical x_shape contract expected by model/tests: (batch, channels, length) + x_shape = (1,) + prediction_dataset[0][0].shape LOGGER.info(f"Dataset initialized with data shape {x_shape}.") return prediction_dataset, x_shape + + +def data_initialize_chunked( + csv_path: str, chunk_size: int = 10000, **kwargs +) -> Iterator[Tuple[pd.DataFrame, MyDataset, np.ndarray]]: + """ + Initialize peptide matrices from a CSV file in chunks. + + :param csv_path: Path to the CSV file containing raw peptide sequences. + :param chunk_size: Number of rows to load per chunk. + :return: Iterator yielding dataframe chunk, dataset chunk, and x_shape. 
+ """ + LOGGER.info(f"Loading peptides from {csv_path} in chunks of {chunk_size}") + + try: + chunk_iter = pd.read_csv(csv_path, chunksize=chunk_size) + except FileNotFoundError: + LOGGER.error(f"File {csv_path} not found.") + raise + except pd.errors.EmptyDataError: + LOGGER.error(f"File {csv_path} is empty.") + raise + except Exception as e: + LOGGER.error(f"Error reading {csv_path}: {e}") + raise + + for chunk_idx, df in enumerate(chunk_iter, start=1): + if "seq" not in df.columns: + LOGGER.error("CSV file must contain a 'seq' column with peptide sequences.") + raise ValueError("Missing 'seq' column in the CSV file.") + if "modifications" not in df.columns: + LOGGER.error( + "CSV file must contain a 'modifications' column with peptide modifications." + ) + raise ValueError("Missing 'modifications' column in the CSV file.") + if "tr" not in df.columns: + LOGGER.error("CSV file must contain a 'tr' column with retention times.") + raise ValueError("Missing 'tr' column in the CSV file.") + + reformed_peptides = [ + reform_seq(seq, mod) for seq, mod in zip(df["seq"], df["modifications"]) + ] + LOGGER.info( + f"Chunk {chunk_idx}: loaded and reformed {len(reformed_peptides)} peptides sequences." 
+ ) + + try: + sequences, tr, errors = df_to_matrix(reformed_peptides, df) + except Exception as e: + LOGGER.error( + f"Error converting sequences to matrix format in chunk {chunk_idx}: {e}" + ) + raise + + if errors: + LOGGER.warning(f"Errors encountered during conversion in chunk {chunk_idx}: {errors}") + + prediction_dataset = MyDataset(sequences, tr) + + if len(prediction_dataset) == 0: + LOGGER.warning(f"Chunk {chunk_idx} contains no valid peptide entries.") + continue + + # Keep historical x_shape contract expected by model/tests: (batch, channels, length) + x_shape = (1,) + prediction_dataset[0][0].shape + LOGGER.info(f"Chunk {chunk_idx} initialized with data shape {x_shape}.") + yield df, prediction_dataset, x_shape + + +def get_input_shape_from_first_chunk(csv_path: str, chunk_size: int = 10000): + """ + Get the input shape from the first valid chunk of a CSV file. + + :param csv_path: Path to the CSV file containing raw peptide sequences. + :param chunk_size: Number of rows to load per chunk. + :return: x_shape for model initialization. + """ + for _, dataset_chunk, x_shape in data_initialize_chunked( + csv_path=csv_path, chunk_size=chunk_size + ): + LOGGER.info(f"Detected input shape from first valid chunk: {x_shape}") + return x_shape + + LOGGER.error("No valid chunks found in the input file.") + raise ValueError("No valid chunks found in the input file.") \ No newline at end of file diff --git a/ideeplc/fine_tuning.py b/ideeplc/fine_tuning.py index 70f0588..dcda916 100644 --- a/ideeplc/fine_tuning.py +++ b/ideeplc/fine_tuning.py @@ -24,6 +24,8 @@ def __init__( validation_data=None, validation_split=0.1, patience=5, + num_workers=0, + pin_memory=False, ): """ Initialize the fine-tuner with the model and data loaders. @@ -38,6 +40,8 @@ def __init__( :param validation_data: Optional validation dataset. :param validation_split: Fraction of training data to use for validation. 
:param patience: Number of epochs with no improvement after which training will be stopped. + :param num_workers: Number of workers for the DataLoader. + :param pin_memory: Whether to pin memory in the DataLoader. """ self.model = model.to(device) self.train_data = train_data @@ -49,6 +53,8 @@ def __init__( self.validation_data = validation_data self.validation_split = validation_split self.patience = patience + self.num_workers = num_workers + self.pin_memory = pin_memory def _freeze_layers(self, layers_to_freeze): """ @@ -71,34 +77,52 @@ def prepare_data(self, data, shuffle=True): :param shuffle: Whether to shuffle the data. :return: DataLoader for the dataset. """ - return DataLoader(data, batch_size=self.batch_size, shuffle=shuffle) + return DataLoader( + data, + batch_size=self.batch_size, + shuffle=shuffle, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + ) def fine_tune(self, layers_to_freeze=None): """ Fine-tune the iDeepLC model on the training dataset. :param layers_to_freeze: List of layer names to freeze during fine-tuning. + :return: Best model based on validation loss. 
""" LOGGER.info("Starting fine-tuning...") + if layers_to_freeze: self._freeze_layers(layers_to_freeze) - optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, self.model.parameters()), + lr=self.learning_rate, + ) loss_fn = self.loss_function - # Prepare DataLoader - if self.validation_data: - dataloader_train = self.prepare_data(self.train_data) + + if self.validation_data is not None: + dataloader_train = self.prepare_data(self.train_data, shuffle=True) dataloader_val = self.prepare_data(self.validation_data, shuffle=False) else: - # Split the training data into training and validation sets train_size = int((1 - self.validation_split) * len(self.train_data)) val_size = len(self.train_data) - train_size + + if train_size == 0 or val_size == 0: + raise ValueError( + "Training dataset is too small for the requested validation split." + ) + train_dataset, val_dataset = torch.utils.data.random_split( self.train_data, [train_size, val_size] ) - dataloader_train = self.prepare_data(train_dataset) + dataloader_train = self.prepare_data(train_dataset, shuffle=True) dataloader_val = self.prepare_data(val_dataset, shuffle=False) + LOGGER.info(f"Training on {len(dataloader_train.dataset)} samples.") + LOGGER.info(f"Validating on {len(dataloader_val.dataset)} samples.") best_model = copy.deepcopy(self.model) best_loss = float("inf") @@ -107,15 +131,15 @@ def fine_tune(self, layers_to_freeze=None): for epoch in range(self.epochs): self.model.train() running_loss = 0.0 + for batch in dataloader_train: inputs, target = batch - inputs, target = inputs.to(self.device), target.to(self.device) + inputs = inputs.to(self.device, non_blocking=True) + target = target.to(self.device, non_blocking=True) - # Forward pass outputs = self.model(inputs.float()) loss = loss_fn(outputs, target.float().view(-1, 1)) - # Backward pass and optimization optimizer.zero_grad() loss.backward() optimizer.step() @@ 
-125,22 +149,24 @@ def fine_tune(self, layers_to_freeze=None): avg_loss = running_loss / len(dataloader_train.dataset) LOGGER.info(f"Epoch [{epoch + 1}/{self.epochs}], Loss: {avg_loss:.4f}") - # Validate the model after each epoch - if dataloader_val: - val_loss, _, _, _ = validate( - self.model, dataloader_val, loss_fn, self.device + val_loss, _, _, _ = validate( + self.model, dataloader_val, loss_fn, self.device + ) + + if val_loss < best_loss: + best_loss = val_loss + best_model = copy.deepcopy(self.model) + patience_counter = 0 + LOGGER.info(f"New best validation loss: {best_loss:.4f}") + else: + patience_counter += 1 + LOGGER.info( + f"No improvement in validation loss. Patience: {patience_counter}/{self.patience}" ) - if val_loss < best_loss: - best_loss = val_loss - best_model = copy.deepcopy(self.model) - patience_counter = 0 - LOGGER.info(f"New best validation loss: {best_loss:.4f}") - else: - patience_counter += 1 - - if patience_counter >= self.patience: - LOGGER.info("Early stopping triggered.") - break + + if patience_counter >= self.patience: + LOGGER.info("Early stopping triggered.") + break LOGGER.info("Fine-tuning complete.") - return best_model + return best_model \ No newline at end of file diff --git a/ideeplc/ideeplc_core.py b/ideeplc/ideeplc_core.py index 958f2b2..4639183 100644 --- a/ideeplc/ideeplc_core.py +++ b/ideeplc/ideeplc_core.py @@ -4,10 +4,9 @@ from pathlib import Path import sys from torch import nn -from torch.utils.data import DataLoader from ideeplc.model import MyNet from ideeplc.config import get_config -from ideeplc.data_initialize import data_initialize +from ideeplc.data_initialize import data_initialize, get_input_shape_from_first_chunk from ideeplc.predict import predict from ideeplc.figure import make_figures from ideeplc.fine_tuning import iDeepLCFineTuner @@ -39,9 +38,7 @@ def get_model_save_path(): Determines the correct directory and filename for saving the model. 
Appends a timestamp to the filename to prevent overwriting. - - Returns: - tuple: (model_save_path, model_dir) + :return: Tuple containing model_save_path and model_dir. """ timestamp = datetime.datetime.now().strftime("%m%d") model_dir = Path("ideeplc/models") / timestamp @@ -62,19 +59,25 @@ def main(args): """ Main function that executes training/evaluation for the iDeepLC package based on the provided arguments. - Args: - args (argparse.Namespace): Parsed arguments from the CLI. + :param args: Parsed arguments from the CLI. """ - LOGGER.info("Starting iDeepLC prediction...") + try: # Load configuration config = get_config() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + chunk_size = config.get("chunk_size", 10000) + batch_size = config["batch_size"] - # Initialize data + LOGGER.info(f"Using device: {device}") LOGGER.info(f"Loading data from {args.input}") - matrix_input, x_shape = data_initialize(csv_path=args.input) + + # For model initialization, only inspect the first valid chunk + x_shape = get_input_shape_from_first_chunk( + csv_path=args.input, chunk_size=chunk_size + ) + # Initialize model LOGGER.info("Initializing model") model = MyNet(x_shape=x_shape, config=config).to(device) @@ -84,11 +87,12 @@ def main(args): pretrained_model, model_dir = get_model_save_path() if args.model: try: - logging.info(f"Using user-specified model path: {args.model}") + LOGGER.info(f"Using user-specified model path: {args.model}") pretrained_model = Path(args.model) except Exception as e: LOGGER.error(f"Invalid model path provided: {e}") raise e + model.load_state_dict( torch.load(pretrained_model, map_location=device), strict=False ) @@ -96,6 +100,9 @@ def main(args): if args.finetune: LOGGER.info("Fine-tuning the model") + + matrix_input, _ = data_initialize(csv_path=args.input) + fine_tuner = iDeepLCFineTuner( model=model, train_data=matrix_input, @@ -103,28 +110,28 @@ def main(args): device=device, learning_rate=config["learning_rate"], 
epochs=config["epochs"], - batch_size=config["batch_size"], - validation_data=None, # No validation data provided for prediction + batch_size=batch_size, + validation_data=None, validation_split=0.1, - patience=5, + patience=20, ) model = fine_tuner.fine_tune(layers_to_freeze=config["layers_to_freeze"]) + torch.save(model.state_dict(), "finetuned_model_.pth") - dataloader_input = DataLoader( - matrix_input, batch_size=config["batch_size"], shuffle=False - ) # Prediction on provided data LOGGER.info("Starting prediction") pred_loss, pred_cor, pred_results, ground_truth = predict( model=model, - dataloader_input=dataloader_input, loss_fn=loss_function, device=device, calibrate=args.calibrate, input_file=args.input, save_results=args.save, + batch_size=batch_size, + chunk_size=chunk_size, ) LOGGER.info("Prediction completed.") + # Generate Figures make_figures( predictions=pred_results, @@ -137,4 +144,4 @@ def main(args): except Exception as e: LOGGER.error(f"An error occurred during execution: {e}") - raise e + raise e \ No newline at end of file diff --git a/ideeplc/model.py b/ideeplc/model.py index e86c6a5..f3f7d94 100644 --- a/ideeplc/model.py +++ b/ideeplc/model.py @@ -6,6 +6,7 @@ class MyNet(nn.Module): def __init__(self, x_shape, config): self.x_shape = x_shape + self.seq_len = self.x_shape[-1] super(MyNet, self).__init__() # first cnn layers for amino @@ -103,7 +104,7 @@ def __init__(self, x_shape, config): self.l4 = nn.ModuleList() self.l4.append( - nn.Linear(in_features=self.x_shape[2], out_features=config["fc_out"]) + nn.Linear(in_features=self.seq_len, out_features=config["fc_out"]) ) self.l4.append(nn.ReLU()) @@ -157,9 +158,9 @@ def __init__(self, x_shape, config): + config["cnn3_channels"] + config["cnn4_channels"] ) - * self.x_shape[2] + * self.seq_len + config["fc_out"] * 7 - + config["cnn2_channels"] * int(self.x_shape[2] / 2) + + config["cnn2_channels"] * int(self.seq_len / 2) ), out_features=int(config["fc2_out"]), ) @@ -196,7 +197,7 @@ def 
__init__(self, x_shape, config): def forward(self, x): x1 = x[:, :1, :] - x2 = x[:, 1:8, : int(self.x_shape[2] / 2)] + x2 = x[:, 1:8, : int(self.seq_len / 2)] x3 = x[:, 8:14, :] x4 = x[:, 14:21, :] x5 = x[:, 21:, :] diff --git a/ideeplc/predict.py b/ideeplc/predict.py index bc90455..6ca01bb 100644 --- a/ideeplc/predict.py +++ b/ideeplc/predict.py @@ -1,13 +1,14 @@ import os from pathlib import Path -from typing import Tuple import datetime +from typing import Tuple import numpy as np import pandas as pd import torch from torch import nn from torch.utils.data import DataLoader from ideeplc.calibrate import SplineTransformerCalibration +from ideeplc.data_initialize import data_initialize_chunked import logging LOGGER = logging.getLogger(__name__) @@ -18,6 +19,7 @@ def validate( ) -> Tuple[float, float, list, list]: """ Validate the model on a given dataset. + :param model: The trained model. :param dataloader: The DataLoader providing the validation/test data. :param loss_fn: The loss function to use. @@ -31,9 +33,9 @@ def validate( with torch.no_grad(): for inputs, labels in dataloader: - inputs, labels = inputs.to(device, non_blocking=True), labels.to( - device, non_blocking=True - ) + inputs = inputs.to(device, non_blocking=True) + labels = labels.to(device, non_blocking=True) + outputs_batch = model(inputs.float()) loss = loss_fn(outputs_batch, labels.float().view(-1, 1)) @@ -42,7 +44,12 @@ def validate( ground_truth.extend(labels.cpu().numpy().flatten()) avg_loss = total_loss / len(dataloader.dataset) - correlation = np.corrcoef(predictions, ground_truth)[0, 1] + + if len(predictions) > 1 and len(ground_truth) > 1: + correlation = np.corrcoef(predictions, ground_truth)[0, 1] + else: + correlation = np.nan + LOGGER.info( f"Validation complete. 
Loss: {avg_loss:.4f}, Correlation: {correlation:.4f}" ) @@ -52,84 +59,173 @@ def validate( def predict( model: nn.Module, - dataloader_input: DataLoader, loss_fn: nn.Module, device: torch.device, input_file: str, calibrate: bool, save_results: bool, + batch_size: int = None, + chunk_size: int = 10000, + dataloader_input: DataLoader = None, ): """ - Load a trained model and evaluate it on test datasets. + Load a trained model and evaluate it on test datasets in chunks. :param model: The trained model. - :param dataloader_input: Test dataset loader. :param loss_fn: Loss function. :param device: Computation device. :param input_file: Path to the input file containing peptide sequences. :param calibrate: If True, calibrates the results. :param save_results: If True, saves the evaluation results. + :param batch_size: Batch size used during prediction when reading from file. + :param chunk_size: Number of input rows to process per chunk. + :param dataloader_input: Optional prebuilt DataLoader for backward compatibility. :return: Loss, correlation, predictions, and ground truth values. """ - LOGGER.info("Starting prediction process.") + LOGGER.info( + f"Starting prediction process with batch size {batch_size} and chunk size {chunk_size}." 
+ ) + + all_predictions = [] + all_ground_truth = [] + total_loss = 0.0 + total_samples = 0 + + calibrated_preds = None + + timestamp = datetime.datetime.now().strftime("%Y%m%d") + input_file_name = os.path.splitext(os.path.basename(input_file))[0] + output_path = ( + Path("ideeplc_output") / f"{input_file_name}_predictions_{timestamp}.csv" + ) try: - # Validate on the primary test set - loss, correlation, predictions, ground_truth = validate( - model, dataloader_input, loss_fn, device - ) + if dataloader_input is not None: + LOGGER.info("Using provided dataloader_input for prediction.") + loss, correlation, all_predictions, all_ground_truth = validate( + model=model, + dataloader=dataloader_input, + loss_fn=loss_fn, + device=device, + ) + + if calibrate: + LOGGER.info("Fitting calibration model.") + calibration_model = SplineTransformerCalibration() + calibration_model.fit(all_ground_truth, all_predictions) + calibrated_preds = calibration_model.transform(all_predictions) + + if len(calibrated_preds) > 1 and len(all_ground_truth) > 1: + correlation = np.corrcoef(calibrated_preds, all_ground_truth)[0, 1] + else: + correlation = np.nan + + loss_calibrated = loss_fn( + torch.tensor(calibrated_preds).float().view(-1, 1), + torch.tensor(all_ground_truth).float().view(-1, 1), + ) + loss = loss_calibrated.item() + return loss, correlation, calibrated_preds, all_ground_truth + + return loss, correlation, all_predictions, all_ground_truth + + if batch_size is None: + raise ValueError("batch_size must be provided when dataloader_input is not used.") + + if save_results: + output_path.parent.mkdir(parents=True, exist_ok=True) + if output_path.exists(): + output_path.unlink() + + for chunk_idx, (df_chunk, dataset_chunk, x_shape) in enumerate( + data_initialize_chunked(csv_path=input_file, chunk_size=chunk_size), + start=1, + ): + LOGGER.info( + f"Processing chunk {chunk_idx} with {len(dataset_chunk)} entries and shape {x_shape}." 
+ ) + + dataloader_input = DataLoader( + dataset_chunk, + batch_size=batch_size, + shuffle=False, + ) + + chunk_loss, _, chunk_predictions, chunk_ground_truth = validate( + model=model, + dataloader=dataloader_input, + loss_fn=loss_fn, + device=device, + ) + + n_chunk = len(dataset_chunk) + total_loss += chunk_loss * n_chunk + total_samples += n_chunk + + all_predictions.extend(chunk_predictions) + all_ground_truth.extend(chunk_ground_truth) + + if save_results: + result_data = { + "sequences": df_chunk.get("seq", None), + "modifications": df_chunk.get("modifications", None), + "ground_truth": chunk_ground_truth, + "predictions": chunk_predictions, + } + + result_df = pd.DataFrame(result_data) + result_df.to_csv( + output_path, + mode="a", + index=False, + header=not output_path.exists(), + ) + LOGGER.info(f"Chunk {chunk_idx} results appended to {output_path}") + + if total_samples == 0: + LOGGER.error("No valid samples were processed during prediction.") + raise ValueError("No valid samples were processed during prediction.") + + loss = total_loss / total_samples + + if len(all_predictions) > 1 and len(all_ground_truth) > 1: + correlation = np.corrcoef(all_predictions, all_ground_truth)[0, 1] + else: + correlation = np.nan if calibrate: LOGGER.info("Fitting calibration model.") calibration_model = SplineTransformerCalibration() - calibration_model.fit(ground_truth, predictions) - calibrated_preds = calibration_model.transform(predictions) - correlation_preds = np.corrcoef(calibrated_preds, ground_truth)[0, 1] + calibration_model.fit(all_ground_truth, all_predictions) + calibrated_preds = calibration_model.transform(all_predictions) + + if len(calibrated_preds) > 1 and len(all_ground_truth) > 1: + correlation = np.corrcoef(calibrated_preds, all_ground_truth)[0, 1] + else: + correlation = np.nan loss_calibrated = loss_fn( torch.tensor(calibrated_preds).float().view(-1, 1), - torch.tensor(ground_truth).float().view(-1, 1), + 
torch.tensor(all_ground_truth).float().view(-1, 1), ) LOGGER.info(f"Calibration Loss: {loss_calibrated.item():.4f}") loss = loss_calibrated.item() - correlation = correlation_preds - # Save results - if save_results: - input_df = pd.read_csv(input_file) - # Extract sequences and modifications from the input file - sequences = input_df.get("seq", None) - modifications = input_df.get("modifications", None) - - timestamp = datetime.datetime.now().strftime("%Y%m%d") - input_file_name = os.path.splitext(os.path.basename(input_file))[0] - output_path = ( - Path("ideeplc_output") - / f"{input_file_name}_predictions_{timestamp}.csv" - ) - output_path.parent.mkdir( - parents=True, exist_ok=True - ) # Ensure the directory exists + if save_results: + saved_df = pd.read_csv(output_path) + saved_df["calibrated_predictions"] = calibrated_preds + saved_df.to_csv(output_path, index=False) + LOGGER.info(f"Calibrated results saved to {output_path}") - result_data = { - "sequences": sequences, - "modifications": modifications, - "ground_truth": ground_truth, - "predictions": predictions, - } - - if calibrate: - result_data["calibrated_predictions"] = calibrated_preds - - result_df = pd.DataFrame(result_data) - result_df.to_csv(output_path, index=False) - LOGGER.info(f"Results saved to {output_path}") + LOGGER.info( + f"Prediction complete. 
Loss: {loss:.4f}, Correlation: {correlation:.4f}" + ) if calibrate: - return loss, correlation, calibrated_preds, ground_truth + return loss, correlation, calibrated_preds, all_ground_truth else: - return loss, correlation, predictions, ground_truth + return loss, correlation, all_predictions, all_ground_truth except Exception as e: LOGGER.error(f"An error occurred during prediction: {e}") - raise e + raise e \ No newline at end of file diff --git a/ideeplc/utilities.py b/ideeplc/utilities.py index aecc7ea..3f7e08a 100644 --- a/ideeplc/utilities.py +++ b/ideeplc/utilities.py @@ -133,19 +133,51 @@ def encode_sequence_and_modification( if mods: for mod in mods: name = mod.name + + if name not in modifications_dict: + raise KeyError( + f"Modification '{name}' was not found in the modification feature dictionary." + ) + if aa not in modifications_dict[name]: + raise KeyError( + f"Modification '{name}' is not defined for amino acid '{aa}'." + ) + encoded[start_index : start_index + config.num_features, loc + 1] = ( list(modifications_dict[name][aa].values()) ) + if n_term: for mod in n_term: name = f"{mod.name}(N-T)" + + if name not in modifications_dict: + raise KeyError( + f"N-terminal modification '{name}' was not found in the modification feature dictionary." + ) + if sequence[0] not in modifications_dict[name]: + raise KeyError( + f"N-terminal modification '{name}' is not defined for amino acid '{sequence[0]}'." + ) + encoded[start_index : start_index + config.num_features, 1] = list( modifications_dict[name][sequence[0]].values() ) + if c_term: loc = len(parsed_sequence) + 1 for mod in c_term: name = mod.name + + if name not in modifications_dict: + raise KeyError( + f"C-terminal modification '{name}' was not found in the modification feature dictionary." + ) + if sequence[-1] not in modifications_dict[name]: + raise KeyError( + f"C-terminal modification '{name}' is not defined for amino acid '{sequence[-1]}'." 
+ ) + encoded[start_index : start_index + config.num_features, loc] = list( modifications_dict[name][sequence[-1]].values() ) @@ -272,7 +304,7 @@ def encode_sequence_one_hot(sequence: str) -> np.ndarray: def df_to_matrix( seqs: Union[str, List[str]], df: Optional[pd.DataFrame] = None ) -> ( - tuple[ndarray, list[Any], list[list[str | list[str] | int | Exception]]] + tuple[ndarray, list[Any], list[list[str | list[str] | int | Exception]]] | ndarray | Any ): """ Convert a peptide or a list of peptides to their matrix representation. If a DataFrame with 'tr' and 'predictions' columns is also provided, - the function returns a tuple: (matrix, tr_values, prediction_values, errors). + the function returns a tuple: (matrix, tr_values, errors). Otherwise, it returns only the encoded matrix representation. - Parameters - ---------- - seqs : str or List[str] - A single peptide sequence or a list of peptide sequences. - df : pd.DataFrame, optional - DataFrame with 'tr' and 'predictions' columns. Must align with the list of sequences. - - Returns - ------- - If `seqs` is a string: - np.ndarray : matrix representation of the peptide. - If `seqs` is a list and `df` is None: - np.ndarray : stacked matrix representation of all peptides. - If `seqs` is a list and `df` is given: - Tuple[np.ndarray, List[float], List] : - - stacked matrix - - list of tr values - - list of errors (with peptide, index, and exception) + :param seqs: A single peptide sequence or a list of peptide sequences. + :param df: DataFrame with 'tr' column. Must align with the list of sequences. + :return: Encoded matrix representation, and optionally tr values and errors. 
""" - # Normalize to list of sequences single_input = isinstance(seqs, str) if single_input: seqs = [seqs] @@ -317,10 +333,12 @@ def df_to_matrix( for idx, peptide in enumerate(seqs): try: + step = "peptide_parser" parsed_sequence, modifiers, sequence, modifications = peptide_parser( peptide ) + step = "encode_sequence_and_modification" encode_seq_mod = encode_sequence_and_modification( sequence, parsed_sequence, @@ -329,10 +347,13 @@ def df_to_matrix( modifiers["n_term"], modifiers["c_term"], ) + + step = "encode_diamino_sequence_and_modification" encode_di_seq_mod = encode_diamino_sequence_and_modification( encode_seq_mod=encode_seq_mod ) + step = "encode_sequence_and_modification_atomic" encode_seq_mod_atomic = encode_sequence_and_modification_atomic( sequence, parsed_sequence, @@ -340,13 +361,19 @@ def df_to_matrix( modifiers["n_term"], modifiers["c_term"], ) + + step = "encode_diamino_sequence_and_modification_atomic" encode_di_seq_mod_atomic = encode_diamino_sequence_and_modification_atomic( encode_seq_mod_atomic ) + step = "encode_sequence_metadata" encode_seq_meta = encode_sequence_metadata(sequence, encode_seq_mod_atomic) + + step = "encode_sequence_one_hot" seq_hot = encode_sequence_one_hot(sequence) + step = "combine_encoded_features" peptide_encoded = ( encode_seq_mod + encode_di_seq_mod @@ -362,10 +389,28 @@ def df_to_matrix( tr.append(df["tr"].iloc[idx]) except Exception as e: - errors.append([peptide, idx, e]) + errors.append([peptide, idx, step, e]) continue + if len(seqs_encoded) == 0: + if errors: + first_error = errors[0] + raise ValueError( + f"All peptides failed during encoding. First failing peptide: '{first_error[0]}' at index {first_error[1]}. Failing step: {first_error[2]}. Original error: {first_error[3]}" + ) + raise ValueError( + "All peptides failed during encoding. No encoded arrays were created." 
+ ) + seqs_stack = np.stack(seqs_encoded) + + if errors: + print("Encoding errors encountered:") + for peptide, idx, step, err in errors[:10]: + print( + f" - index {idx}, step {step}, peptide '{peptide}': {err}" + ) + if single_input: return seqs_stack[0] elif df is not None: