Skip to content

Commit b65fb48

Browse files
Add Py_AASequence wrapper class with reverse and shuffle operations
Co-authored-by: timosachsenberg <5803621+timosachsenberg@users.noreply.github.com>
1 parent 1e899de commit b65fb48

File tree

3 files changed

+541
-0
lines changed

3 files changed

+541
-0
lines changed

openms_python/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from .py_featuremap import Py_FeatureMap
3030
from .py_consensusmap import Py_ConsensusMap
3131
from .py_experimentaldesign import Py_ExperimentalDesign
32+
from .py_aasequence import Py_AASequence
3233
from .py_identifications import (
3334
ProteinIdentifications,
3435
PeptideIdentifications,
@@ -107,6 +108,7 @@ def get_example(name: str, *, load: bool = False, target_dir: Union[str, Path, N
107108
"Py_FeatureMap",
108109
"Py_ConsensusMap",
109110
"Py_ExperimentalDesign",
111+
"Py_AASequence",
110112
"ProteinIdentifications",
111113
"PeptideIdentifications",
112114
"Identifications",

openms_python/py_aasequence.py

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
"""Pythonic wrapper for pyOpenMS AASequence class."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Optional
6+
import pyopenms as oms
7+
8+
9+
class Py_AASequence:
    """
    A Pythonic wrapper around pyOpenMS AASequence.

    This class provides intuitive properties and methods for working with
    amino acid sequences, including common operations like reversing and
    shuffling sequences with optional enzyme constraints.

    Instances support ``len()``, iteration, integer indexing (including
    negative positions), equality comparison and hashing; equality and the
    hash are both derived from the string form of the sequence.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> print(seq.sequence)
        PEPTIDE
        >>> print(seq.mono_weight)
        799.36...
        >>> reversed_seq = seq.reverse()
        >>> print(reversed_seq.sequence)
        EDITPEP
        >>> shuffled_seq = seq.shuffle(enzyme="Trypsin")
        >>> print(shuffled_seq.sequence)  # Shuffled while preserving cleavage sites
    """

    def __init__(self, native_sequence: Optional[oms.AASequence] = None):
        """
        Initialize Py_AASequence wrapper.

        Args:
            native_sequence: pyOpenMS AASequence object. If None, creates empty sequence.
        """
        self._sequence = native_sequence if native_sequence is not None else oms.AASequence()
        # The DecoyGenerator is only built on first use, see _get_decoy_generator().
        self._decoy_generator = None

    @classmethod
    def from_string(cls, sequence_str: str) -> Py_AASequence:
        """
        Create AASequence from string representation.

        Args:
            sequence_str: String representation of the amino acid sequence.
                Can include modifications in OpenMS format.

        Returns:
            Py_AASequence: New wrapped sequence.

        Example:
            >>> seq = Py_AASequence.from_string("PEPTIDE")
            >>> seq = Py_AASequence.from_string("PEPTIDEM(Oxidation)")
        """
        return cls(oms.AASequence.fromString(sequence_str))

    # ==================== Pythonic Properties ====================

    @property
    def native(self) -> oms.AASequence:
        """Return the underlying pyOpenMS AASequence."""
        return self._sequence

    @property
    def sequence(self) -> str:
        """Get the sequence as a string."""
        return self._sequence.toString()

    @property
    def unmodified_sequence(self) -> str:
        """Get the sequence without modifications."""
        return self._sequence.toUnmodifiedString()

    @property
    def mono_weight(self) -> float:
        """Get monoisotopic weight."""
        return self._sequence.getMonoWeight()

    @property
    def average_weight(self) -> float:
        """Get average weight."""
        return self._sequence.getAverageWeight()

    @property
    def formula(self) -> str:
        """Get molecular formula."""
        return self._sequence.getFormula().toString()

    @property
    def is_modified(self) -> bool:
        """Check if sequence has any modifications."""
        return self._sequence.isModified()

    @property
    def has_n_terminal_modification(self) -> bool:
        """Check if sequence has N-terminal modification."""
        return self._sequence.hasNTerminalModification()

    @property
    def has_c_terminal_modification(self) -> bool:
        """Check if sequence has C-terminal modification."""
        return self._sequence.hasCTerminalModification()

    # ==================== Decoy Generation ====================

    def _get_decoy_generator(self) -> oms.DecoyGenerator:
        """Get or create DecoyGenerator instance (lazy initialization)."""
        if self._decoy_generator is None:
            self._decoy_generator = oms.DecoyGenerator()
        return self._decoy_generator

    def reverse(self) -> Py_AASequence:
        """
        Reverse the entire amino acid sequence.

        Returns:
            Py_AASequence: New sequence with reversed amino acids.

        Example:
            >>> seq = Py_AASequence.from_string("PEPTIDE")
            >>> reversed_seq = seq.reverse()
            >>> print(reversed_seq.sequence)
            EDITPEP
        """
        dg = self._get_decoy_generator()
        reversed_native = dg.reverseProtein(self._sequence)
        return Py_AASequence(reversed_native)

    def reverse_with_enzyme(self, enzyme: str = "Trypsin") -> Py_AASequence:
        """
        Reverse peptide sequences between enzymatic cleavage sites.

        This is useful for creating decoy sequences that maintain the
        same enzymatic cleavage pattern as the target.

        Args:
            enzyme: Name of the enzyme (e.g., "Trypsin", "Lys-C", "Asp-N").
                Default is "Trypsin".

        Returns:
            Py_AASequence: New sequence with reversed peptides between cleavage sites.

        Example:
            >>> seq = Py_AASequence.from_string("PEPTIDERK")
            >>> reversed_seq = seq.reverse_with_enzyme("Trypsin")
            >>> # K and R are cleavage sites, so segments are reversed separately
        """
        dg = self._get_decoy_generator()
        reversed_native = dg.reversePeptides(self._sequence, enzyme)
        return Py_AASequence(reversed_native)

    def shuffle(
        self, enzyme: str = "Trypsin", max_attempts: int = 100, seed: Optional[int] = None
    ) -> Py_AASequence:
        """
        Shuffle peptide sequences between enzymatic cleavage sites.

        This creates a decoy sequence by shuffling amino acids within
        peptide segments defined by enzyme cleavage sites, attempting
        to minimize sequence identity with the original.

        Args:
            enzyme: Name of the enzyme (e.g., "Trypsin", "Lys-C", "Asp-N").
                Default is "Trypsin".
            max_attempts: Maximum number of shuffle attempts to minimize
                sequence identity. Default is 100.
            seed: Optional random seed for reproducible shuffling.

        Returns:
            Py_AASequence: New shuffled sequence.

        Example:
            >>> seq = Py_AASequence.from_string("PEPTIDERK")
            >>> shuffled_seq = seq.shuffle(enzyme="Trypsin", seed=42)
            >>> # Amino acids are shuffled within enzyme-defined segments
        """
        dg = self._get_decoy_generator()
        # Compare against None so that a seed of 0 is still applied.
        if seed is not None:
            dg.setSeed(seed)
        shuffled_native = dg.shufflePeptides(self._sequence, enzyme, max_attempts)
        return Py_AASequence(shuffled_native)

    # ==================== Sequence Operations ====================

    def __len__(self) -> int:
        """Get sequence length."""
        return self._sequence.size()

    def __str__(self) -> str:
        """String representation."""
        return self.sequence

    def __repr__(self) -> str:
        """Developer-friendly representation (long sequences are truncated)."""
        seq_str = self.sequence
        if len(seq_str) > 20:
            seq_str = seq_str[:17] + "..."
        return f"Py_AASequence('{seq_str}')"

    def __eq__(self, other: object) -> bool:
        """
        Check equality based on sequence string.

        Returns ``NotImplemented`` for foreign types so that Python can try
        the reflected comparison of the other operand; when that also
        declines, ``==`` evaluates to False.
        """
        if not isinstance(other, Py_AASequence):
            return NotImplemented
        return self.sequence == other.sequence

    def __hash__(self) -> int:
        """
        Hash based on the sequence string, consistent with ``__eq__``.

        Defining ``__eq__`` alone would make instances unhashable. The hash
        follows the current sequence string, so the object exposed by
        ``native`` should not be mutated while this wrapper is stored in a
        set or used as a dict key.
        """
        return hash(self.sequence)

    def __getitem__(self, index: int) -> str:
        """
        Get residue at position.

        Args:
            index: Position in the sequence (0-based). Negative values count
                from the end, as with built-in sequences.

        Returns:
            str: Single letter amino acid code.

        Raises:
            IndexError: If the position lies outside the sequence.
        """
        length = len(self)
        # Normalise negative positions before the bounds check.
        position = index + length if index < 0 else index
        if position < 0 or position >= length:
            raise IndexError(f"Index {index} out of range for sequence of length {length}")
        residue = self._sequence.getResidue(position)
        return residue.getOneLetterCode()

    def __iter__(self):
        """Iterate over residues, yielding one-letter codes in order."""
        for position in range(len(self)):
            yield self[position]

    # ==================== Additional Utilities ====================

    def get_mz(self, charge: int) -> float:
        """
        Get m/z value for given charge state.

        Args:
            charge: Charge state (must be > 0).

        Returns:
            float: m/z value.

        Example:
            >>> seq = Py_AASequence.from_string("PEPTIDE")
            >>> mz = seq.get_mz(2)  # doubly charged
        """
        return self._sequence.getMZ(charge)

    def has_substring(self, substring: str) -> bool:
        """
        Check if sequence contains a substring.

        Args:
            substring: Amino acid sequence to search for.

        Returns:
            bool: True if substring is present.
        """
        return self._sequence.hasSubsequence(oms.AASequence.fromString(substring))

    def has_prefix(self, prefix: str) -> bool:
        """
        Check if sequence starts with a prefix.

        Args:
            prefix: Amino acid sequence to check.

        Returns:
            bool: True if sequence starts with prefix.
        """
        return self._sequence.hasPrefix(oms.AASequence.fromString(prefix))

    def has_suffix(self, suffix: str) -> bool:
        """
        Check if sequence ends with a suffix.

        Args:
            suffix: Amino acid sequence to check.

        Returns:
            bool: True if sequence ends with suffix.
        """
        return self._sequence.hasSuffix(oms.AASequence.fromString(suffix))

0 commit comments

Comments
 (0)