Skip to content

Commit 079586f

Browse files
authored
Merge pull request #175 from singjc/add/im_boundaries_to_tsv_export
Add/im boundaries to tsv export
2 parents 4d0156c + d56378c commit 079586f

22 files changed

Lines changed: 549 additions & 250 deletions

pyprophet/io/export/osw.py

Lines changed: 140 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,24 @@
11
import os
2-
import pickle
3-
from shutil import copyfile
4-
import sqlite3
5-
from typing import Literal, Tuple
62
import re
3+
import sqlite3
4+
from typing import Tuple
5+
6+
import click
77
import duckdb
8-
import pandas as pd
98
import numpy as np
10-
import click
9+
import pandas as pd
1110
from loguru import logger
11+
12+
from ..._config import ExportIOConfig
13+
from .._base import BaseOSWReader, BaseOSWWriter
1214
from ..util import (
1315
check_sqlite_table,
14-
check_duckdb_table,
15-
unimod_to_codename,
16-
write_scores_sql_command,
17-
load_sqlite_scanner,
1816
get_table_columns,
1917
get_table_columns_with_types,
18+
load_sqlite_scanner,
19+
unimod_to_codename,
20+
write_scores_sql_command,
2021
)
21-
from .._base import BaseOSWReader, BaseOSWWriter
22-
from ..._config import ExportIOConfig
2322

2423

2524
class OSWReader(BaseOSWReader):
@@ -167,10 +166,67 @@ def _check_alignment_presence(self, con):
167166
con, "SCORE_ALIGNMENT"
168167
)
169168

169+
def _has_im_boundaries(self, con) -> bool:
170+
"""Return True if the FEATURE table contains IM boundary columns.
171+
172+
Older OSW files may not have these columns; this helper centralises the
173+
PRAGMA check so callers don't duplicate the logic.
174+
"""
175+
try:
176+
cols = [
177+
r[1] for r in con.execute("PRAGMA table_info('FEATURE')").fetchall()
178+
]
179+
except Exception:
180+
return False
181+
return "EXP_IM_LEFTWIDTH" in cols and "EXP_IM_RIGHTWIDTH" in cols
182+
183+
def _has_im(self, con) -> bool:
184+
"""Return True if the FEATURE table contains the EXP_IM column.
185+
186+
Older OSW files may not have this column; centralise the PRAGMA
187+
check so callers don't duplicate the logic.
188+
"""
189+
try:
190+
cols = [
191+
r[1] for r in con.execute("PRAGMA table_info('FEATURE')").fetchall()
192+
]
193+
except Exception:
194+
return False
195+
return "EXP_IM" in cols
196+
170197
def _read_unscored_data(self, con):
171198
"""Read data from unscored files."""
172199
score_sql = self._build_score_sql(con)
173200

201+
# IM columns may or may not be present; centralised checks
202+
has_im_boundaries = self._has_im_boundaries(con)
203+
has_im = self._has_im(con)
204+
205+
# Compose EXP_IM (or NULL) plus IM boundary columns (or NULLs)
206+
im_cols_sql = (
207+
(
208+
"""FEATURE.EXP_IM AS EXP_IM,
209+
FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
210+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth"""
211+
)
212+
if has_im and has_im_boundaries
213+
else (
214+
"""FEATURE.EXP_IM AS EXP_IM,
215+
NULL AS IM_leftWidth,
216+
NULL AS IM_rightWidth"""
217+
)
218+
if has_im and not has_im_boundaries
219+
else (
220+
"""NULL AS EXP_IM,
221+
FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
222+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth"""
223+
)
224+
if (not has_im) and has_im_boundaries
225+
else """NULL AS EXP_IM,
226+
NULL AS IM_leftWidth,
227+
NULL AS IM_rightWidth"""
228+
)
229+
174230
query = f"""
175231
SELECT
176232
RUN.ID AS id_run,
@@ -191,7 +247,8 @@ def _read_unscored_data(self, con):
191247
FEATURE_MS1.AREA_INTENSITY AS aggr_prec_Peak_Area,
192248
FEATURE_MS1.APEX_INTENSITY AS aggr_prec_Peak_Apex,
193249
FEATURE.LEFT_WIDTH AS leftWidth,
194-
FEATURE.RIGHT_WIDTH AS rightWidth
250+
FEATURE.RIGHT_WIDTH AS rightWidth,
251+
{im_cols_sql}
195252
{score_sql}
196253
FROM PRECURSOR
197254
INNER JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
@@ -224,6 +281,34 @@ def _read_peptidoform_data(self, con, cfg):
224281
"""Read data with peptidoform IPF information."""
225282
score_ms1_pep, link_ms1 = self._get_ms1_score_info(con)
226283

284+
# IM columns may or may not be present; centralised checks
285+
has_im_boundaries = self._has_im_boundaries(con)
286+
has_im = self._has_im(con)
287+
288+
im_cols_sql = (
289+
(
290+
"""FEATURE.EXP_IM AS EXP_IM,
291+
FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
292+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth,"""
293+
)
294+
if has_im and has_im_boundaries
295+
else (
296+
"""FEATURE.EXP_IM AS EXP_IM,
297+
NULL AS IM_leftWidth,
298+
NULL AS IM_rightWidth,"""
299+
)
300+
if has_im and not has_im_boundaries
301+
else (
302+
"""NULL AS EXP_IM,
303+
FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
304+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth,"""
305+
)
306+
if (not has_im) and has_im_boundaries
307+
else """NULL AS EXP_IM,
308+
NULL AS IM_leftWidth,
309+
NULL AS IM_rightWidth,"""
310+
)
311+
227312
query = f"""
228313
SELECT RUN.ID AS id_run,
229314
PEPTIDE.ID AS id_peptide,
@@ -247,6 +332,7 @@ def _read_peptidoform_data(self, con, cfg):
247332
FEATURE_MS1.APEX_INTENSITY AS aggr_prec_Peak_Apex,
248333
FEATURE.LEFT_WIDTH AS leftWidth,
249334
FEATURE.RIGHT_WIDTH AS rightWidth,
335+
{im_cols_sql}
250336
{score_ms1_pep} AS ms1_pep,
251337
SCORE_MS2.PEP AS ms2_pep,
252338
SCORE_IPF.PRECURSOR_PEAKGROUP_PEP AS precursor_pep,
@@ -275,6 +361,34 @@ def _read_augmented_data(self, con, cfg):
275361
"""Read standard data augmented with IPF information."""
276362
score_ms1_pep, link_ms1 = self._get_ms1_score_info(con)
277363

364+
# IM columns may or may not be present; centralised checks
365+
has_im_boundaries = self._has_im_boundaries(con)
366+
has_im = self._has_im(con)
367+
368+
im_cols_sql = (
369+
(
370+
"""FEATURE.EXP_IM AS EXP_IM,
371+
FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
372+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth,"""
373+
)
374+
if has_im and has_im_boundaries
375+
else (
376+
"""FEATURE.EXP_IM AS EXP_IM,
377+
NULL AS IM_leftWidth,
378+
NULL AS IM_rightWidth,"""
379+
)
380+
if has_im and not has_im_boundaries
381+
else (
382+
"""NULL AS EXP_IM,
383+
FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
384+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth,"""
385+
)
386+
if (not has_im) and has_im_boundaries
387+
else """NULL AS EXP_IM,
388+
NULL AS IM_leftWidth,
389+
NULL AS IM_rightWidth,"""
390+
)
391+
278392
query = f"""
279393
SELECT RUN.ID AS id_run,
280394
PEPTIDE.ID AS id_peptide,
@@ -298,6 +412,7 @@ def _read_augmented_data(self, con, cfg):
298412
FEATURE_MS1.APEX_INTENSITY AS aggr_prec_Peak_Apex,
299413
FEATURE.LEFT_WIDTH AS leftWidth,
300414
FEATURE.RIGHT_WIDTH AS rightWidth,
415+
{im_cols_sql}
301416
SCORE_MS2.RANK AS peak_group_rank,
302417
SCORE_MS2.SCORE AS d_score,
303418
SCORE_MS2.QVALUE AS m_score,
@@ -326,6 +441,17 @@ def _read_standard_data(self, con, cfg):
326441
# Check if we should attempt alignment integration
327442
use_alignment = cfg.use_alignment and self._check_alignment_presence(con)
328443

444+
# IM boundary columns may or may not be present; centralised check
445+
has_im_boundaries = self._has_im_boundaries(con)
446+
447+
im_cols_sql = (
448+
"""FEATURE.EXP_IM_LEFTWIDTH AS IM_leftWidth,
449+
FEATURE.EXP_IM_RIGHTWIDTH AS IM_rightWidth,"""
450+
if has_im_boundaries
451+
else """NULL AS IM_leftWidth,
452+
NULL AS IM_rightWidth,"""
453+
)
454+
329455
# First, get features that pass MS2 QVALUE threshold
330456
query = f"""
331457
SELECT RUN.ID AS id_run,
@@ -350,6 +476,7 @@ def _read_standard_data(self, con, cfg):
350476
FEATURE_MS1.APEX_INTENSITY AS aggr_prec_Peak_Apex,
351477
FEATURE.LEFT_WIDTH AS leftWidth,
352478
FEATURE.RIGHT_WIDTH AS rightWidth,
479+
{im_cols_sql}
353480
SCORE_MS2.RANK AS peak_group_rank,
354481
SCORE_MS2.SCORE AS d_score,
355482
SCORE_MS2.QVALUE AS m_score,

pyprophet/io/export/parquet.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,26 @@ def _read_unscored_data(self, con) -> pd.DataFrame:
9595
"""
9696
feature_vars_sql = self._build_feature_vars_sql()
9797

98+
# IM columns may or may not be present in the parquet file
99+
has_im = "EXP_IM" in self._columns
100+
has_im_boundaries = (
101+
"IM_leftWidth" in self._columns and "IM_rightWidth" in self._columns
102+
)
103+
104+
im_cols_sql = (
105+
(
106+
"EXP_IM AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth"
107+
)
108+
if has_im and has_im_boundaries
109+
else ("EXP_IM AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth")
110+
if has_im and not has_im_boundaries
111+
else (
112+
"NULL AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth"
113+
)
114+
if (not has_im) and has_im_boundaries
115+
else "NULL AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth"
116+
)
117+
98118
query = f"""
99119
SELECT
100120
RUN_ID AS id_run,
@@ -115,7 +135,8 @@ def _read_unscored_data(self, con) -> pd.DataFrame:
115135
FEATURE_MS1_AREA_INTENSITY AS aggr_prec_Peak_Area,
116136
FEATURE_MS1_APEX_INTENSITY AS aggr_prec_Peak_Apex,
117137
LEFT_WIDTH AS leftWidth,
118-
RIGHT_WIDTH AS rightWidth
138+
RIGHT_WIDTH AS rightWidth,
139+
{im_cols_sql}
119140
{feature_vars_sql}
120141
FROM data
121142
WHERE PROTEIN_ID IS NOT NULL -- Filter to precursor rows
@@ -129,6 +150,26 @@ def _read_peptidoform_data(self, con) -> pd.DataFrame:
129150
"""
130151
score_ms1_pep, _link_ms1 = self._get_ms1_score_info()
131152

153+
# IM columns may or may not be present in the parquet file
154+
has_im = "EXP_IM" in self._columns
155+
has_im_boundaries = (
156+
"IM_leftWidth" in self._columns and "IM_rightWidth" in self._columns
157+
)
158+
159+
# Select each IM column when the parquet file has it, otherwise emit NULL so
# the output schema stays stable. The fragment is spliced into the middle of
# the SELECT list (further columns follow it in the query), so every variant
# must end with a trailing comma or the generated SQL is a syntax error.
im_cols_sql = (
    "EXP_IM AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth,"
    if has_im and has_im_boundaries
    else "EXP_IM AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth,"
    if has_im
    else "NULL AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth,"
    if has_im_boundaries
    else "NULL AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth,"
)
172+
132173
query = f"""
133174
SELECT
134175
RUN_ID AS id_run,
@@ -153,6 +194,7 @@ def _read_peptidoform_data(self, con) -> pd.DataFrame:
153194
FEATURE_MS1_APEX_INTENSITY AS aggr_prec_Peak_Apex,
154195
LEFT_WIDTH AS leftWidth,
155196
RIGHT_WIDTH AS rightWidth,
197+
{im_cols_sql}
156198
{score_ms1_pep} AS ms1_pep,
157199
SCORE_MS2_PEP AS ms2_pep,
158200
SCORE_IPF_PRECURSOR_PEAKGROUP_PEP AS precursor_pep,
@@ -175,6 +217,26 @@ def _read_augmented_data(self, con) -> pd.DataFrame:
175217
"""
176218
score_ms1_pep, _link_ms1 = self._get_ms1_score_info()
177219

220+
# IM columns may or may not be present in the parquet file
221+
has_im = "EXP_IM" in self._columns
222+
has_im_boundaries = (
223+
"IM_leftWidth" in self._columns and "IM_rightWidth" in self._columns
224+
)
225+
226+
# Select each IM column when the parquet file has it, otherwise emit NULL so
# the output schema stays stable. The fragment is spliced into the middle of
# the SELECT list (further columns follow it in the query), so every variant
# must end with a trailing comma or the generated SQL is a syntax error.
im_cols_sql = (
    "EXP_IM AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth,"
    if has_im and has_im_boundaries
    else "EXP_IM AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth,"
    if has_im
    else "NULL AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth,"
    if has_im_boundaries
    else "NULL AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth,"
)
239+
178240
# First get main data
179241
query = f"""
180242
SELECT
@@ -200,6 +262,7 @@ def _read_augmented_data(self, con) -> pd.DataFrame:
200262
FEATURE_MS1_APEX_INTENSITY AS aggr_prec_Peak_Apex,
201263
LEFT_WIDTH AS leftWidth,
202264
RIGHT_WIDTH AS rightWidth,
265+
{im_cols_sql}
203266
SCORE_MS2_PEAK_GROUP_RANK AS peak_group_rank,
204267
SCORE_MS2_SCORE AS d_score,
205268
SCORE_MS2_Q_VALUE AS m_score,
@@ -262,6 +325,26 @@ def _read_standard_data(self, con) -> pd.DataFrame:
262325
use_alignment = self.config.use_alignment and self._has_alignment
263326

264327
# First, get features that pass MS2 QVALUE threshold
328+
# IM columns may or may not be present in the parquet file
329+
has_im = "EXP_IM" in self._columns
330+
has_im_boundaries = (
331+
"IM_leftWidth" in self._columns and "IM_rightWidth" in self._columns
332+
)
333+
334+
im_cols_sql = (
335+
(
336+
"EXP_IM AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth,"
337+
)
338+
if has_im and has_im_boundaries
339+
else ("EXP_IM AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth,")
340+
if has_im and not has_im_boundaries
341+
else (
342+
"NULL AS EXP_IM, IM_leftWidth AS IM_leftWidth, IM_rightWidth AS IM_rightWidth,"
343+
)
344+
if (not has_im) and has_im_boundaries
345+
else "NULL AS EXP_IM, NULL AS IM_leftWidth, NULL AS IM_rightWidth,"
346+
)
347+
265348
query = f"""
266349
SELECT
267350
RUN_ID AS id_run,
@@ -286,6 +369,7 @@ def _read_standard_data(self, con) -> pd.DataFrame:
286369
FEATURE_MS1_APEX_INTENSITY AS aggr_prec_Peak_Apex,
287370
LEFT_WIDTH AS leftWidth,
288371
RIGHT_WIDTH AS rightWidth,
372+
{im_cols_sql}
289373
SCORE_MS2_PEAK_GROUP_RANK AS peak_group_rank,
290374
SCORE_MS2_SCORE AS d_score,
291375
SCORE_MS2_Q_VALUE AS m_score,
@@ -334,6 +418,7 @@ def _read_standard_data(self, con) -> pd.DataFrame:
334418
aligned_ids_df = pd.DataFrame({"id": new_aligned_ids})
335419
con.register("aligned_ids_temp", aligned_ids_df)
336420

421+
# For recovered aligned features include IM columns the same way
337422
aligned_query = f"""
338423
SELECT
339424
RUN_ID AS id_run,
@@ -358,6 +443,7 @@ def _read_standard_data(self, con) -> pd.DataFrame:
358443
FEATURE_MS1_APEX_INTENSITY AS aggr_prec_Peak_Apex,
359444
LEFT_WIDTH AS leftWidth,
360445
RIGHT_WIDTH AS rightWidth,
446+
{im_cols_sql}
361447
SCORE_MS2_PEAK_GROUP_RANK AS peak_group_rank,
362448
SCORE_MS2_SCORE AS d_score,
363449
SCORE_MS2_Q_VALUE AS m_score

0 commit comments

Comments
 (0)