66 changes: 37 additions & 29 deletions pyprophet/export.py
@@ -1,16 +1,21 @@
import pandas as pd
import numpy as np
import sqlite3
Contributor

Do we still need sqlite3?

Contributor Author

Good point, we probably don't need sqlite3 for export.

import duckdb
from duckdb_extensions import extension_importer
import click
import os

from .data_handling import check_sqlite_table
from .data_handling import write_scores_sql_command
from .report import plot_scores

## ensure proper extension installed
Contributor

Can you add a bit more info on why this extension is needed?

Contributor Author

The duckdb base installation does not come with sqlite3 support: after duckdb is installed, something like db.install_extension("sqlite3") has to be executed, or else the program crashes.

However, this command cannot be run in Singularity containers, because it modifies the container. The workaround used here is to install the sqlite3 extension with pip (as a community extension).
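
To make the two paths concrete, here is a minimal sketch, assuming a duckdb Python connection; the merged.osw filename is just a placeholder, and the extension alias may differ between duckdb versions:

import duckdb

# Option A: install the SQLite extension at run time. This downloads the
# extension and writes it into the environment, which is why it fails inside
# read-only Singularity containers.
db = duckdb.connect("merged.osw")   # placeholder filename
db.install_extension("sqlite")      # alias for the sqlite_scanner extension
db.load_extension("sqlite")

# Option B (the route taken in this PR): ship the extension as a pip wheel
# (duckdb-extension-sqlite-scanner) and register it before connecting, so
# nothing has to be downloaded or written at run time.
from duckdb_extensions import extension_importer
extension_importer.import_extension("sqlite_scanner")
db = duckdb.connect("merged.osw")

Option B is what the module-level import_extension call in this diff does.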

Contributor

Hmm, how stable is that? I tried to install the community extension on Cedar, but it fails to find a matching distribution:

$ pip install duckdb-extension-sqlite-scanner
Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/x86-64-v3, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo2023/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic
ERROR: Could not find a version that satisfies the requirement duckdb-extension-sqlite-scanner (from versions: none)
ERROR: No matching distribution found for duckdb-extension-sqlite-scanner

If I try installing from the wheel and modify the filename (since the manylinux tag does not get recognized), it then fails when trying to load the extension:

extension_importer.import_extension("sqlite_scanner")
  File "/project/6011811/singjust/bin/py310/lib/python3.10/site-packages/duckdb_extensions/extension_importer.py", line 24, in import_extension
    duckdb.sql(f"INSTALL '{extension_file}'")
duckdb.duckdb.IOException: IO Error: Failed to install '/project/6011811/singjust/bin/py310/lib/python3.10/site-packages/duckdb_extension_sqlite_scanner/extensions/v1.2.0/sqlite_scanner.duckdb_extension'
The file was built for the platform 'linux_amd64_gcc4', but we can only load extensions built for platform 'linux_amd64'.

Contributor Author

I am not sure how stable it is, but I have had some problems with duckdb on the cluster in the past. I guess it is just the way it is built.

Contributor

Hmm, I think we will have to put this on hold for now, or implement it as an extra feature for anyone who wants to use a version of pyprophet with duckdb.

Contributor Author

OK, sounds good. When I have time I will implement duckdb as the default and sqlite3 as a fallback.

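As a rough illustration of that plan, a connection helper could look something like the sketch below; the connect_osw name and structure are hypothetical and not part of this PR:

import sqlite3
import duckdb

def connect_osw(infile):
    # Prefer duckdb with the sqlite_scanner extension; fall back to plain
    # sqlite3 if the extension wheel is unavailable for the platform, as
    # reported above for the cluster environment.
    try:
        from duckdb_extensions import extension_importer
        extension_importer.import_extension("sqlite_scanner")
        return duckdb.connect(infile), "duckdb"
    except Exception:
        return sqlite3.connect(infile), "sqlite3"

For now, the diff below keeps the module-level import_extension call as is.
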
extension_importer.import_extension("sqlite_scanner")

def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_transition_pep, ipf, ipf_max_peptidoform_pep, max_rs_peakgroup_qvalue, peptide, max_global_peptide_qvalue, protein, max_global_protein_qvalue):

condb = duckdb.connect(infile)
con = sqlite3.connect(infile)

# output for merged but not scored pyprophet input
@@ -29,7 +34,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
score_sql = ", " + score_sql # add comma at the beginning to fit to statement
score_sql = score_sql[:-2] # remove additional space and comma from the end of the string

data = pd.read_sql_query("""
data = condb.sql("""
SELECT
RUN.ID AS id_run,
PEPTIDE.ID AS id_peptide,
@@ -40,8 +45,8 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
FEATURE.EXP_RT AS RT,
FEATURE.EXP_RT - FEATURE.DELTA_RT AS assay_rt,
FEATURE.DELTA_RT AS delta_rt,
PRECURSOR.LIBRARY_RT AS assay_RT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_RT,
PRECURSOR.LIBRARY_RT AS assay_iRT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_iRT,
FEATURE.ID AS id,
PRECURSOR.CHARGE AS Charge,
PRECURSOR.PRECURSOR_MZ AS mz,
@@ -59,7 +64,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
LEFT JOIN FEATURE_MS1 ON FEATURE_MS1.FEATURE_ID = FEATURE.ID
LEFT JOIN FEATURE_MS2 ON FEATURE_MS2.FEATURE_ID = FEATURE.ID
ORDER BY transition_group_id
""" % score_sql, con)
""" % score_sql).df()

else:

@@ -292,11 +297,11 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
# Execute main SQLite query
click.echo("Info: Reading peak group-level results.")
con.executescript(idx_query) # Add indices
data = pd.read_sql_query(query, con)
data = condb.sql(query).df()

# Augment OpenSWATH results with IPF scores
if ipf_present and ipf=='augmented':
data_augmented = pd.read_sql_query(query_augmented, con)
data_augmented = condb.sql(query_augmented).df()

data_augmented = data_augmented.groupby('id').apply(lambda x: pd.Series({'ipf_FullUniModPeptideName': ";".join(x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_FullUniModPeptideName']), 'ipf_precursor_peakgroup_pep': x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_precursor_peakgroup_pep'].values[0], 'ipf_peptidoform_pep': x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_peptidoform_pep'].values[0], 'ipf_peptidoform_m_score': x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_peptidoform_m_score'].values[0]})).reset_index(level='id')

@@ -317,7 +322,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
SELECT FEATURE_TRANSITION.FEATURE_ID AS id,
GROUP_CONCAT(AREA_INTENSITY,';') AS aggr_Peak_Area,
GROUP_CONCAT(APEX_INTENSITY,';') AS aggr_Peak_Apex,
GROUP_CONCAT(TRANSITION.ID || "_" || TRANSITION.TYPE || TRANSITION.ORDINAL || "_" || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
GROUP_CONCAT(TRANSITION.ID || '_' || TRANSITION.TYPE || TRANSITION.ORDINAL || '_' || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
FROM FEATURE_TRANSITION
INNER JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
INNER JOIN SCORE_TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = SCORE_TRANSITION.TRANSITION_ID AND FEATURE_TRANSITION.FEATURE_ID = SCORE_TRANSITION.FEATURE_ID
@@ -335,14 +340,14 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
SELECT FEATURE_ID AS id,
GROUP_CONCAT(AREA_INTENSITY,';') AS aggr_Peak_Area,
GROUP_CONCAT(APEX_INTENSITY,';') AS aggr_Peak_Apex,
GROUP_CONCAT(TRANSITION.ID || "_" || TRANSITION.TYPE || TRANSITION.ORDINAL || "_" || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
GROUP_CONCAT(TRANSITION.ID || '_' || TRANSITION.TYPE || TRANSITION.ORDINAL || '_' || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
FROM FEATURE_TRANSITION
INNER JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
GROUP BY FEATURE_ID
'''
click.echo("Info: Reading transition-level results.")
con.executescript(idx_transition_query) # Add indices
data_transition = pd.read_sql_query(transition_query, con)
data_transition = condb.sql(transition_query).df()
data = pd.merge(data, data_transition, how='left', on=['id'])

# Append concatenated protein identifier
@@ -353,13 +358,13 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t

CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
''')
data_protein = pd.read_sql_query('''
data_protein = condb.sql('''
SELECT PEPTIDE_ID AS id_peptide,
GROUP_CONCAT(PROTEIN.PROTEIN_ACCESSION,';') AS ProteinName
FROM PEPTIDE_PROTEIN_MAPPING
INNER JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
GROUP BY PEPTIDE_ID;
''', con)
''').df()
data = pd.merge(data, data_protein, how='inner', on=['id_peptide'])

# Append peptide error-rate control
@@ -369,32 +374,32 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t

if peptide_present and peptide:
click.echo("Info: Reading peptide-level results.")
data_peptide_run = pd.read_sql_query('''
data_peptide_run = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
QVALUE AS m_score_peptide_run_specific
FROM SCORE_PEPTIDE
WHERE CONTEXT == 'run-specific';
''', con)
''').df()
if len(data_peptide_run.index) > 0:
data = pd.merge(data, data_peptide_run, how='inner', on=['id_run','id_peptide'])

data_peptide_experiment = pd.read_sql_query('''
data_peptide_experiment = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
QVALUE AS m_score_peptide_experiment_wide
FROM SCORE_PEPTIDE
WHERE CONTEXT == 'experiment-wide';
''', con)
''').df()
if len(data_peptide_experiment.index) > 0:
data = pd.merge(data, data_peptide_experiment, on=['id_run','id_peptide'])

data_peptide_global = pd.read_sql_query('''
data_peptide_global = condb.sql('''
SELECT PEPTIDE_ID AS id_peptide,
QVALUE AS m_score_peptide_global
FROM SCORE_PEPTIDE
WHERE CONTEXT == 'global';
''', con)
''').df()
if len(data_peptide_global.index) > 0:
data = pd.merge(data, data_peptide_global[data_peptide_global['m_score_peptide_global'] < max_global_peptide_qvalue], on=['id_peptide'])

@@ -411,7 +416,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_score_protein_protein_id ON SCORE_PROTEIN (PROTEIN_ID);
CREATE INDEX IF NOT EXISTS idx_score_protein_run_id ON SCORE_PROTEIN (RUN_ID);
''')
data_protein_run = pd.read_sql_query('''
data_protein_run = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
MIN(QVALUE) AS m_score_protein_run_specific
@@ -420,7 +425,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
WHERE CONTEXT == 'run-specific'
GROUP BY RUN_ID,
PEPTIDE_ID;
''', con)
''').df()
if len(data_protein_run.index) > 0:
data = pd.merge(data, data_protein_run, how='inner', on=['id_run','id_peptide'])

@@ -430,7 +435,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_score_protein_protein_id ON SCORE_PROTEIN (PROTEIN_ID);
CREATE INDEX IF NOT EXISTS idx_score_protein_run_id ON SCORE_PROTEIN (RUN_ID);
''')
data_protein_experiment = pd.read_sql_query('''
data_protein_experiment = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
MIN(QVALUE) AS m_score_protein_experiment_wide
@@ -439,7 +444,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
WHERE CONTEXT == 'experiment-wide'
GROUP BY RUN_ID,
PEPTIDE_ID;
''', con)
''').df()
if len(data_protein_experiment.index) > 0:
data = pd.merge(data, data_protein_experiment, how='inner', on=['id_run','id_peptide'])

@@ -448,14 +453,14 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
CREATE INDEX IF NOT EXISTS idx_score_protein_protein_id ON SCORE_PROTEIN (PROTEIN_ID);
''')
data_protein_global = pd.read_sql_query('''
data_protein_global = condb.sql('''
SELECT PEPTIDE_ID AS id_peptide,
MIN(QVALUE) AS m_score_protein_global
FROM PEPTIDE_PROTEIN_MAPPING
INNER JOIN SCORE_PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = SCORE_PROTEIN.PROTEIN_ID
WHERE CONTEXT == 'global'
GROUP BY PEPTIDE_ID;
''', con)
''').df()
if len(data_protein_global.index) > 0:
data = pd.merge(data, data_protein_global[data_protein_global['m_score_protein_global'] < max_global_protein_qvalue], how='inner', on=['id_peptide'])

@@ -478,14 +483,16 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
data.to_csv(outfile, sep=sep, index=True)

con.close()
condb.close()

def export_score_plots(infile):

con = sqlite3.connect(infile)
condb = duckdb.connect(infile)

if check_sqlite_table(con, "SCORE_MS2"):
outfile = infile.split(".osw")[0] + "_ms2_score_plots.pdf"
table_ms2 = pd.read_sql_query('''
table_ms2 = condb.sql('''
SELECT *,
RUN_ID || '_' || PRECURSOR_ID AS GROUP_ID
FROM FEATURE_MS2
@@ -512,12 +519,12 @@ def export_score_plots(infile):
ORDER BY RUN_ID,
PRECURSOR.ID ASC,
FEATURE.EXP_RT ASC;
''', con)
''').df()
plot_scores(table_ms2, outfile)

if check_sqlite_table(con, "SCORE_MS1"):
outfile = infile.split(".osw")[0] + "_ms1_score_plots.pdf"
table_ms1 = pd.read_sql_query('''
table_ms1 = condb.sql('''
SELECT *,
RUN_ID || '_' || PRECURSOR_ID AS GROUP_ID
FROM FEATURE_MS1
@@ -537,12 +544,12 @@ def export_score_plots(infile):
ORDER BY RUN_ID,
PRECURSOR.ID ASC,
FEATURE.EXP_RT ASC;
''', con)
''').df()
plot_scores(table_ms1, outfile)

if check_sqlite_table(con, "SCORE_TRANSITION"):
outfile = infile.split(".osw")[0] + "_transition_score_plots.pdf"
table_transition = pd.read_sql_query('''
table_transition = condb.sql('''
SELECT TRANSITION.DECOY AS DECOY,
FEATURE_TRANSITION.*,
PRECURSOR.CHARGE AS VAR_PRECURSOR_CHARGE,
@@ -568,7 +575,8 @@ def export_score_plots(infile):
PRECURSOR.ID,
FEATURE.EXP_RT,
TRANSITION.ID;
''', con)
''').df()
plot_scores(table_transition, outfile)

con.close()
condb.close()
33 changes: 19 additions & 14 deletions pyprophet/export_compound.py
@@ -1,12 +1,16 @@
import pandas as pd
import duckdb
from duckdb_extensions import extension_importer
import sqlite3

from .data_handling import check_sqlite_table
from .data_handling import write_scores_sql_command
from .report import plot_scores

## ensure proper extension installed
extension_importer.import_extension("sqlite_scanner")

def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue):
con = sqlite3.connect(infile)
condb = duckdb.connect(infile)


# output for merged but not scored pyprophet input
@@ -25,7 +29,7 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
score_sql = ", " + score_sql # add comma at the beginning to fit to statement
score_sql = score_sql[:-2] # remove additional space and comma from the end of the string

data = pd.read_sql_query("""
data = condb.sql("""
SELECT
RUN.ID AS id_run,
COMPOUND.ID AS id_compound,
@@ -36,8 +40,8 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
FEATURE.EXP_RT AS RT,
FEATURE.EXP_RT - FEATURE.DELTA_RT AS assay_rt,
FEATURE.DELTA_RT AS delta_rt,
PRECURSOR.LIBRARY_RT AS assay_RT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_RT,
PRECURSOR.LIBRARY_RT AS assay_iRT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_iRT,
FEATURE.ID AS id,
COMPOUND.SUM_FORMULA AS sum_formula,
COMPOUND.COMPOUND_NAME AS compound_name,
@@ -58,10 +62,10 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
LEFT JOIN FEATURE_MS1 ON FEATURE_MS1.FEATURE_ID = FEATURE.ID
LEFT JOIN FEATURE_MS2 ON FEATURE_MS2.FEATURE_ID = FEATURE.ID
ORDER BY transition_group_id
""" % score_sql, con)
""" % score_sql).df()

elif check_sqlite_table(con, "SCORE_MS1"): # MS1 scoring performend
data = pd.read_sql_query("""
data = condb.sql("""
SELECT
RUN.ID AS id_run,
COMPOUND.ID AS id_compound,
@@ -72,8 +76,8 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
FEATURE.EXP_RT AS RT,
FEATURE.EXP_RT - FEATURE.DELTA_RT AS assay_rt,
FEATURE.DELTA_RT AS delta_rt,
PRECURSOR.LIBRARY_RT AS assay_RT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_RT,
PRECURSOR.LIBRARY_RT AS assay_iRT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_iRT,
FEATURE.ID AS id,
COMPOUND.SUM_FORMULA AS sum_formula,
COMPOUND.COMPOUND_NAME AS compound_name,
@@ -99,9 +103,9 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
WHERE SCORE_MS1.QVALUE < %s
ORDER BY transition_group_id,
peak_group_rank;
""" % max_rs_peakgroup_qvalue, con)
""" % max_rs_peakgroup_qvalue).df()
else: # MS2 or MS1MS2 scoring performend
data = pd.read_sql_query("""
data = condb.sql("""
SELECT
RUN.ID AS id_run,
COMPOUND.ID AS id_compound,
@@ -112,8 +116,8 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
FEATURE.EXP_RT AS RT,
FEATURE.EXP_RT - FEATURE.DELTA_RT AS assay_rt,
FEATURE.DELTA_RT AS delta_rt,
PRECURSOR.LIBRARY_RT AS assay_RT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_RT,
PRECURSOR.LIBRARY_RT AS assay_iRT,
FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_iRT,
FEATURE.ID AS id,
COMPOUND.SUM_FORMULA AS sum_formula,
COMPOUND.COMPOUND_NAME AS compound_name,
@@ -139,9 +143,10 @@ def export_compound_tsv(infile, outfile, format, outcsv, max_rs_peakgroup_qvalue
WHERE SCORE_MS2.QVALUE < %s
ORDER BY transition_group_id,
peak_group_rank;
""" % max_rs_peakgroup_qvalue, con)
""" % max_rs_peakgroup_qvalue).df()

con.close()
condb.close()

if outcsv:
sep = ","
4 changes: 3 additions & 1 deletion pyprophet/export_parquet.py
@@ -5,6 +5,9 @@
from duckdb_extensions import extension_importer
import re

## ensure proper extension installed
extension_importer.import_extension("sqlite_scanner")

def getPeptideProteinScoreTable(conndb, level):
if level == 'peptide':
id = 'PEPTIDE_ID'
@@ -45,7 +48,6 @@ def export_to_parquet(infile, outfile, transitionLevel=False, onlyFeatures=False
Return:
None
'''
extension_importer.import_extension("sqlite_scanner")
condb = duckdb.connect(infile)
con = sqlite3.connect(infile)
