Use duckdb loading throughout pyprophet #131
Changes from all commits
6b04bfe
7f11626
a4536d4
893f4ec
49f6e43
40f98f9
@@ -1,16 +1,21 @@
import pandas as pd
import numpy as np
import sqlite3
+import duckdb
+from duckdb_extensions import extension_importer
import click
import os

from .data_handling import check_sqlite_table
from .data_handling import write_scores_sql_command
from .report import plot_scores

+## ensure proper extension installed
Contributor

Can you add a bit more info on why this extension is needed?
Contributor (Author)

The base duckdb installation does not ship with the sqlite extension. After duckdb is installed, sqlite support has to be added with an install/load step along the lines of the sketch below. However, that command cannot be run inside Singularity containers, because it modifies the container.
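The exact command is not preserved in the original comment; the following is a minimal sketch of the usual duckdb install/load step, assuming the bundled `sqlite` / `sqlite_scanner` extension:

```python
import duckdb

# Minimal sketch: install and load the sqlite extension at runtime.
# The install step downloads the extension into duckdb's extension
# directory (~/.duckdb by default), which is what fails inside a
# read-only Singularity container.
con = duckdb.connect()
con.install_extension("sqlite")   # also exposed under the name "sqlite_scanner"
con.load_extension("sqlite")

# Equivalent SQL form:
# con.sql("INSTALL sqlite; LOAD sqlite;")
```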
Contributor

Hmm, how stable is that? I tried to install the community extension on cedar, but it fails to find a matching distribution. If I try to install from the wheel and modify the filename (since the manylinux tag does not get recognized), then it fails when trying to load the extension.
Contributor (Author)

I am not sure how stable it is, but I have had some problems with duckdb on the cluster in the past. I guess that is just the way it is built.
Contributor

Hmm, I think we will have to put this on hold for now, or implement it as an extra feature (see the packaging sketch below) for anyone who wants to use a version of pyprophet with duckdb.
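If duckdb support ends up as an opt-in extra, the packaging side could look roughly like this; the extra name and the dependency list are hypothetical, not something decided in this PR:

```python
# setup.py (sketch): expose duckdb support as an optional extra, so that
# `pip install pyprophet[duckdb]` pulls in the extra dependencies while a
# plain `pip install pyprophet` keeps the sqlite3-only behaviour.
from setuptools import setup

setup(
    name="pyprophet",
    # ... existing arguments ...
    extras_require={
        "duckdb": [
            "duckdb",             # hypothetical, unpinned
            "duckdb-extensions",  # assumed name of the package providing extension_importer
        ],
    },
)
```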
Contributor (Author)

OK, sounds good. When I have time I will implement duckdb as the default and sqlite3 as a fallback, roughly along the lines of the sketch below.
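A rough sketch of what such a fallback could look like; `read_query` is a hypothetical helper, not part of this PR:

```python
import sqlite3

import pandas as pd


def read_query(infile, query):
    """Run a read-only query, preferring duckdb and falling back to sqlite3."""
    try:
        import duckdb
        from duckdb_extensions import extension_importer

        extension_importer.import_extension("sqlite_scanner")
        condb = duckdb.connect(infile)
        try:
            return condb.sql(query).df()
        finally:
            condb.close()
    except Exception:
        # duckdb is not installed or the sqlite extension could not be
        # loaded (e.g. in a read-only container): fall back to sqlite3.
        con = sqlite3.connect(infile)
        try:
            return pd.read_sql_query(query, con)
        finally:
            con.close()
```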
+extension_importer.import_extension("sqlite_scanner")

def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_transition_pep, ipf, ipf_max_peptidoform_pep, max_rs_peakgroup_qvalue, peptide, max_global_peptide_qvalue, protein, max_global_protein_qvalue):

+condb = duckdb.connect(infile)
con = sqlite3.connect(infile)

# output for merged but not scored pyprophet input
@@ -29,7 +34,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
score_sql = ", " + score_sql # add comma at the beginning to fit to statement
score_sql = score_sql[:-2] # remove additional space and comma from the end of the string

-data = pd.read_sql_query("""
+data = condb.sql("""
SELECT
RUN.ID AS id_run,
PEPTIDE.ID AS id_peptide,
@@ -40,8 +45,8 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
FEATURE.EXP_RT AS RT,
FEATURE.EXP_RT - FEATURE.DELTA_RT AS assay_rt,
FEATURE.DELTA_RT AS delta_rt,
-PRECURSOR.LIBRARY_RT AS assay_RT,
-FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_RT,
+PRECURSOR.LIBRARY_RT AS assay_iRT,
+FEATURE.NORM_RT - PRECURSOR.LIBRARY_RT AS delta_iRT,
FEATURE.ID AS id,
PRECURSOR.CHARGE AS Charge,
PRECURSOR.PRECURSOR_MZ AS mz,
@@ -59,7 +64,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
LEFT JOIN FEATURE_MS1 ON FEATURE_MS1.FEATURE_ID = FEATURE.ID
LEFT JOIN FEATURE_MS2 ON FEATURE_MS2.FEATURE_ID = FEATURE.ID
ORDER BY transition_group_id
-""" % score_sql, con)
+""" % score_sql).df()

else:
@@ -292,11 +297,11 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
# Execute main SQLite query
click.echo("Info: Reading peak group-level results.")
con.executescript(idx_query) # Add indices
-data = pd.read_sql_query(query, con)
+data = condb.sql(query).df()

# Augment OpenSWATH results with IPF scores
if ipf_present and ipf=='augmented':
-data_augmented = pd.read_sql_query(query_augmented, con)
+data_augmented = condb.sql(query_augmented).df()

data_augmented = data_augmented.groupby('id').apply(lambda x: pd.Series({'ipf_FullUniModPeptideName': ";".join(x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_FullUniModPeptideName']), 'ipf_precursor_peakgroup_pep': x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_precursor_peakgroup_pep'].values[0], 'ipf_peptidoform_pep': x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_peptidoform_pep'].values[0], 'ipf_peptidoform_m_score': x[x['ipf_peptidoform_pep'] == np.min(x['ipf_peptidoform_pep'])]['ipf_peptidoform_m_score'].values[0]})).reset_index(level='id')
@@ -317,7 +322,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
SELECT FEATURE_TRANSITION.FEATURE_ID AS id,
GROUP_CONCAT(AREA_INTENSITY,';') AS aggr_Peak_Area,
GROUP_CONCAT(APEX_INTENSITY,';') AS aggr_Peak_Apex,
-GROUP_CONCAT(TRANSITION.ID || "_" || TRANSITION.TYPE || TRANSITION.ORDINAL || "_" || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
+GROUP_CONCAT(TRANSITION.ID || '_' || TRANSITION.TYPE || TRANSITION.ORDINAL || '_' || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
FROM FEATURE_TRANSITION
INNER JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
INNER JOIN SCORE_TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = SCORE_TRANSITION.TRANSITION_ID AND FEATURE_TRANSITION.FEATURE_ID = SCORE_TRANSITION.FEATURE_ID
@@ -335,14 +340,14 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
SELECT FEATURE_ID AS id,
GROUP_CONCAT(AREA_INTENSITY,';') AS aggr_Peak_Area,
GROUP_CONCAT(APEX_INTENSITY,';') AS aggr_Peak_Apex,
-GROUP_CONCAT(TRANSITION.ID || "_" || TRANSITION.TYPE || TRANSITION.ORDINAL || "_" || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
+GROUP_CONCAT(TRANSITION.ID || '_' || TRANSITION.TYPE || TRANSITION.ORDINAL || '_' || TRANSITION.CHARGE,';') AS aggr_Fragment_Annotation
FROM FEATURE_TRANSITION
INNER JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
GROUP BY FEATURE_ID
'''
click.echo("Info: Reading transition-level results.")
con.executescript(idx_transition_query) # Add indices
-data_transition = pd.read_sql_query(transition_query, con)
+data_transition = condb.sql(transition_query).df()
data = pd.merge(data, data_transition, how='left', on=['id'])

# Append concatenated protein identifier
@@ -353,13 +358,13 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
''')
-data_protein = pd.read_sql_query('''
+data_protein = condb.sql('''
SELECT PEPTIDE_ID AS id_peptide,
GROUP_CONCAT(PROTEIN.PROTEIN_ACCESSION,';') AS ProteinName
FROM PEPTIDE_PROTEIN_MAPPING
INNER JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
GROUP BY PEPTIDE_ID;
-''', con)
+''').df()
data = pd.merge(data, data_protein, how='inner', on=['id_peptide'])

# Append peptide error-rate control
@@ -369,32 +374,32 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
if peptide_present and peptide:
click.echo("Info: Reading peptide-level results.")
-data_peptide_run = pd.read_sql_query('''
+data_peptide_run = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
QVALUE AS m_score_peptide_run_specific
FROM SCORE_PEPTIDE
WHERE CONTEXT == 'run-specific';
-''', con)
+''').df()
if len(data_peptide_run.index) > 0:
data = pd.merge(data, data_peptide_run, how='inner', on=['id_run','id_peptide'])

-data_peptide_experiment = pd.read_sql_query('''
+data_peptide_experiment = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
QVALUE AS m_score_peptide_experiment_wide
FROM SCORE_PEPTIDE
WHERE CONTEXT == 'experiment-wide';
-''', con)
+''').df()
if len(data_peptide_experiment.index) > 0:
data = pd.merge(data, data_peptide_experiment, on=['id_run','id_peptide'])

-data_peptide_global = pd.read_sql_query('''
+data_peptide_global = condb.sql('''
SELECT PEPTIDE_ID AS id_peptide,
QVALUE AS m_score_peptide_global
FROM SCORE_PEPTIDE
WHERE CONTEXT == 'global';
-''', con)
+''').df()
if len(data_peptide_global.index) > 0:
data = pd.merge(data, data_peptide_global[data_peptide_global['m_score_peptide_global'] < max_global_peptide_qvalue], on=['id_peptide'])
@@ -411,7 +416,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_score_protein_protein_id ON SCORE_PROTEIN (PROTEIN_ID);
CREATE INDEX IF NOT EXISTS idx_score_protein_run_id ON SCORE_PROTEIN (RUN_ID);
''')
-data_protein_run = pd.read_sql_query('''
+data_protein_run = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
MIN(QVALUE) AS m_score_protein_run_specific
@@ -420,7 +425,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
WHERE CONTEXT == 'run-specific'
GROUP BY RUN_ID,
PEPTIDE_ID;
-''', con)
+''').df()
if len(data_protein_run.index) > 0:
data = pd.merge(data, data_protein_run, how='inner', on=['id_run','id_peptide'])
@@ -430,7 +435,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_score_protein_protein_id ON SCORE_PROTEIN (PROTEIN_ID);
CREATE INDEX IF NOT EXISTS idx_score_protein_run_id ON SCORE_PROTEIN (RUN_ID);
''')
-data_protein_experiment = pd.read_sql_query('''
+data_protein_experiment = condb.sql('''
SELECT RUN_ID AS id_run,
PEPTIDE_ID AS id_peptide,
MIN(QVALUE) AS m_score_protein_experiment_wide
@@ -439,7 +444,7 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
WHERE CONTEXT == 'experiment-wide'
GROUP BY RUN_ID,
PEPTIDE_ID;
-''', con)
+''').df()
if len(data_protein_experiment.index) > 0:
data = pd.merge(data, data_protein_experiment, how='inner', on=['id_run','id_peptide'])
@@ -448,14 +453,14 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
CREATE INDEX IF NOT EXISTS idx_score_protein_protein_id ON SCORE_PROTEIN (PROTEIN_ID);
''')
-data_protein_global = pd.read_sql_query('''
+data_protein_global = condb.sql('''
SELECT PEPTIDE_ID AS id_peptide,
MIN(QVALUE) AS m_score_protein_global
FROM PEPTIDE_PROTEIN_MAPPING
INNER JOIN SCORE_PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = SCORE_PROTEIN.PROTEIN_ID
WHERE CONTEXT == 'global'
GROUP BY PEPTIDE_ID;
-''', con)
+''').df()
if len(data_protein_global.index) > 0:
data = pd.merge(data, data_protein_global[data_protein_global['m_score_protein_global'] < max_global_protein_qvalue], how='inner', on=['id_peptide'])
@@ -478,14 +483,16 @@ def export_tsv(infile, outfile, format, outcsv, transition_quantification, max_t
data.to_csv(outfile, sep=sep, index=True)

con.close()
+condb.close()

def export_score_plots(infile):

con = sqlite3.connect(infile)
+condb = duckdb.connect(infile)

if check_sqlite_table(con, "SCORE_MS2"):
outfile = infile.split(".osw")[0] + "_ms2_score_plots.pdf"
-table_ms2 = pd.read_sql_query('''
+table_ms2 = condb.sql('''
SELECT *,
RUN_ID || '_' || PRECURSOR_ID AS GROUP_ID
FROM FEATURE_MS2
@@ -512,12 +519,12 @@ def export_score_plots(infile):
ORDER BY RUN_ID,
PRECURSOR.ID ASC,
FEATURE.EXP_RT ASC;
-''', con)
+''').df()
plot_scores(table_ms2, outfile)

if check_sqlite_table(con, "SCORE_MS1"):
outfile = infile.split(".osw")[0] + "_ms1_score_plots.pdf"
-table_ms1 = pd.read_sql_query('''
+table_ms1 = condb.sql('''
SELECT *,
RUN_ID || '_' || PRECURSOR_ID AS GROUP_ID
FROM FEATURE_MS1
@@ -537,12 +544,12 @@ def export_score_plots(infile):
ORDER BY RUN_ID,
PRECURSOR.ID ASC,
FEATURE.EXP_RT ASC;
-''', con)
+''').df()
plot_scores(table_ms1, outfile)

if check_sqlite_table(con, "SCORE_TRANSITION"):
outfile = infile.split(".osw")[0] + "_transition_score_plots.pdf"
-table_transition = pd.read_sql_query('''
+table_transition = condb.sql('''
SELECT TRANSITION.DECOY AS DECOY,
FEATURE_TRANSITION.*,
PRECURSOR.CHARGE AS VAR_PRECURSOR_CHARGE,
@@ -568,7 +575,8 @@ def export_score_plots(infile):
PRECURSOR.ID,
FEATURE.EXP_RT,
TRANSITION.ID;
-''', con)
+''').df()
plot_scores(table_transition, outfile)

con.close()
+condb.close()
Do we still need sqlite3?
Good point, for export we probably don't need sqlite3 any more. The remaining use is the check_sqlite_table calls, which could go through duckdb as well (see the sketch below).
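A possible sketch of that table check through duckdb, assuming the connected .osw database is visible in duckdb's information_schema; `check_duckdb_table` is a hypothetical helper, not part of this PR:

```python
import duckdb


def check_duckdb_table(condb, table_name):
    """Return True if the connected database exposes a table with this name."""
    row = condb.execute(
        "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = ?",
        [table_name],
    ).fetchone()
    return row[0] > 0


# Hypothetical usage in export_score_plots, replacing check_sqlite_table(con, ...):
# condb = duckdb.connect(infile)
# if check_duckdb_table(condb, "SCORE_MS2"):
#     ...
```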