Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 52 additions & 55 deletions pyprophet/export_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
from pyprophet.export import check_sqlite_table
from duckdb_extensions import extension_importer
import re

def getPeptideProteinScoreTable(conndb, level):
if level == 'peptide':
Expand Down Expand Up @@ -31,7 +32,7 @@ def getVarColumnNames(condb, tableName):


# This method is currently supported only for combined output, not for IPF output.
def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
def export_to_parquet(infile, outfile, transitionLevel=False, onlyFeatures=False, noDecoys=False):
'''
Convert an OSW sqlite file to Parquet format

Expand Down Expand Up @@ -66,6 +67,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
CREATE INDEX IF NOT EXISTS idx_protein_protein_id ON PROTEIN (ID);
CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);

CREATE INDEX IF NOT EXISTS idx_transition_id ON TRANSITION (ID);
CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_transition_id ON TRANSITION_PRECURSOR_MAPPING (TRANSITION_ID);
CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_precursor_id ON TRANSITION_PRECURSOR_MAPPING (PRECURSOR_ID);
'''

if check_sqlite_table(con, "FEATURE_MS1"):
Expand Down Expand Up @@ -200,19 +204,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):

# create a list of all the columns
columns_list = [col for c in columns.values() for col in c]

# create a list of just the column expressions (the text preceding " AS" in each entry, not the alias itself) for use in GROUP BY
pattern = re.compile(r"(.*)\sAS")
alias_list = [ pattern.search(col).group(1) for c in columns.values() for col in c]

# join the list into a single string separated by a comma and a space
columnsToSelect = ", ".join(columns_list)

join_features = "LEFT JOIN" if onlyFeatures else "FULL JOIN"

# First read feature data
# Feature Data
if not transitionLevel:
feature_query = f'''
SELECT {columnsToSelect}
FROM FEATURE
{join_features} PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
aliasToSelect = ", ".join(alias_list)

# For feature level group important transition level data into one row separated by ';'
featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else ""
featureLvlSuffix = f'GROUP BY {aliasToSelect}' if not transitionLevel else ""

decoyExclude = "WHERE PRECURSOR.DECOY == 0" if noDecoys else ""

if not onlyFeatures:
query = f'''
SELECT {columnsToSelect},
{featureLvlPrefix}
FROM TRANSITION_PRECURSOR_MAPPING
LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
Expand All @@ -224,48 +239,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
{gene_table_joins}
{pepJoin}
{protJoin}
{decoyExclude}
{featureLvlSuffix}
'''
else: # is transition level

# merge transition and precursor level data
if not onlyFeatures:
feature_query = f'''
SELECT {columnsToSelect}
FROM TRANSITION_PRECURSOR_MAPPING
LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
{gene_table_joins}
{pepJoin}
{protJoin}
'''
else:
feature_query = f'''
SELECT {columnsToSelect}
FROM FEATURE_TRANSITION
LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
{gene_table_joins}
{pepJoin}
{protJoin}
'''
condb.sql(feature_query).write_parquet(outfile)
else:
query = f'''
SELECT {columnsToSelect},
{featureLvlPrefix}
FROM FEATURE_TRANSITION
LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
{gene_table_joins}
{pepJoin}
{protJoin}
{decoyExclude}
{featureLvlSuffix}
'''
condb.sql(query).write_parquet(outfile)
5 changes: 3 additions & 2 deletions pyprophet/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,8 @@ def export(infile, outfile, format, outcsv, transition_quantification, max_trans
@click.option('--out', 'outfile', required=False, type=click.Path(exists=False), help='Output parquet file.')
@click.option('--transitionLevel', 'transitionLevel', is_flag=True, help='Whether to export transition level data as well')
@click.option('--onlyFeatures', 'onlyFeatures', is_flag=True, help='Only include precursors that have a corresponding feature')
def export_parquet(infile, outfile, transitionLevel, onlyFeatures):
@click.option('--noDecoys', 'noDecoys', is_flag=True, help='Do not include decoys in the exported data')
def export_parquet(infile, outfile, transitionLevel, onlyFeatures, noDecoys):
"""
Export all transition data to parquet file
"""
Expand All @@ -381,7 +382,7 @@ def export_parquet(infile, outfile, transitionLevel, onlyFeatures):
if not overwrite:
raise click.ClickException(f"Aborting: {outfile} already exists!")
click.echo("Info: Parquet file will be written to {}".format(outfile))
export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures)
export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures, noDecoys)

# Export Compound TSV
@cli.command()
Expand Down
Loading