diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py
index 24b723c2..1f89fa54 100644
--- a/pyprophet/export_parquet.py
+++ b/pyprophet/export_parquet.py
@@ -3,6 +3,7 @@
import pandas as pd
from pyprophet.export import check_sqlite_table
from duckdb_extensions import extension_importer
+import re
def getPeptideProteinScoreTable(conndb, level):
if level == 'peptide':
@@ -31,7 +32,7 @@ def getVarColumnNames(condb, tableName):
# this method is only currently supported for combined output and not with ipf
-def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
+def export_to_parquet(infile, outfile, transitionLevel=False, onlyFeatures=False, noDecoys=False):
'''
Convert an OSW sqlite file to Parquet format
@@ -66,6 +67,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
CREATE INDEX IF NOT EXISTS idx_protein_protein_id ON PROTEIN (ID);
CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID);
+ CREATE INDEX IF NOT EXISTS idx_transition_id ON TRANSITION (ID);
+ CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_transition_id ON TRANSITION_PRECURSOR_MAPPING (TRANSITION_ID);
+ CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_precursor_id ON TRANSITION_PRECURSOR_MAPPING (PRECURSOR_ID);
'''
if check_sqlite_table(con, "FEATURE_MS1"):
@@ -200,19 +204,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
# create a list of all the columns
columns_list = [col for c in columns.values() for col in c]
+
+ # create a list of the column expressions (the text before ' AS', i.e. without their aliases) for the GROUP BY
+ pattern = re.compile(r"(.*)\sAS")
+ alias_list = [ pattern.search(col).group(1) for c in columns.values() for col in c]
# join the list into a single string separated by a comma and a space
columnsToSelect = ", ".join(columns_list)
-
- join_features = "LEFT JOIN" if onlyFeatures else "FULL JOIN"
-
- # First read feature data
- # Feature Data
- if not transitionLevel:
- feature_query = f'''
- SELECT {columnsToSelect}
- FROM FEATURE
- {join_features} PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
+ aliasToSelect = ", ".join(alias_list)
+
+ # At feature level, collapse the transition-level data (IDs and annotations) into one row per feature, with values separated by ';'
+ featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else ""
+ featureLvlSuffix = f'GROUP BY {aliasToSelect}' if not transitionLevel else ""
+
+ decoyExclude = "WHERE PRECURSOR.DECOY == 0" if noDecoys else ""
+
+ if not onlyFeatures:
+ query = f'''
+ SELECT {columnsToSelect},
+ {featureLvlPrefix}
+ FROM TRANSITION_PRECURSOR_MAPPING
+ LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
+ LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
+ LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
+ LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
@@ -224,48 +239,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
{gene_table_joins}
{pepJoin}
{protJoin}
+ {decoyExclude}
+ {featureLvlSuffix}
'''
- else: # is transition level
-
- # merge transition and precursor level data
- if not onlyFeatures:
- feature_query = f'''
- SELECT {columnsToSelect}
- FROM TRANSITION_PRECURSOR_MAPPING
- LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
- LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID
- LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID
- LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
- LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
- LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
- LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
- LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
- LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
- LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
- LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
- LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
- {gene_table_joins}
- {pepJoin}
- {protJoin}
- '''
- else:
- feature_query = f'''
- SELECT {columnsToSelect}
- FROM FEATURE_TRANSITION
- LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
- LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
- LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
- LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
- LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
- LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
- LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
- LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
- LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
- LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
- LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
- LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
- {gene_table_joins}
- {pepJoin}
- {protJoin}
- '''
- condb.sql(feature_query).write_parquet(outfile)
\ No newline at end of file
+ else:
+ query = f'''
+ SELECT {columnsToSelect},
+ {featureLvlPrefix}
+ FROM FEATURE_TRANSITION
+ LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID
+ LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID
+ LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID
+ LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID
+ LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID
+ LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID
+ LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID
+ LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID
+ LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID
+ LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID
+ LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID
+ LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID
+ {gene_table_joins}
+ {pepJoin}
+ {protJoin}
+ {decoyExclude}
+ {featureLvlSuffix}
+ '''
+ condb.sql(query).write_parquet(outfile)
\ No newline at end of file
diff --git a/pyprophet/main.py b/pyprophet/main.py
index 24f97abe..eed24392 100644
--- a/pyprophet/main.py
+++ b/pyprophet/main.py
@@ -368,7 +368,8 @@ def export(infile, outfile, format, outcsv, transition_quantification, max_trans
@click.option('--out', 'outfile', required=False, type=click.Path(exists=False), help='Output parquet file.')
@click.option('--transitionLevel', 'transitionLevel', is_flag=True, help='Whether to export transition level data as well')
@click.option('--onlyFeatures', 'onlyFeatures', is_flag=True, help='Only include precursors that have a corresponding feature')
-def export_parquet(infile, outfile, transitionLevel, onlyFeatures):
+@click.option('--noDecoys', 'noDecoys', is_flag=True, help='Do not include decoys in the exported data')
+def export_parquet(infile, outfile, transitionLevel, onlyFeatures, noDecoys):
"""
Export all transition data to parquet file
"""
@@ -381,7 +382,7 @@ def export_parquet(infile, outfile, transitionLevel, onlyFeatures):
if not overwrite:
raise click.ClickException(f"Aborting: {outfile} already exists!")
click.echo("Info: Parquet file will be written to {}".format(outfile))
- export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures)
+ export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures, noDecoys)
# Export Compound TSV
@cli.command()
diff --git a/tests/Create_OSW_test.ipynb b/tests/Create_OSW_test.ipynb
index 4a96d9df..0ba87a6f 100644
--- a/tests/Create_OSW_test.ipynb
+++ b/tests/Create_OSW_test.ipynb
@@ -8,6 +8,14 @@
"## **Create a fake .OSW file for testing**"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "426d6f86-aaea-4372-b1fd-16234cdccb7f",
+ "metadata": {},
+ "source": [
+ "**Note:** Code cell 11 must be edited manually if new entries are added to the library"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
@@ -59,7 +67,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lib = pd.read_csv(\"fakeLib.tsv\", sep='\\t')"
+ "lib = pd.read_csv(\"data/fakeLib.tsv\", sep='\\t')"
]
},
{
@@ -102,6 +110,7 @@
"
ProductCharge | \n",
" GeneName | \n",
" LibraryDriftTime | \n",
+ " Decoy | \n",
" Annotation | \n",
" TransitionId | \n",
" \n",
@@ -122,6 +131,7 @@
" 2 | \n",
" Y | \n",
" 10 | \n",
+ " 0 | \n",
" b1^2 | \n",
" YYYYYYYYYYYK2_b1^2 | \n",
" \n",
@@ -140,6 +150,7 @@
" 2 | \n",
" Y | \n",
" 10 | \n",
+ " 0 | \n",
" y2^2 | \n",
" YYYYYYYYYYYK2_y2^2 | \n",
" \n",
@@ -158,6 +169,7 @@
" 2 | \n",
" Y | \n",
" 10 | \n",
+ " 0 | \n",
" b3^2 | \n",
" YYYYYYYYYYYK2_b3^2 | \n",
" \n",
@@ -176,6 +188,7 @@
" 2 | \n",
" Y | \n",
" 20 | \n",
+ " 0 | \n",
" b1^2 | \n",
" YYYYYR2_b1^2 | \n",
" \n",
@@ -194,6 +207,7 @@
" 2 | \n",
" Y | \n",
" 20 | \n",
+ " 0 | \n",
" y2^2 | \n",
" YYYYYR2_y2^2 | \n",
" \n",
@@ -212,6 +226,7 @@
" 2 | \n",
" Y | \n",
" 20 | \n",
+ " 0 | \n",
" b3^2 | \n",
" YYYYYR2_b3^2 | \n",
" \n",
@@ -230,6 +245,7 @@
" 2 | \n",
" Y | \n",
" 20 | \n",
+ " 0 | \n",
" b1^2 | \n",
" YYYYYR3_b1^2 | \n",
" \n",
@@ -248,6 +264,7 @@
" 2 | \n",
" Y | \n",
" 20 | \n",
+ " 0 | \n",
" y2^2 | \n",
" YYYYYR3_y2^2 | \n",
" \n",
@@ -266,6 +283,7 @@
" 2 | \n",
" G | \n",
" 40 | \n",
+ " 0 | \n",
" b1^2 | \n",
" GGGGGGGGGGR4_b1^2 | \n",
" \n",
@@ -284,6 +302,7 @@
" 2 | \n",
" G | \n",
" 40 | \n",
+ " 0 | \n",
" y2^2 | \n",
" GGGGGGGGGGR4_y2^2 | \n",
" \n",
@@ -302,6 +321,7 @@
" 2 | \n",
" G | \n",
" 40 | \n",
+ " 0 | \n",
" b3^2 | \n",
" GGGGGGGGGGR4_b3^2 | \n",
" \n",
@@ -320,6 +340,7 @@
" 2 | \n",
" G | \n",
" 40 | \n",
+ " 0 | \n",
" y4^2 | \n",
" GGGGGGGGGGR4_y4^2 | \n",
" \n",
@@ -338,6 +359,7 @@
" 2 | \n",
" T | \n",
" 50 | \n",
+ " 0 | \n",
" b1^2 | \n",
" TTTTTTTR2_b1^2 | \n",
" \n",
@@ -356,6 +378,7 @@
" 2 | \n",
" T | \n",
" 50 | \n",
+ " 0 | \n",
" y2^2 | \n",
" TTTTTTTR2_y2^2 | \n",
" \n",
@@ -374,6 +397,7 @@
" 2 | \n",
" T | \n",
" 50 | \n",
+ " 0 | \n",
" b3^2 | \n",
" TTTTTTTR2_b3^2 | \n",
" \n",
@@ -392,6 +416,7 @@
" 2 | \n",
" T | \n",
" 60 | \n",
+ " 0 | \n",
" b1^2 | \n",
" TTTTTTTTTTTTK2_b1^2 | \n",
" \n",
@@ -410,6 +435,7 @@
" 2 | \n",
" T | \n",
" 60 | \n",
+ " 0 | \n",
" y2^2 | \n",
" TTTTTTTTTTTTK2_y2^2 | \n",
" \n",
@@ -428,6 +454,7 @@
" 3 | \n",
" T | \n",
" 70 | \n",
+ " 0 | \n",
" b1^3 | \n",
" TTR3_b1^3 | \n",
" \n",
@@ -446,6 +473,7 @@
" 3 | \n",
" T | \n",
" 70 | \n",
+ " 0 | \n",
" y2^3 | \n",
" TTR3_y2^3 | \n",
" \n",
@@ -464,9 +492,67 @@
" 3 | \n",
" T | \n",
" 70 | \n",
+ " 0 | \n",
" b3^3 | \n",
" TTR3_b3^3 | \n",
" \n",
+ " \n",
+ " | 20 | \n",
+ " 800 | \n",
+ " 801 | \n",
+ " 808 | \n",
+ " 80 | \n",
+ " Decoy_ProtT | \n",
+ " TTK | \n",
+ " TTK | \n",
+ " 3 | \n",
+ " b | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Decoy_T | \n",
+ " 80 | \n",
+ " 1 | \n",
+ " b1^3 | \n",
+ " TTK3_b1^3 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 800 | \n",
+ " 802 | \n",
+ " 808 | \n",
+ " 80 | \n",
+ " Decoy_ProtT | \n",
+ " TTK | \n",
+ " TTK | \n",
+ " 3 | \n",
+ " y | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Decoy_T | \n",
+ " 80 | \n",
+ " 1 | \n",
+ " y2^3 | \n",
+ " TTK3_y2^3 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " 800 | \n",
+ " 803 | \n",
+ " 808 | \n",
+ " 80 | \n",
+ " Decoy_ProtT | \n",
+ " TTK | \n",
+ " TTK | \n",
+ " 3 | \n",
+ " b | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " Decoy_T | \n",
+ " 80 | \n",
+ " 1 | \n",
+ " b3^3 | \n",
+ " TTK3_b3^3 | \n",
+ "
\n",
" \n",
"\n",
""
@@ -493,28 +579,34 @@
"17 700 701 107 70 \n",
"18 700 702 207 70 \n",
"19 700 703 307 70 \n",
+ "20 800 801 808 80 \n",
+ "21 800 802 808 80 \n",
+ "22 800 803 808 80 \n",
"\n",
- " ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge \\\n",
- "0 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n",
- "1 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n",
- "2 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n",
- "3 ProtY YYYYYR YYYYYR 2 \n",
- "4 ProtY YYYYYR YYYYYR 2 \n",
- "5 ProtY YYYYYR YYYYYR 2 \n",
- "6 ProtY YYYYYR YYYYYR 3 \n",
- "7 ProtY YYYYYR YYYYYR 3 \n",
- "8 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
- "9 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
- "10 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
- "11 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
- "12 ProtT TTTTTTTR TTTTTTTR 2 \n",
- "13 ProtT TTTTTTTR TTTTTTTR 2 \n",
- "14 ProtT TTTTTTTR TTTTTTTR 2 \n",
- "15 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n",
- "16 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n",
- "17 ProtT TTR TTR 3 \n",
- "18 ProtT TTR TTR 3 \n",
- "19 ProtT TTR TTR 3 \n",
+ " ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge \\\n",
+ "0 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n",
+ "1 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n",
+ "2 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n",
+ "3 ProtY YYYYYR YYYYYR 2 \n",
+ "4 ProtY YYYYYR YYYYYR 2 \n",
+ "5 ProtY YYYYYR YYYYYR 2 \n",
+ "6 ProtY YYYYYR YYYYYR 3 \n",
+ "7 ProtY YYYYYR YYYYYR 3 \n",
+ "8 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
+ "9 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
+ "10 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
+ "11 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n",
+ "12 ProtT TTTTTTTR TTTTTTTR 2 \n",
+ "13 ProtT TTTTTTTR TTTTTTTR 2 \n",
+ "14 ProtT TTTTTTTR TTTTTTTR 2 \n",
+ "15 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n",
+ "16 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n",
+ "17 ProtT TTR TTR 3 \n",
+ "18 ProtT TTR TTR 3 \n",
+ "19 ProtT TTR TTR 3 \n",
+ "20 Decoy_ProtT TTK TTK 3 \n",
+ "21 Decoy_ProtT TTK TTK 3 \n",
+ "22 Decoy_ProtT TTK TTK 3 \n",
"\n",
" FragmentType FragmentSeriesNumber ProductCharge GeneName \\\n",
"0 b 1 2 Y \n",
@@ -537,28 +629,34 @@
"17 b 1 3 T \n",
"18 y 2 3 T \n",
"19 b 3 3 T \n",
+ "20 b 1 3 Decoy_T \n",
+ "21 y 2 3 Decoy_T \n",
+ "22 b 3 3 Decoy_T \n",
"\n",
- " LibraryDriftTime Annotation TransitionId \n",
- "0 10 b1^2 YYYYYYYYYYYK2_b1^2 \n",
- "1 10 y2^2 YYYYYYYYYYYK2_y2^2 \n",
- "2 10 b3^2 YYYYYYYYYYYK2_b3^2 \n",
- "3 20 b1^2 YYYYYR2_b1^2 \n",
- "4 20 y2^2 YYYYYR2_y2^2 \n",
- "5 20 b3^2 YYYYYR2_b3^2 \n",
- "6 20 b1^2 YYYYYR3_b1^2 \n",
- "7 20 y2^2 YYYYYR3_y2^2 \n",
- "8 40 b1^2 GGGGGGGGGGR4_b1^2 \n",
- "9 40 y2^2 GGGGGGGGGGR4_y2^2 \n",
- "10 40 b3^2 GGGGGGGGGGR4_b3^2 \n",
- "11 40 y4^2 GGGGGGGGGGR4_y4^2 \n",
- "12 50 b1^2 TTTTTTTR2_b1^2 \n",
- "13 50 y2^2 TTTTTTTR2_y2^2 \n",
- "14 50 b3^2 TTTTTTTR2_b3^2 \n",
- "15 60 b1^2 TTTTTTTTTTTTK2_b1^2 \n",
- "16 60 y2^2 TTTTTTTTTTTTK2_y2^2 \n",
- "17 70 b1^3 TTR3_b1^3 \n",
- "18 70 y2^3 TTR3_y2^3 \n",
- "19 70 b3^3 TTR3_b3^3 "
+ " LibraryDriftTime Decoy Annotation TransitionId \n",
+ "0 10 0 b1^2 YYYYYYYYYYYK2_b1^2 \n",
+ "1 10 0 y2^2 YYYYYYYYYYYK2_y2^2 \n",
+ "2 10 0 b3^2 YYYYYYYYYYYK2_b3^2 \n",
+ "3 20 0 b1^2 YYYYYR2_b1^2 \n",
+ "4 20 0 y2^2 YYYYYR2_y2^2 \n",
+ "5 20 0 b3^2 YYYYYR2_b3^2 \n",
+ "6 20 0 b1^2 YYYYYR3_b1^2 \n",
+ "7 20 0 y2^2 YYYYYR3_y2^2 \n",
+ "8 40 0 b1^2 GGGGGGGGGGR4_b1^2 \n",
+ "9 40 0 y2^2 GGGGGGGGGGR4_y2^2 \n",
+ "10 40 0 b3^2 GGGGGGGGGGR4_b3^2 \n",
+ "11 40 0 y4^2 GGGGGGGGGGR4_y4^2 \n",
+ "12 50 0 b1^2 TTTTTTTR2_b1^2 \n",
+ "13 50 0 y2^2 TTTTTTTR2_y2^2 \n",
+ "14 50 0 b3^2 TTTTTTTR2_b3^2 \n",
+ "15 60 0 b1^2 TTTTTTTTTTTTK2_b1^2 \n",
+ "16 60 0 y2^2 TTTTTTTTTTTTK2_y2^2 \n",
+ "17 70 0 b1^3 TTR3_b1^3 \n",
+ "18 70 0 y2^3 TTR3_y2^3 \n",
+ "19 70 0 b3^3 TTR3_b3^3 \n",
+ "20 80 1 b1^3 TTK3_b1^3 \n",
+ "21 80 1 y2^3 TTK3_y2^3 \n",
+ "22 80 1 b3^3 TTK3_b3^3 "
]
},
"execution_count": 3,
@@ -579,7 +677,7 @@
"metadata": {},
"outputs": [],
"source": [
- "lib.to_csv(\"fakeLib_appended.tsv\", sep='\\t', index=False)"
+ "lib.to_csv(\"data/fakeLib_appended.tsv\", sep='\\t', index=False)"
]
},
{
@@ -619,17 +717,39 @@
{
"cell_type": "code",
"execution_count": 5,
+ "id": "d8a28a22-9f82-4914-a123-e631509e6ab8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'data/dummyOSWScoredData.osw'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import shutil\n",
+ "shutil.copyfile(\"data/fakeLib.pqp\", \"data/dummyOSWScoredData.osw\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
"id": "1afe24b1-cf28-44e0-84e4-f9194428e18f",
"metadata": {},
"outputs": [],
"source": [
- "conn = sqlite3.connect(\"fakeLib.pqp\")\n",
+ "conn = sqlite3.connect(\"data/dummyOSWScoredData.osw\")\n",
"cur = conn.cursor()"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "2b50c3b4-e436-4edb-8a80-cc3bf67e73d3",
"metadata": {},
"outputs": [],
@@ -654,17 +774,17 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"id": "584dc6c7-9c7a-4ad6-91c2-12a0fffa4c08",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -676,7 +796,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"id": "98401ee9-5153-477b-928a-1c66cdbb8e5d",
"metadata": {},
"outputs": [],
@@ -686,7 +806,7 @@
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 10,
"id": "008b6e0f-28b8-4f22-89fe-54cdad6dca02",
"metadata": {
"tags": []
@@ -720,17 +840,17 @@
" \n",
" \n",
" | 0 | \n",
- " 6 | \n",
+ " 7 | \n",
" 100.0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 4 | \n",
+ " 5 | \n",
" 200.0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 5 | \n",
+ " 6 | \n",
" 220.0 | \n",
"
\n",
" \n",
@@ -740,35 +860,41 @@
"
\n",
" \n",
" | 4 | \n",
- " 2 | \n",
+ " 3 | \n",
" 500.0 | \n",
"
\n",
" \n",
" | 5 | \n",
- " 3 | \n",
+ " 4 | \n",
" 600.0 | \n",
"
\n",
" \n",
" | 6 | \n",
- " 1 | \n",
+ " 2 | \n",
" 700.0 | \n",
"
\n",
+ " \n",
+ " | 7 | \n",
+ " 1 | \n",
+ " 800.0 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
" ID PRECURSOR_MZ\n",
- "0 6 100.0\n",
- "1 4 200.0\n",
- "2 5 220.0\n",
+ "0 7 100.0\n",
+ "1 5 200.0\n",
+ "2 6 220.0\n",
"3 0 400.0\n",
- "4 2 500.0\n",
- "5 3 600.0\n",
- "6 1 700.0"
+ "4 3 500.0\n",
+ "5 4 600.0\n",
+ "6 2 700.0\n",
+ "7 1 800.0"
]
},
- "execution_count": 56,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -779,17 +905,18 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"id": "8f2b41fa-efb2-4f92-ad20-1737c16b3b8b",
"metadata": {},
"outputs": [],
"source": [
- "features = pd.DataFrame(np.column_stack([np.arange(0,7), np.array([5,5,3,4,0,1,2])]), columns=['id', 'precursor_id'])"
+ "## Note: The second numpy array must be edited manually if new precursors are added to the library\n",
+ "features = pd.DataFrame(np.column_stack([np.arange(1,9), np.array([6,6,4,5,0,2,3,1])]), columns=['id', 'precursor_id'])"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"id": "4b6472b3-a603-492c-b6da-bca4ae0c1ae9",
"metadata": {},
"outputs": [],
@@ -799,11 +926,36 @@
},
{
"cell_type": "code",
- "execution_count": 55,
- "id": "23ea2e05-f030-41f4-aa98-1fc78da43303",
- "metadata": {
- "tags": []
- },
+ "execution_count": 13,
+ "id": "a251b758-df07-4d44-ab84-f8f9dd6911af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# feature table\n",
+ "features['run_id'] = np.array([1] * len(features), dtype=int)\n",
+ "features['exp_rt'] = features['PRECURSOR_MZ'] + round((features['id'] + 1) / 100, 2)\n",
+ "features['exp_im'] = features['exp_rt']\n",
+ "features['norm_rt'] = [ int(i) for i in features['exp_rt'] ]\n",
+ "features['delta_rt'] = [0.01] * len(features)\n",
+ "features['left_width'] = [5] * len(features)\n",
+ "features['right_width'] = [5] * len(features)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "feeb09d3-17c4-4dd3-9325-5af12d22842b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "features = features.drop(columns=['PRECURSOR_MZ'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c9c07727-5351-4501-8f1f-23df31b0a4e9",
+ "metadata": {},
"outputs": [
{
"data": {
@@ -841,12 +993,12 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " 5 | \n",
- " 5 | \n",
" 1 | \n",
- " 220.01 | \n",
- " 220.01 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 220.02 | \n",
+ " 220.02 | \n",
" 220 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -854,12 +1006,12 @@
"
\n",
" \n",
" | 1 | \n",
+ " 2 | \n",
+ " 6 | \n",
+ " 6 | \n",
" 1 | \n",
- " 5 | \n",
- " 5 | \n",
- " 1 | \n",
- " 220.02 | \n",
- " 220.02 | \n",
+ " 220.03 | \n",
+ " 220.03 | \n",
" 220 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -867,12 +1019,12 @@
"
\n",
" \n",
" | 2 | \n",
- " 2 | \n",
- " 3 | \n",
" 3 | \n",
+ " 4 | \n",
+ " 4 | \n",
" 1 | \n",
- " 600.03 | \n",
- " 600.03 | \n",
+ " 600.04 | \n",
+ " 600.04 | \n",
" 600 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -880,12 +1032,12 @@
"
\n",
" \n",
" | 3 | \n",
- " 3 | \n",
- " 4 | \n",
" 4 | \n",
+ " 5 | \n",
+ " 5 | \n",
" 1 | \n",
- " 200.04 | \n",
- " 200.04 | \n",
+ " 200.05 | \n",
+ " 200.05 | \n",
" 200 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -893,12 +1045,12 @@
"
\n",
" \n",
" | 4 | \n",
- " 4 | \n",
+ " 5 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
- " 400.05 | \n",
- " 400.05 | \n",
+ " 400.06 | \n",
+ " 400.06 | \n",
" 400 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -906,12 +1058,12 @@
"
\n",
" \n",
" | 5 | \n",
- " 5 | \n",
- " 1 | \n",
- " 1 | \n",
+ " 6 | \n",
+ " 2 | \n",
+ " 2 | \n",
" 1 | \n",
- " 700.06 | \n",
- " 700.06 | \n",
+ " 700.07 | \n",
+ " 700.07 | \n",
" 700 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -919,30 +1071,44 @@
"
\n",
" \n",
" | 6 | \n",
- " 6 | \n",
- " 2 | \n",
- " 2 | \n",
+ " 7 | \n",
+ " 3 | \n",
+ " 3 | \n",
" 1 | \n",
- " 500.07 | \n",
- " 500.07 | \n",
+ " 500.08 | \n",
+ " 500.08 | \n",
" 500 | \n",
" 0.01 | \n",
" 5 | \n",
" 5 | \n",
"
\n",
+ " \n",
+ " | 7 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 800.09 | \n",
+ " 800.09 | \n",
+ " 800 | \n",
+ " 0.01 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
" id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n",
- "0 0 5 5 1 220.01 220.01 220 0.01 \n",
- "1 1 5 5 1 220.02 220.02 220 0.01 \n",
- "2 2 3 3 1 600.03 600.03 600 0.01 \n",
- "3 3 4 4 1 200.04 200.04 200 0.01 \n",
- "4 4 0 0 1 400.05 400.05 400 0.01 \n",
- "5 5 1 1 1 700.06 700.06 700 0.01 \n",
- "6 6 2 2 1 500.07 500.07 500 0.01 \n",
+ "0 1 6 6 1 220.02 220.02 220 0.01 \n",
+ "1 2 6 6 1 220.03 220.03 220 0.01 \n",
+ "2 3 4 4 1 600.04 600.04 600 0.01 \n",
+ "3 4 5 5 1 200.05 200.05 200 0.01 \n",
+ "4 5 0 0 1 400.06 400.06 400 0.01 \n",
+ "5 6 2 2 1 700.07 700.07 700 0.01 \n",
+ "6 7 3 3 1 500.08 500.08 500 0.01 \n",
+ "7 8 1 1 1 800.09 800.09 800 0.01 \n",
"\n",
" left_width right_width \n",
"0 5 5 \n",
@@ -951,10 +1117,11 @@
"3 5 5 \n",
"4 5 5 \n",
"5 5 5 \n",
- "6 5 5 "
+ "6 5 5 \n",
+ "7 5 5 "
]
},
- "execution_count": 55,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -965,126 +1132,19 @@
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "a5473d3c-793d-4d9d-b722-4aab486e0fa5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ID | \n",
- " PRECURSOR_MZ | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 6 | \n",
- " 100.0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 4 | \n",
- " 200.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 5 | \n",
- " 220.0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 0 | \n",
- " 400.0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 2 | \n",
- " 500.0 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 3 | \n",
- " 600.0 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 1 | \n",
- " 700.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " ID PRECURSOR_MZ\n",
- "0 6 100.0\n",
- "1 4 200.0\n",
- "2 5 220.0\n",
- "3 0 400.0\n",
- "4 2 500.0\n",
- "5 3 600.0\n",
- "6 1 700.0"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "precursor_table"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "a251b758-df07-4d44-ab84-f8f9dd6911af",
- "metadata": {},
- "outputs": [],
- "source": [
- "# feature table\n",
- "features['run_id'] = np.array([1] * len(features), dtype=int)\n",
- "features['exp_rt'] = features['PRECURSOR_MZ'] + round((features['id'] + 1) / 100, 2)\n",
- "features['exp_im'] = features['exp_rt']\n",
- "features['norm_rt'] = [ int(i) for i in features['exp_rt'] ]\n",
- "features['delta_rt'] = [0.01] * len(features)\n",
- "features['left_width'] = [5] * len(features)\n",
- "features['right_width'] = [5] * len(features)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "feeb09d3-17c4-4dd3-9325-5af12d22842b",
+ "execution_count": 16,
+ "id": "cbb86464-d1d6-4de9-92b0-a48a787a6895",
"metadata": {},
"outputs": [],
"source": [
- "features = features.drop(columns=['PRECURSOR_MZ'])"
+ "# make each id a long integer (its digits repeated 19 times) so it looks more realistic\n",
+ "features['id'] = (features['id'].astype(str) * 19).astype(int)"
]
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "c9c07727-5351-4501-8f1f-23df31b0a4e9",
+ "execution_count": 17,
+ "id": "70d20530-d775-4b3f-896d-76a345d42e5e",
"metadata": {},
"outputs": [
{
@@ -1123,12 +1183,12 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " 5 | \n",
- " 5 | \n",
+ " 1111111111111111111 | \n",
+ " 6 | \n",
+ " 6 | \n",
" 1 | \n",
- " 220.01 | \n",
- " 220.01 | \n",
+ " 220.02 | \n",
+ " 220.02 | \n",
" 220 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -1136,12 +1196,12 @@
"
\n",
" \n",
" | 1 | \n",
+ " 2222222222222222222 | \n",
+ " 6 | \n",
+ " 6 | \n",
" 1 | \n",
- " 5 | \n",
- " 5 | \n",
- " 1 | \n",
- " 220.02 | \n",
- " 220.02 | \n",
+ " 220.03 | \n",
+ " 220.03 | \n",
" 220 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -1149,12 +1209,12 @@
"
\n",
" \n",
" | 2 | \n",
- " 2 | \n",
- " 3 | \n",
- " 3 | \n",
+ " 3333333333333333333 | \n",
+ " 4 | \n",
+ " 4 | \n",
" 1 | \n",
- " 600.03 | \n",
- " 600.03 | \n",
+ " 600.04 | \n",
+ " 600.04 | \n",
" 600 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -1162,12 +1222,12 @@
"
\n",
" \n",
" | 3 | \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
+ " 4444444444444444444 | \n",
+ " 5 | \n",
+ " 5 | \n",
" 1 | \n",
- " 200.04 | \n",
- " 200.04 | \n",
+ " 200.05 | \n",
+ " 200.05 | \n",
" 200 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -1175,12 +1235,12 @@
"
\n",
" \n",
" | 4 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
- " 400.05 | \n",
- " 400.05 | \n",
+ " 400.06 | \n",
+ " 400.06 | \n",
" 400 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -1188,12 +1248,12 @@
"
\n",
" \n",
" | 5 | \n",
- " 5 | \n",
- " 1 | \n",
- " 1 | \n",
+ " 6666666666666666666 | \n",
+ " 2 | \n",
+ " 2 | \n",
" 1 | \n",
- " 700.06 | \n",
- " 700.06 | \n",
+ " 700.07 | \n",
+ " 700.07 | \n",
" 700 | \n",
" 0.01 | \n",
" 5 | \n",
@@ -1201,42 +1261,57 @@
"
\n",
" \n",
" | 6 | \n",
- " 6 | \n",
- " 2 | \n",
- " 2 | \n",
+ " 7777777777777777777 | \n",
+ " 3 | \n",
+ " 3 | \n",
" 1 | \n",
- " 500.07 | \n",
- " 500.07 | \n",
+ " 500.08 | \n",
+ " 500.08 | \n",
" 500 | \n",
" 0.01 | \n",
" 5 | \n",
" 5 | \n",
"
\n",
+ " \n",
+ " | 7 | \n",
+ " 8888888888888888888 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 800.09 | \n",
+ " 800.09 | \n",
+ " 800 | \n",
+ " 0.01 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n",
- "0 0 5 5 1 220.01 220.01 220 0.01 \n",
- "1 1 5 5 1 220.02 220.02 220 0.01 \n",
- "2 2 3 3 1 600.03 600.03 600 0.01 \n",
- "3 3 4 4 1 200.04 200.04 200 0.01 \n",
- "4 4 0 0 1 400.05 400.05 400 0.01 \n",
- "5 5 1 1 1 700.06 700.06 700 0.01 \n",
- "6 6 2 2 1 500.07 500.07 500 0.01 \n",
+ " id precursor_id ID run_id exp_rt exp_im norm_rt \\\n",
+ "0 1111111111111111111 6 6 1 220.02 220.02 220 \n",
+ "1 2222222222222222222 6 6 1 220.03 220.03 220 \n",
+ "2 3333333333333333333 4 4 1 600.04 600.04 600 \n",
+ "3 4444444444444444444 5 5 1 200.05 200.05 200 \n",
+ "4 5555555555555555555 0 0 1 400.06 400.06 400 \n",
+ "5 6666666666666666666 2 2 1 700.07 700.07 700 \n",
+ "6 7777777777777777777 3 3 1 500.08 500.08 500 \n",
+ "7 8888888888888888888 1 1 1 800.09 800.09 800 \n",
"\n",
- " left_width right_width \n",
- "0 5 5 \n",
- "1 5 5 \n",
- "2 5 5 \n",
- "3 5 5 \n",
- "4 5 5 \n",
- "5 5 5 \n",
- "6 5 5 "
+ " delta_rt left_width right_width \n",
+ "0 0.01 5 5 \n",
+ "1 0.01 5 5 \n",
+ "2 0.01 5 5 \n",
+ "3 0.01 5 5 \n",
+ "4 0.01 5 5 \n",
+ "5 0.01 5 5 \n",
+ "6 0.01 5 5 \n",
+ "7 0.01 5 5 "
]
},
- "execution_count": 14,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1247,7 +1322,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 18,
"id": "9b2a1cfc-dcca-45f3-9aa4-e7f75382de3c",
"metadata": {},
"outputs": [],
@@ -1262,7 +1337,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 19,
"id": "4806f254-1c29-4aa5-9822-6cb8c6ea730c",
"metadata": {},
"outputs": [],
@@ -1288,17 +1363,17 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 20,
"id": "8d03f3ca-a4b0-48e9-bd7f-3c1c7b8d6874",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 17,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -1310,7 +1385,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 21,
"id": "aae782e1-c9f9-4c2c-84f7-a7741e385a5e",
"metadata": {},
"outputs": [
@@ -1358,10 +1433,10 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " 220010.0 | \n",
- " 5 | \n",
- " 220.01 | \n",
+ " 1111111111111111111 | \n",
+ " 220020.0 | \n",
+ " 6 | \n",
+ " 220.02 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1379,10 +1454,10 @@
"
\n",
" \n",
" | 1 | \n",
- " 1 | \n",
- " 220020.0 | \n",
- " 5 | \n",
- " 220.02 | \n",
+ " 2222222222222222222 | \n",
+ " 220030.0 | \n",
+ " 6 | \n",
+ " 220.03 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1400,10 +1475,10 @@
"
\n",
" \n",
" | 2 | \n",
- " 2 | \n",
- " 600030.0 | \n",
- " 3 | \n",
- " 600.03 | \n",
+ " 3333333333333333333 | \n",
+ " 600040.0 | \n",
+ " 4 | \n",
+ " 600.04 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1421,10 +1496,10 @@
"
\n",
" \n",
" | 3 | \n",
- " 3 | \n",
- " 200040.0 | \n",
- " 4 | \n",
- " 200.04 | \n",
+ " 4444444444444444444 | \n",
+ " 200050.0 | \n",
+ " 5 | \n",
+ " 200.05 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1442,10 +1517,10 @@
"
\n",
" \n",
" | 4 | \n",
- " 4 | \n",
- " 400050.0 | \n",
+ " 5555555555555555555 | \n",
+ " 400060.0 | \n",
" 0 | \n",
- " 400.05 | \n",
+ " 400.06 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1463,10 +1538,10 @@
"
\n",
" \n",
" | 5 | \n",
- " 5 | \n",
- " 700060.0 | \n",
- " 1 | \n",
- " 700.06 | \n",
+ " 6666666666666666666 | \n",
+ " 700070.0 | \n",
+ " 2 | \n",
+ " 700.07 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1484,10 +1559,31 @@
"
\n",
" \n",
" | 6 | \n",
- " 6 | \n",
- " 500070.0 | \n",
- " 2 | \n",
- " 500.07 | \n",
+ " 7777777777777777777 | \n",
+ " 500080.0 | \n",
+ " 3 | \n",
+ " 500.08 | \n",
+ " 0.01 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8888888888888888888 | \n",
+ " 800090.0 | \n",
+ " 1 | \n",
+ " 800.09 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1508,14 +1604,15 @@
""
],
"text/plain": [
- " feature_id area_intensity apex_intensity exp_im delta_im \\\n",
- "0 0 220010.0 5 220.01 0.01 \n",
- "1 1 220020.0 5 220.02 0.01 \n",
- "2 2 600030.0 3 600.03 0.01 \n",
- "3 3 200040.0 4 200.04 0.01 \n",
- "4 4 400050.0 0 400.05 0.01 \n",
- "5 5 700060.0 1 700.06 0.01 \n",
- "6 6 500070.0 2 500.07 0.01 \n",
+ " feature_id area_intensity apex_intensity exp_im delta_im \\\n",
+ "0 1111111111111111111 220020.0 6 220.02 0.01 \n",
+ "1 2222222222222222222 220030.0 6 220.03 0.01 \n",
+ "2 3333333333333333333 600040.0 4 600.04 0.01 \n",
+ "3 4444444444444444444 200050.0 5 200.05 0.01 \n",
+ "4 5555555555555555555 400060.0 0 400.06 0.01 \n",
+ "5 6666666666666666666 700070.0 2 700.07 0.01 \n",
+ "6 7777777777777777777 500080.0 3 500.08 0.01 \n",
+ "7 8888888888888888888 800090.0 1 800.09 0.01 \n",
"\n",
" var_massdev_score var_mi_score var_mi_contrast_score \\\n",
"0 1 1 1 \n",
@@ -1525,6 +1622,7 @@
"4 1 1 1 \n",
"5 1 1 1 \n",
"6 1 1 1 \n",
+ "7 1 1 1 \n",
"\n",
" var_mi_combined_score var_isotope_correlation_score \\\n",
"0 1 1 \n",
@@ -1534,6 +1632,7 @@
"4 1 1 \n",
"5 1 1 \n",
"6 1 1 \n",
+ "7 1 1 \n",
"\n",
" var_isotope_overlap_score var_im_ms1_delta_score var_xcorr_coelution \\\n",
"0 1 1 1 \n",
@@ -1543,6 +1642,7 @@
"4 1 1 1 \n",
"5 1 1 1 \n",
"6 1 1 1 \n",
+ "7 1 1 1 \n",
"\n",
" var_xcorr_coelution_contrast var_xcorr_coelution_combined \\\n",
"0 1 1 \n",
@@ -1552,6 +1652,7 @@
"4 1 1 \n",
"5 1 1 \n",
"6 1 1 \n",
+ "7 1 1 \n",
"\n",
" var_xcorr_shape var_xcorr_shape_contrast var_xcorr_shape_combined \n",
"0 1 1 1 \n",
@@ -1560,41 +1661,43 @@
"3 1 1 1 \n",
"4 1 1 1 \n",
"5 1 1 1 \n",
- "6 1 1 1 "
+ "6 1 1 1 \n",
+ "7 1 1 1 "
]
},
- "execution_count": 18,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "length = 8\n",
"feature_ms1 = features[['id']].copy().rename(columns={'id':'feature_id'})\n",
"\n",
"feature_ms1['area_intensity'] = features['exp_rt'] * 1000 \n",
"feature_ms1['apex_intensity'] = features['precursor_id']\n",
"feature_ms1['exp_im'] = features['exp_im']\n",
"feature_ms1['delta_im'] = features['delta_rt']\n",
- "feature_ms1['var_massdev_score'] = [1] *7\n",
- "feature_ms1['var_mi_score'] = [1] *7\n",
- "feature_ms1['var_mi_contrast_score'] = [1] *7\n",
- "feature_ms1['var_mi_combined_score'] = [1] *7\n",
- "feature_ms1['var_isotope_correlation_score'] = [1] *7\n",
- "feature_ms1['var_isotope_overlap_score'] = [1] *7\n",
- "feature_ms1['var_im_ms1_delta_score'] = [1] *7\n",
- "feature_ms1['var_xcorr_coelution'] = [1] *7\n",
- "feature_ms1['var_xcorr_coelution_contrast'] = [1] *7\n",
- "feature_ms1['var_xcorr_coelution_combined'] = [1] *7\n",
- "feature_ms1['var_xcorr_shape'] = [1] *7\n",
- "feature_ms1['var_xcorr_shape_contrast'] = [1] *7\n",
- "feature_ms1['var_xcorr_shape_combined'] = [1] *7\n",
+ "feature_ms1['var_massdev_score'] = [1] * length\n",
+ "feature_ms1['var_mi_score'] = [1] * length\n",
+ "feature_ms1['var_mi_contrast_score'] = [1] * length\n",
+ "feature_ms1['var_mi_combined_score'] = [1] * length\n",
+ "feature_ms1['var_isotope_correlation_score'] = [1] * length\n",
+ "feature_ms1['var_isotope_overlap_score'] = [1] * length\n",
+ "feature_ms1['var_im_ms1_delta_score'] = [1] * length\n",
+ "feature_ms1['var_xcorr_coelution'] = [1] * length\n",
+ "feature_ms1['var_xcorr_coelution_contrast'] = [1] * length\n",
+ "feature_ms1['var_xcorr_coelution_combined'] = [1] * length\n",
+ "feature_ms1['var_xcorr_shape'] = [1] *length\n",
+ "feature_ms1['var_xcorr_shape_contrast'] = [1] * length\n",
+ "feature_ms1['var_xcorr_shape_combined'] = [1] * length\n",
"\n",
"feature_ms1"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 22,
"id": "8480c6a7-23ab-4a22-9da9-998cbc8606ac",
"metadata": {},
"outputs": [],
@@ -1609,7 +1712,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 23,
"id": "98ff8eac-5acd-435a-b292-fe64d590ea51",
"metadata": {},
"outputs": [],
@@ -1635,17 +1738,17 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 24,
"id": "7691c149-7f67-48de-9bff-2856a44d40eb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 21,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1657,7 +1760,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 25,
"id": "863fcd87-d051-4bc9-b88c-8535cbc90c4a",
"metadata": {
"scrolled": true,
@@ -1719,7 +1822,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 26,
"id": "30f29e9b-b345-4a02-ba63-74dabf69555b",
"metadata": {},
"outputs": [
@@ -1770,11 +1873,11 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " 220010.0 | \n",
- " 220010.0 | \n",
- " 5 | \n",
- " 220.01 | \n",
+ " 1111111111111111111 | \n",
+ " 220020.0 | \n",
+ " 220020.0 | \n",
+ " 6 | \n",
+ " 220.02 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1794,11 +1897,11 @@
"
\n",
" \n",
" | 1 | \n",
- " 1 | \n",
- " 220020.0 | \n",
- " 220020.0 | \n",
- " 5 | \n",
- " 220.02 | \n",
+ " 2222222222222222222 | \n",
+ " 220030.0 | \n",
+ " 220030.0 | \n",
+ " 6 | \n",
+ " 220.03 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1818,11 +1921,11 @@
"
\n",
" \n",
" | 2 | \n",
- " 2 | \n",
- " 600030.0 | \n",
- " 600030.0 | \n",
- " 3 | \n",
- " 600.03 | \n",
+ " 3333333333333333333 | \n",
+ " 600040.0 | \n",
+ " 600040.0 | \n",
+ " 4 | \n",
+ " 600.04 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1842,11 +1945,11 @@
"
\n",
" \n",
" | 3 | \n",
- " 3 | \n",
- " 200040.0 | \n",
- " 200040.0 | \n",
- " 4 | \n",
- " 200.04 | \n",
+ " 4444444444444444444 | \n",
+ " 200050.0 | \n",
+ " 200050.0 | \n",
+ " 5 | \n",
+ " 200.05 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1866,11 +1969,11 @@
"
\n",
" \n",
" | 4 | \n",
- " 4 | \n",
- " 400050.0 | \n",
- " 400050.0 | \n",
+ " 5555555555555555555 | \n",
+ " 400060.0 | \n",
+ " 400060.0 | \n",
" 0 | \n",
- " 400.05 | \n",
+ " 400.06 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1890,11 +1993,11 @@
"
\n",
" \n",
" | 5 | \n",
- " 5 | \n",
- " 700060.0 | \n",
- " 700060.0 | \n",
- " 1 | \n",
- " 700.06 | \n",
+ " 6666666666666666666 | \n",
+ " 700070.0 | \n",
+ " 700070.0 | \n",
+ " 2 | \n",
+ " 700.07 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1914,11 +2017,35 @@
"
\n",
" \n",
" | 6 | \n",
- " 6 | \n",
- " 500070.0 | \n",
- " 500070.0 | \n",
- " 2 | \n",
- " 500.07 | \n",
+ " 7777777777777777777 | \n",
+ " 500080.0 | \n",
+ " 500080.0 | \n",
+ " 3 | \n",
+ " 500.08 | \n",
+ " 0.01 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8888888888888888888 | \n",
+ " 800090.0 | \n",
+ " 800090.0 | \n",
+ " 1 | \n",
+ " 800.09 | \n",
" 0.01 | \n",
" 1 | \n",
" 1 | \n",
@@ -1938,27 +2065,29 @@
"
\n",
" \n",
"\n",
- "7 rows × 41 columns
\n",
+ "8 rows × 41 columns
\n",
""
],
"text/plain": [
- " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY EXP_IM \\\n",
- "0 0 220010.0 220010.0 5 220.01 \n",
- "1 1 220020.0 220020.0 5 220.02 \n",
- "2 2 600030.0 600030.0 3 600.03 \n",
- "3 3 200040.0 200040.0 4 200.04 \n",
- "4 4 400050.0 400050.0 0 400.05 \n",
- "5 5 700060.0 700060.0 1 700.06 \n",
- "6 6 500070.0 500070.0 2 500.07 \n",
+ " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY \\\n",
+ "0 1111111111111111111 220020.0 220020.0 6 \n",
+ "1 2222222222222222222 220030.0 220030.0 6 \n",
+ "2 3333333333333333333 600040.0 600040.0 4 \n",
+ "3 4444444444444444444 200050.0 200050.0 5 \n",
+ "4 5555555555555555555 400060.0 400060.0 0 \n",
+ "5 6666666666666666666 700070.0 700070.0 2 \n",
+ "6 7777777777777777777 500080.0 500080.0 3 \n",
+ "7 8888888888888888888 800090.0 800090.0 1 \n",
"\n",
- " DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n",
- "0 0.01 1 1 1 \n",
- "1 0.01 1 1 1 \n",
- "2 0.01 1 1 1 \n",
- "3 0.01 1 1 1 \n",
- "4 0.01 1 1 1 \n",
- "5 0.01 1 1 1 \n",
- "6 0.01 1 1 1 \n",
+ " EXP_IM DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n",
+ "0 220.02 0.01 1 1 1 \n",
+ "1 220.03 0.01 1 1 1 \n",
+ "2 600.04 0.01 1 1 1 \n",
+ "3 200.05 0.01 1 1 1 \n",
+ "4 400.06 0.01 1 1 1 \n",
+ "5 700.07 0.01 1 1 1 \n",
+ "6 500.08 0.01 1 1 1 \n",
+ "7 800.09 0.01 1 1 1 \n",
"\n",
" VAR_INTENSITY_SCORE ... VAR_ELUTION_MODEL_FIT_SCORE VAR_IM_XCORR_SHAPE \\\n",
"0 1 ... 1 1 \n",
@@ -1968,6 +2097,7 @@
"4 1 ... 1 1 \n",
"5 1 ... 1 1 \n",
"6 1 ... 1 1 \n",
+ "7 1 ... 1 1 \n",
"\n",
" VAR_IM_XCORR_COELUTION VAR_IM_DELTA_SCORE VAR_SONAR_LAG VAR_SONAR_SHAPE \\\n",
"0 1 1 1 1 \n",
@@ -1977,6 +2107,7 @@
"4 1 1 1 1 \n",
"5 1 1 1 1 \n",
"6 1 1 1 1 \n",
+ "7 1 1 1 1 \n",
"\n",
" VAR_SONAR_LOG_SN VAR_SONAR_LOG_DIFF VAR_SONAR_LOG_TREND VAR_SONAR_RSQ \n",
"0 1 1 1 1 \n",
@@ -1986,63 +2117,65 @@
"4 1 1 1 1 \n",
"5 1 1 1 1 \n",
"6 1 1 1 1 \n",
+ "7 1 1 1 1 \n",
"\n",
- "[7 rows x 41 columns]"
+ "[8 rows x 41 columns]"
]
},
- "execution_count": 23,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "length = 8\n",
"feature_ms2 = feature_ms1[['feature_id']].copy()\n",
"feature_ms2['AREA_INTENSITY'] = feature_ms1['area_intensity']\n",
"feature_ms2['TOTAL_AREA_INTENSITY'] = feature_ms2['AREA_INTENSITY']\n",
"feature_ms2['APEX_INTENSITY'] = feature_ms1['apex_intensity']\n",
"feature_ms2['EXP_IM'] = feature_ms1['exp_im']\n",
"feature_ms2['DELTA_IM'] = feature_ms1['delta_im']\n",
- "feature_ms2['TOTAL_MI'] = [1] *7 \n",
- "feature_ms2['VAR_BSERIES_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_DOTPROD_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_INTENSITY_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_LIBRARY_CORR'] = [1] *7 \n",
- "feature_ms2['VAR_LIBRARY_DOTPROD'] = [1] *7 \n",
- "feature_ms2['VAR_LIBRARY_MANHATTAN'] = [1] *7 \n",
- "feature_ms2['VAR_LIBRARY_RMSD'] = [1] *7 \n",
- "feature_ms2['VAR_LIBRARY_ROOTMEANSQUARE'] = [1] *7 \n",
- "feature_ms2['VAR_LIBRARY_SANGLE'] = [1] *7 \n",
- "feature_ms2['VAR_LOG_SN_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_MANHATTAN_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_MASSDEV_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_MASSDEV_SCORE_WEIGHTED'] = [1] *7 \n",
- "feature_ms2['VAR_MI_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_MI_WEIGHTED_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_MI_RATIO_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_NORM_RT_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_XCORR_COELUTION'] = [1] *7 \n",
- "feature_ms2['VAR_XCORR_COELUTION_WEIGHTED'] = [1] *7 \n",
- "feature_ms2['VAR_XCORR_SHAPE'] = [1] *7 \n",
- "feature_ms2['VAR_XCORR_SHAPE_WEIGHTED'] = [1] *7 \n",
- "feature_ms2['VAR_YSERIES_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_ELUTION_MODEL_FIT_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_IM_XCORR_SHAPE'] = [1] *7 \n",
- "feature_ms2['VAR_IM_XCORR_COELUTION'] = [1] *7 \n",
- "feature_ms2['VAR_IM_DELTA_SCORE'] = [1] *7 \n",
- "feature_ms2['VAR_SONAR_LAG'] = [1] *7 \n",
- "feature_ms2['VAR_SONAR_SHAPE'] = [1] *7 \n",
- "feature_ms2['VAR_SONAR_LOG_SN'] = [1] *7 \n",
- "feature_ms2['VAR_SONAR_LOG_DIFF'] = [1] *7 \n",
- "feature_ms2['VAR_SONAR_LOG_TREND'] = [1] *7 \n",
- "feature_ms2['VAR_SONAR_RSQ'] = [1] *7 \n",
+ "feature_ms2['TOTAL_MI'] = [1] * length \n",
+ "feature_ms2['VAR_BSERIES_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_DOTPROD_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_INTENSITY_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_LIBRARY_CORR'] = [1] * length \n",
+ "feature_ms2['VAR_LIBRARY_DOTPROD'] = [1] * length \n",
+ "feature_ms2['VAR_LIBRARY_MANHATTAN'] = [1] * length \n",
+ "feature_ms2['VAR_LIBRARY_RMSD'] = [1] * length \n",
+ "feature_ms2['VAR_LIBRARY_ROOTMEANSQUARE'] = [1] * length \n",
+ "feature_ms2['VAR_LIBRARY_SANGLE'] = [1] * length \n",
+ "feature_ms2['VAR_LOG_SN_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_MANHATTAN_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_MASSDEV_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_MASSDEV_SCORE_WEIGHTED'] = [1] * length \n",
+ "feature_ms2['VAR_MI_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_MI_WEIGHTED_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_MI_RATIO_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_NORM_RT_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_XCORR_COELUTION'] = [1] * length \n",
+ "feature_ms2['VAR_XCORR_COELUTION_WEIGHTED'] = [1] * length \n",
+ "feature_ms2['VAR_XCORR_SHAPE'] = [1] * length \n",
+ "feature_ms2['VAR_XCORR_SHAPE_WEIGHTED'] = [1] * length \n",
+ "feature_ms2['VAR_YSERIES_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_ELUTION_MODEL_FIT_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_IM_XCORR_SHAPE'] = [1] * length \n",
+ "feature_ms2['VAR_IM_XCORR_COELUTION'] = [1] * length \n",
+ "feature_ms2['VAR_IM_DELTA_SCORE'] = [1] * length \n",
+ "feature_ms2['VAR_SONAR_LAG'] = [1] * length \n",
+ "feature_ms2['VAR_SONAR_SHAPE'] = [1] * length \n",
+ "feature_ms2['VAR_SONAR_LOG_SN'] = [1] * length \n",
+ "feature_ms2['VAR_SONAR_LOG_DIFF'] = [1] * length \n",
+ "feature_ms2['VAR_SONAR_LOG_TREND'] = [1] * length \n",
+ "feature_ms2['VAR_SONAR_RSQ'] = [1] * length \n",
"feature_ms2"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 27,
"id": "d93163c0-20a1-4d98-86de-71c6d265d418",
"metadata": {},
"outputs": [],
@@ -2057,7 +2190,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 28,
"id": "db1cbc3f-e463-43a6-895a-979c7aafe393",
"metadata": {},
"outputs": [],
@@ -2083,17 +2216,17 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 29,
"id": "bca8bfba-86e9-497c-ae94-4c0f679b45f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 26,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -2105,7 +2238,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 30,
"id": "4678179b-aea7-460f-ad71-d157a1e3ce38",
"metadata": {},
"outputs": [],
@@ -2115,7 +2248,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 31,
"id": "231cf0c6-01ac-4061-b1a9-d68404f793b3",
"metadata": {},
"outputs": [],
@@ -2125,7 +2258,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 32,
"id": "7f730c1c-73a3-4ce1-a2b3-35b064edd558",
"metadata": {},
"outputs": [],
@@ -2136,7 +2269,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 33,
"id": "46a4405d-c5c9-48a6-aee5-abb2a3b9b047",
"metadata": {},
"outputs": [
@@ -2172,88 +2305,88 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " 5 | \n",
+ " 1111111111111111111 | \n",
+ " 6 | \n",
+ " 6 | \n",
" 6 | \n",
- " 5 | \n",
" YYYYYR3_b1^2 | \n",
" 221.0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 1 | \n",
- " 5 | \n",
+ " 1111111111111111111 | \n",
" 6 | \n",
- " 5 | \n",
- " YYYYYR3_b1^2 | \n",
- " 221.0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 0 | \n",
- " 5 | \n",
" 7 | \n",
- " 5 | \n",
+ " 6 | \n",
" YYYYYR3_y2^2 | \n",
" 222.0 | \n",
"
\n",
" \n",
+ " | 2 | \n",
+ " 2222222222222222222 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " 6 | \n",
+ " YYYYYR3_b1^2 | \n",
+ " 221.0 | \n",
+ "
\n",
+ " \n",
" | 3 | \n",
- " 1 | \n",
- " 5 | \n",
+ " 2222222222222222222 | \n",
+ " 6 | \n",
" 7 | \n",
- " 5 | \n",
+ " 6 | \n",
" YYYYYR3_y2^2 | \n",
" 222.0 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 2 | \n",
- " 3 | \n",
+ " 3333333333333333333 | \n",
+ " 4 | \n",
" 15 | \n",
- " 3 | \n",
+ " 4 | \n",
" TTTTTTTTTTTTK2_b1^2 | \n",
" 601.0 | \n",
"
\n",
" \n",
" | 5 | \n",
- " 2 | \n",
- " 3 | \n",
+ " 3333333333333333333 | \n",
+ " 4 | \n",
" 16 | \n",
- " 3 | \n",
+ " 4 | \n",
" TTTTTTTTTTTTK2_y2^2 | \n",
" 602.0 | \n",
"
\n",
" \n",
" | 6 | \n",
+ " 4444444444444444444 | \n",
+ " 5 | \n",
" 3 | \n",
- " 4 | \n",
- " 3 | \n",
- " 4 | \n",
+ " 5 | \n",
" YYYYYR2_b1^2 | \n",
" 201.0 | \n",
"
\n",
" \n",
" | 7 | \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
+ " 4444444444444444444 | \n",
+ " 5 | \n",
" 4 | \n",
+ " 5 | \n",
" YYYYYR2_y2^2 | \n",
" 202.0 | \n",
"
\n",
" \n",
" | 8 | \n",
- " 3 | \n",
- " 4 | \n",
+ " 4444444444444444444 | \n",
+ " 5 | \n",
+ " 5 | \n",
" 5 | \n",
- " 4 | \n",
" YYYYYR2_b3^2 | \n",
" 203.0 | \n",
"
\n",
" \n",
" | 9 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 0 | \n",
" 8 | \n",
" 0 | \n",
@@ -2262,7 +2395,7 @@
"
\n",
" \n",
" | 10 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 0 | \n",
" 9 | \n",
" 0 | \n",
@@ -2271,7 +2404,7 @@
"
\n",
" \n",
" | 11 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 0 | \n",
" 10 | \n",
" 0 | \n",
@@ -2280,7 +2413,7 @@
"
\n",
" \n",
" | 12 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 0 | \n",
" 11 | \n",
" 0 | \n",
@@ -2289,88 +2422,118 @@
"
\n",
" \n",
" | 13 | \n",
- " 5 | \n",
- " 1 | \n",
+ " 6666666666666666666 | \n",
+ " 2 | \n",
" 17 | \n",
- " 1 | \n",
+ " 2 | \n",
" TTR3_b1^3 | \n",
" 701.0 | \n",
"
\n",
" \n",
" | 14 | \n",
- " 5 | \n",
- " 1 | \n",
+ " 6666666666666666666 | \n",
+ " 2 | \n",
" 18 | \n",
- " 1 | \n",
+ " 2 | \n",
" TTR3_y2^3 | \n",
" 702.0 | \n",
"
\n",
" \n",
" | 15 | \n",
- " 5 | \n",
- " 1 | \n",
+ " 6666666666666666666 | \n",
+ " 2 | \n",
" 19 | \n",
- " 1 | \n",
+ " 2 | \n",
" TTR3_b3^3 | \n",
" 703.0 | \n",
"
\n",
" \n",
" | 16 | \n",
- " 6 | \n",
- " 2 | \n",
+ " 7777777777777777777 | \n",
+ " 3 | \n",
" 12 | \n",
- " 2 | \n",
+ " 3 | \n",
" TTTTTTTR2_b1^2 | \n",
" 501.0 | \n",
"
\n",
" \n",
" | 17 | \n",
- " 6 | \n",
- " 2 | \n",
+ " 7777777777777777777 | \n",
+ " 3 | \n",
" 13 | \n",
- " 2 | \n",
+ " 3 | \n",
" TTTTTTTR2_y2^2 | \n",
" 502.0 | \n",
"
\n",
" \n",
" | 18 | \n",
- " 6 | \n",
- " 2 | \n",
+ " 7777777777777777777 | \n",
+ " 3 | \n",
" 14 | \n",
- " 2 | \n",
+ " 3 | \n",
" TTTTTTTR2_b3^2 | \n",
" 503.0 | \n",
"
\n",
+ " \n",
+ " | 19 | \n",
+ " 8888888888888888888 | \n",
+ " 1 | \n",
+ " 20 | \n",
+ " 1 | \n",
+ " TTK3_b1^3 | \n",
+ " 801.0 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " 8888888888888888888 | \n",
+ " 1 | \n",
+ " 21 | \n",
+ " 1 | \n",
+ " TTK3_y2^3 | \n",
+ " 802.0 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 8888888888888888888 | \n",
+ " 1 | \n",
+ " 22 | \n",
+ " 1 | \n",
+ " TTK3_b3^3 | \n",
+ " 803.0 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n",
- "0 0 5 6 5 \n",
- "1 1 5 6 5 \n",
- "2 0 5 7 5 \n",
- "3 1 5 7 5 \n",
- "4 2 3 15 3 \n",
- "5 2 3 16 3 \n",
- "6 3 4 3 4 \n",
- "7 3 4 4 4 \n",
- "8 3 4 5 4 \n",
- "9 4 0 8 0 \n",
- "10 4 0 9 0 \n",
- "11 4 0 10 0 \n",
- "12 4 0 11 0 \n",
- "13 5 1 17 1 \n",
- "14 5 1 18 1 \n",
- "15 5 1 19 1 \n",
- "16 6 2 12 2 \n",
- "17 6 2 13 2 \n",
- "18 6 2 14 2 \n",
+ " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n",
+ "0 1111111111111111111 6 6 6 \n",
+ "1 1111111111111111111 6 7 6 \n",
+ "2 2222222222222222222 6 6 6 \n",
+ "3 2222222222222222222 6 7 6 \n",
+ "4 3333333333333333333 4 15 4 \n",
+ "5 3333333333333333333 4 16 4 \n",
+ "6 4444444444444444444 5 3 5 \n",
+ "7 4444444444444444444 5 4 5 \n",
+ "8 4444444444444444444 5 5 5 \n",
+ "9 5555555555555555555 0 8 0 \n",
+ "10 5555555555555555555 0 9 0 \n",
+ "11 5555555555555555555 0 10 0 \n",
+ "12 5555555555555555555 0 11 0 \n",
+ "13 6666666666666666666 2 17 2 \n",
+ "14 6666666666666666666 2 18 2 \n",
+ "15 6666666666666666666 2 19 2 \n",
+ "16 7777777777777777777 3 12 3 \n",
+ "17 7777777777777777777 3 13 3 \n",
+ "18 7777777777777777777 3 14 3 \n",
+ "19 8888888888888888888 1 20 1 \n",
+ "20 8888888888888888888 1 21 1 \n",
+ "21 8888888888888888888 1 22 1 \n",
"\n",
" TRAML_ID PRODUCT_MZ \n",
"0 YYYYYR3_b1^2 221.0 \n",
- "1 YYYYYR3_b1^2 221.0 \n",
- "2 YYYYYR3_y2^2 222.0 \n",
+ "1 YYYYYR3_y2^2 222.0 \n",
+ "2 YYYYYR3_b1^2 221.0 \n",
"3 YYYYYR3_y2^2 222.0 \n",
"4 TTTTTTTTTTTTK2_b1^2 601.0 \n",
"5 TTTTTTTTTTTTK2_y2^2 602.0 \n",
@@ -2386,10 +2549,13 @@
"15 TTR3_b3^3 703.0 \n",
"16 TTTTTTTR2_b1^2 501.0 \n",
"17 TTTTTTTR2_y2^2 502.0 \n",
- "18 TTTTTTTR2_b3^2 503.0 "
+ "18 TTTTTTTR2_b3^2 503.0 \n",
+ "19 TTK3_b1^3 801.0 \n",
+ "20 TTK3_y2^3 802.0 \n",
+ "21 TTK3_b3^3 803.0 "
]
},
- "execution_count": 30,
+ "execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@@ -2400,7 +2566,18 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 34,
+ "id": "f0b98de2-dead-43c2-85ff-40706699a9ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the new feature ids are too large for arithmetic without overflow, so use only their first digit in calculations\n",
+ "psuedo_feature_id = (feature_transition['feature_id'].astype(str).str.slice(start=0, stop=1)).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
"id": "cf55c2d6-fa9f-433a-bb8f-931984f48bbe",
"metadata": {},
"outputs": [
@@ -2447,7 +2624,7 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
+ " 1111111111111111111 | \n",
" 6 | \n",
" 221.0 | \n",
" 221.0 | \n",
@@ -2467,10 +2644,10 @@
"
\n",
" \n",
" | 1 | \n",
- " 1 | \n",
- " 6 | \n",
- " 442.0 | \n",
- " 442.0 | \n",
+ " 1111111111111111111 | \n",
+ " 7 | \n",
+ " 222.0 | \n",
+ " 222.0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -2487,10 +2664,10 @@
"
\n",
" \n",
" | 2 | \n",
- " 0 | \n",
- " 7 | \n",
- " 222.0 | \n",
- " 222.0 | \n",
+ " 2222222222222222222 | \n",
+ " 6 | \n",
+ " 442.0 | \n",
+ " 442.0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -2507,7 +2684,7 @@
"
\n",
" \n",
" | 3 | \n",
- " 1 | \n",
+ " 2222222222222222222 | \n",
" 7 | \n",
" 444.0 | \n",
" 444.0 | \n",
@@ -2527,7 +2704,7 @@
"
\n",
" \n",
" | 4 | \n",
- " 2 | \n",
+ " 3333333333333333333 | \n",
" 15 | \n",
" 1803.0 | \n",
" 1803.0 | \n",
@@ -2547,7 +2724,7 @@
"
\n",
" \n",
" | 5 | \n",
- " 2 | \n",
+ " 3333333333333333333 | \n",
" 16 | \n",
" 1806.0 | \n",
" 1806.0 | \n",
@@ -2567,7 +2744,7 @@
"
\n",
" \n",
" | 6 | \n",
- " 3 | \n",
+ " 4444444444444444444 | \n",
" 3 | \n",
" 804.0 | \n",
" 804.0 | \n",
@@ -2587,7 +2764,7 @@
"
\n",
" \n",
" | 7 | \n",
- " 3 | \n",
+ " 4444444444444444444 | \n",
" 4 | \n",
" 808.0 | \n",
" 808.0 | \n",
@@ -2607,7 +2784,7 @@
"
\n",
" \n",
" | 8 | \n",
- " 3 | \n",
+ " 4444444444444444444 | \n",
" 5 | \n",
" 812.0 | \n",
" 812.0 | \n",
@@ -2627,7 +2804,7 @@
"
\n",
" \n",
" | 9 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 8 | \n",
" 2005.0 | \n",
" 2005.0 | \n",
@@ -2647,7 +2824,7 @@
"
\n",
" \n",
" | 10 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 9 | \n",
" 2010.0 | \n",
" 2010.0 | \n",
@@ -2667,7 +2844,7 @@
"
\n",
" \n",
" | 11 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 10 | \n",
" 2015.0 | \n",
" 2015.0 | \n",
@@ -2687,7 +2864,7 @@
"
\n",
" \n",
" | 12 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 11 | \n",
" 2020.0 | \n",
" 2020.0 | \n",
@@ -2707,7 +2884,7 @@
"
\n",
" \n",
" | 13 | \n",
- " 5 | \n",
+ " 6666666666666666666 | \n",
" 17 | \n",
" 4206.0 | \n",
" 4206.0 | \n",
@@ -2727,7 +2904,7 @@
"
\n",
" \n",
" | 14 | \n",
- " 5 | \n",
+ " 6666666666666666666 | \n",
" 18 | \n",
" 4212.0 | \n",
" 4212.0 | \n",
@@ -2747,7 +2924,7 @@
"
\n",
" \n",
" | 15 | \n",
- " 5 | \n",
+ " 6666666666666666666 | \n",
" 19 | \n",
" 4218.0 | \n",
" 4218.0 | \n",
@@ -2767,7 +2944,7 @@
"
\n",
" \n",
" | 16 | \n",
- " 6 | \n",
+ " 7777777777777777777 | \n",
" 12 | \n",
" 3507.0 | \n",
" 3507.0 | \n",
@@ -2787,7 +2964,7 @@
"
\n",
" \n",
" | 17 | \n",
- " 6 | \n",
+ " 7777777777777777777 | \n",
" 13 | \n",
" 3514.0 | \n",
" 3514.0 | \n",
@@ -2807,7 +2984,7 @@
"
\n",
" \n",
" | 18 | \n",
- " 6 | \n",
+ " 7777777777777777777 | \n",
" 14 | \n",
" 3521.0 | \n",
" 3521.0 | \n",
@@ -2825,31 +3002,94 @@
" 1 | \n",
" 1 | \n",
"
\n",
+ " \n",
+ " | 19 | \n",
+ " 8888888888888888888 | \n",
+ " 20 | \n",
+ " 6408.0 | \n",
+ " 6408.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " 8888888888888888888 | \n",
+ " 21 | \n",
+ " 6416.0 | \n",
+ " 6416.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 8888888888888888888 | \n",
+ " 22 | \n",
+ " 6424.0 | \n",
+ " 6424.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n",
- "0 0 6 221.0 221.0 \n",
- "1 1 6 442.0 442.0 \n",
- "2 0 7 222.0 222.0 \n",
- "3 1 7 444.0 444.0 \n",
- "4 2 15 1803.0 1803.0 \n",
- "5 2 16 1806.0 1806.0 \n",
- "6 3 3 804.0 804.0 \n",
- "7 3 4 808.0 808.0 \n",
- "8 3 5 812.0 812.0 \n",
- "9 4 8 2005.0 2005.0 \n",
- "10 4 9 2010.0 2010.0 \n",
- "11 4 10 2015.0 2015.0 \n",
- "12 4 11 2020.0 2020.0 \n",
- "13 5 17 4206.0 4206.0 \n",
- "14 5 18 4212.0 4212.0 \n",
- "15 5 19 4218.0 4218.0 \n",
- "16 6 12 3507.0 3507.0 \n",
- "17 6 13 3514.0 3514.0 \n",
- "18 6 14 3521.0 3521.0 \n",
+ " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n",
+ "0 1111111111111111111 6 221.0 221.0 \n",
+ "1 1111111111111111111 7 222.0 222.0 \n",
+ "2 2222222222222222222 6 442.0 442.0 \n",
+ "3 2222222222222222222 7 444.0 444.0 \n",
+ "4 3333333333333333333 15 1803.0 1803.0 \n",
+ "5 3333333333333333333 16 1806.0 1806.0 \n",
+ "6 4444444444444444444 3 804.0 804.0 \n",
+ "7 4444444444444444444 4 808.0 808.0 \n",
+ "8 4444444444444444444 5 812.0 812.0 \n",
+ "9 5555555555555555555 8 2005.0 2005.0 \n",
+ "10 5555555555555555555 9 2010.0 2010.0 \n",
+ "11 5555555555555555555 10 2015.0 2015.0 \n",
+ "12 5555555555555555555 11 2020.0 2020.0 \n",
+ "13 6666666666666666666 17 4206.0 4206.0 \n",
+ "14 6666666666666666666 18 4212.0 4212.0 \n",
+ "15 6666666666666666666 19 4218.0 4218.0 \n",
+ "16 7777777777777777777 12 3507.0 3507.0 \n",
+ "17 7777777777777777777 13 3514.0 3514.0 \n",
+ "18 7777777777777777777 14 3521.0 3521.0 \n",
+ "19 8888888888888888888 20 6408.0 6408.0 \n",
+ "20 8888888888888888888 21 6416.0 6416.0 \n",
+ "21 8888888888888888888 22 6424.0 6424.0 \n",
"\n",
" APEX_INTENSITY TOTAL_MI VAR_INTENSITY_SCORE VAR_INTENSITY_RATIO_SCORE \\\n",
"0 1 1 1 1 \n",
@@ -2871,6 +3111,9 @@
"16 1 1 1 1 \n",
"17 1 1 1 1 \n",
"18 1 1 1 1 \n",
+ "19 1 1 1 1 \n",
+ "20 1 1 1 1 \n",
+ "21 1 1 1 1 \n",
"\n",
" VAR_LOG_INTENSITY VAR_XCORR_COELUTION VAR_XCORR_SHAPE VAR_LOG_SN_SCORE \\\n",
"0 1 1 1 1 \n",
@@ -2892,6 +3135,9 @@
"16 1 1 1 1 \n",
"17 1 1 1 1 \n",
"18 1 1 1 1 \n",
+ "19 1 1 1 1 \n",
+ "20 1 1 1 1 \n",
+ "21 1 1 1 1 \n",
"\n",
" VAR_MASSDEV_SCORE VAR_MI_SCORE VAR_MI_RATIO_SCORE \\\n",
"0 1 1 1 \n",
@@ -2913,6 +3159,9 @@
"16 1 1 1 \n",
"17 1 1 1 \n",
"18 1 1 1 \n",
+ "19 1 1 1 \n",
+ "20 1 1 1 \n",
+ "21 1 1 1 \n",
"\n",
" VAR_ISOTOPE_CORRELATION_SCORE VAR_ISOTOPE_OVERLAP_SCORE \n",
"0 1 1 \n",
@@ -2933,30 +3182,35 @@
"15 1 1 \n",
"16 1 1 \n",
"17 1 1 \n",
- "18 1 1 "
+ "18 1 1 \n",
+ "19 1 1 \n",
+ "20 1 1 \n",
+ "21 1 1 "
]
},
- "execution_count": 31,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (feature_transition['feature_id'] + 1) #should be equal to product_mz * (feature_id + 1)\n",
+ "length = 22\n",
+ "\n",
+ "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (psuedo_feature_id) # should be equal to product_mz * (first digit of feature_id)\n",
"feature_transition['TOTAL_AREA_INTENSITY'] = feature_transition['AREA_INTENSITY']\n",
- "feature_transition['APEX_INTENSITY'] = [1] * 19\n",
- "feature_transition['TOTAL_MI'] = [1] * 19\n",
- "feature_transition['VAR_INTENSITY_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_INTENSITY_RATIO_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_LOG_INTENSITY'] = [1] * 19\n",
- "feature_transition['VAR_XCORR_COELUTION'] = [1] * 19\n",
- "feature_transition['VAR_XCORR_SHAPE'] = [1] * 19\n",
- "feature_transition['VAR_LOG_SN_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_MASSDEV_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_MI_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_MI_RATIO_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * 19\n",
- "feature_transition['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * 19\n",
+ "feature_transition['APEX_INTENSITY'] = [1] * length\n",
+ "feature_transition['TOTAL_MI'] = [1] * length\n",
+ "feature_transition['VAR_INTENSITY_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_INTENSITY_RATIO_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_LOG_INTENSITY'] = [1] * length\n",
+ "feature_transition['VAR_XCORR_COELUTION'] = [1] * length\n",
+ "feature_transition['VAR_XCORR_SHAPE'] = [1] * length\n",
+ "feature_transition['VAR_LOG_SN_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_MASSDEV_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_MI_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_MI_RATIO_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * length\n",
+ "feature_transition['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * length\n",
"\n",
"feature_transition = feature_transition.drop(columns=['precursor_id', 'PRECURSOR_ID', 'TRAML_ID', 'PRODUCT_MZ'])\n",
"\n",
@@ -2965,7 +3219,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 36,
"id": "588ee10a-7b00-4f61-b305-2a392b5bbd1b",
"metadata": {},
"outputs": [],
@@ -2982,7 +3236,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 37,
"id": "78f13b81-7db1-46ee-947b-723e8b0b340b",
"metadata": {},
"outputs": [],
@@ -3008,7 +3262,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 38,
"id": "8c143ee6-928c-4404-85cc-5fc7b9b1ce85",
"metadata": {},
"outputs": [],
@@ -3036,17 +3290,17 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 39,
"id": "9a24a430-d994-4012-9ac3-37c792e30026",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 35,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
@@ -3058,7 +3312,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 40,
"id": "563c3e64-e528-457c-ba42-f00fabcff0f0",
"metadata": {
"tags": []
@@ -3084,7 +3338,18 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 41,
+ "id": "d2f151a4-a735-4034-b573-d98faf5c7d76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the new feature ids are too large for arithmetic without overflow, so use only their first digit in calculations\n",
+ "psuedo_feature_id = (feature_ms1['feature_id'].astype(str).str.slice(start=0, stop=1)).astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
"id": "c8539aa2-f946-4be8-a891-0b55d6957322",
"metadata": {},
"outputs": [
@@ -3120,8 +3385,8 @@
" \n",
" \n",
" | 0 | \n",
- " 0 | \n",
- " 1320 | \n",
+ " 1111111111111111111 | \n",
+ " 1540 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3129,8 +3394,8 @@
"
\n",
" \n",
" | 1 | \n",
- " 1 | \n",
- " 2640 | \n",
+ " 2222222222222222222 | \n",
+ " 3080 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3138,8 +3403,8 @@
"
\n",
" \n",
" | 2 | \n",
- " 2 | \n",
- " 7200 | \n",
+ " 3333333333333333333 | \n",
+ " 9000 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3147,8 +3412,8 @@
"
\n",
" \n",
" | 3 | \n",
- " 3 | \n",
- " 4000 | \n",
+ " 4444444444444444444 | \n",
+ " 4800 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3156,7 +3421,7 @@
"
\n",
" \n",
" | 4 | \n",
- " 4 | \n",
+ " 5555555555555555555 | \n",
" 2000 | \n",
" 1 | \n",
" 1 | \n",
@@ -3165,8 +3430,8 @@
"
\n",
" \n",
" | 5 | \n",
- " 5 | \n",
- " 8400 | \n",
+ " 6666666666666666666 | \n",
+ " 12600 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3174,8 +3439,17 @@
"
\n",
" \n",
" | 6 | \n",
- " 6 | \n",
- " 10500 | \n",
+ " 7777777777777777777 | \n",
+ " 14000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8888888888888888888 | \n",
+ " 12800 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3186,34 +3460,36 @@
""
],
"text/plain": [
- " feature_id SCORE RANK PVALUE QVALUE PEP\n",
- "0 0 1320 1 1 1 1\n",
- "1 1 2640 1 1 1 1\n",
- "2 2 7200 1 1 1 1\n",
- "3 3 4000 1 1 1 1\n",
- "4 4 2000 1 1 1 1\n",
- "5 5 8400 1 1 1 1\n",
- "6 6 10500 1 1 1 1"
+ " feature_id SCORE RANK PVALUE QVALUE PEP\n",
+ "0 1111111111111111111 1540 1 1 1 1\n",
+ "1 2222222222222222222 3080 1 1 1 1\n",
+ "2 3333333333333333333 9000 1 1 1 1\n",
+ "3 4444444444444444444 4800 1 1 1 1\n",
+ "4 5555555555555555555 2000 1 1 1 1\n",
+ "5 6666666666666666666 12600 1 1 1 1\n",
+ "6 7777777777777777777 14000 1 1 1 1\n",
+ "7 8888888888888888888 12800 1 1 1 1"
]
},
- "execution_count": 37,
+ "execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "length = 8 \n",
"score_ms2 = feature_ms1[['feature_id']].copy()\n",
- "score_ms2['SCORE'] = (features['id'] + 1) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n",
- "score_ms2['RANK'] = [1] *7\n",
- "score_ms2['PVALUE'] = [1] * 7\n",
- "score_ms2['QVALUE'] = [1] *7 \n",
- "score_ms2['PEP'] = [1] *7\n",
+ "score_ms2['SCORE'] = (psuedo_feature_id) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (first digit of feature_id) * (precursor_id + 1) * int(exp_rt)\n",
+ "score_ms2['RANK'] = [1] * length\n",
+ "score_ms2['PVALUE'] = [1] * length\n",
+ "score_ms2['QVALUE'] = [1] * length \n",
+ "score_ms2['PEP'] = [1] * length\n",
"score_ms2"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 43,
"id": "4cbda7cf-0535-4292-bfed-739a5f1bd2b8",
"metadata": {},
"outputs": [],
@@ -3228,7 +3504,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 44,
"id": "adb88443-6d34-4173-8b37-9f52dba9f5e7",
"metadata": {},
"outputs": [],
@@ -3254,17 +3530,17 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 45,
"id": "e0094b3a-5a80-48e4-8041-a537ce409480",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 40,
+ "execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
@@ -3276,7 +3552,7 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 46,
"id": "acf865f3-3353-4baa-b83e-91be2abed776",
"metadata": {
"tags": []
@@ -3303,7 +3579,7 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 47,
"id": "f95142b9-612b-43a8-bb42-356b71839ea6",
"metadata": {},
"outputs": [],
@@ -3313,7 +3589,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 48,
"id": "43828588-c1ff-4943-a7b7-24a968562c4e",
"metadata": {},
"outputs": [
@@ -3347,41 +3623,48 @@
" \n",
" \n",
" | 0 | \n",
- " 3 | \n",
+ " 4 | \n",
" TTTTTTTTTTTTK | \n",
" TTTTTTTTTTTTK | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 2 | \n",
+ " 3 | \n",
" TTTTTTTR | \n",
" TTTTTTTR | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 4 | \n",
+ " 5 | \n",
" YYYYYR | \n",
" YYYYYR | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
- " 1 | \n",
+ " 2 | \n",
" TTR | \n",
" TTR | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
- " 5 | \n",
+ " 1 | \n",
+ " TTK | \n",
+ " TTK | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
" YYYYYYYYYYYK | \n",
" YYYYYYYYYYYK | \n",
" 0 | \n",
"
\n",
" \n",
- " | 5 | \n",
+ " 6 | \n",
" 0 | \n",
" GGGGGGGGGGR | \n",
" GGGGGGGGGGR | \n",
@@ -3393,15 +3676,16 @@
],
"text/plain": [
" ID UNMODIFIED_SEQUENCE MODIFIED_SEQUENCE DECOY\n",
- "0 3 TTTTTTTTTTTTK TTTTTTTTTTTTK 0\n",
- "1 2 TTTTTTTR TTTTTTTR 0\n",
- "2 4 YYYYYR YYYYYR 0\n",
- "3 1 TTR TTR 0\n",
- "4 5 YYYYYYYYYYYK YYYYYYYYYYYK 0\n",
- "5 0 GGGGGGGGGGR GGGGGGGGGGR 0"
+ "0 4 TTTTTTTTTTTTK TTTTTTTTTTTTK 0\n",
+ "1 3 TTTTTTTR TTTTTTTR 0\n",
+ "2 5 YYYYYR YYYYYR 0\n",
+ "3 2 TTR TTR 0\n",
+ "4 1 TTK TTK 1\n",
+ "5 6 YYYYYYYYYYYK YYYYYYYYYYYK 0\n",
+ "6 0 GGGGGGGGGGR GGGGGGGGGGR 0"
]
},
- "execution_count": 43,
+ "execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
@@ -3412,7 +3696,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 49,
"id": "f416e48b-6bb6-4cb7-8d81-597cfd52320c",
"metadata": {
"tags": []
@@ -3425,7 +3709,7 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 50,
"id": "0e1eb1b9-730e-45d4-9618-fd532c1ccc25",
"metadata": {},
"outputs": [],
@@ -3441,7 +3725,7 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 51,
"id": "e9692e06-ddf2-4f74-bb80-f2a92728767b",
"metadata": {},
"outputs": [],
@@ -3456,7 +3740,7 @@
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 52,
"id": "f720c22b-e6fa-4ac0-8402-bdcd2e74840b",
"metadata": {},
"outputs": [],
@@ -3482,17 +3766,17 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 53,
"id": "94c860e0-880a-4091-afb8-af368ed72b26",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 48,
+ "execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
@@ -3504,7 +3788,7 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 54,
"id": "1c053178-b8a5-44ad-876b-49b5fd8afa23",
"metadata": {
"tags": []
@@ -3531,7 +3815,7 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 55,
"id": "d70ba894-55bf-4a36-b306-45b7c5e9d1bd",
"metadata": {},
"outputs": [],
@@ -3541,7 +3825,7 @@
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 56,
"id": "3e1bbbeb-7cc9-4b9f-b898-5148abff911d",
"metadata": {},
"outputs": [
@@ -3574,34 +3858,41 @@
" \n",
" \n",
" | 0 | \n",
- " 2 | \n",
+ " 3 | \n",
" ProtY | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
- " 1 | \n",
+ " 2 | \n",
" ProtT | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
- " 0 | \n",
+ " 1 | \n",
" ProtG | \n",
" 0 | \n",
"
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " Decoy_ProtT | \n",
+ " 1 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
" ID PROTEIN_ACCESSION DECOY\n",
- "0 2 ProtY 0\n",
- "1 1 ProtT 0\n",
- "2 0 ProtG 0"
+ "0 3 ProtY 0\n",
+ "1 2 ProtT 0\n",
+ "2 1 ProtG 0\n",
+ "3 0 Decoy_ProtT 1"
]
},
- "execution_count": 51,
+ "execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
@@ -3612,7 +3903,7 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 57,
"id": "7b3410b1-5d6a-4e85-838c-5ccb3b15f1c5",
"metadata": {},
"outputs": [
@@ -3649,6 +3940,16 @@
" \n",
" \n",
" | 0 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " global | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
@@ -3658,7 +3959,7 @@
" 1 | \n",
"
\n",
" \n",
- " | 1 | \n",
+ " 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
@@ -3668,7 +3969,7 @@
" 1 | \n",
"
\n",
" \n",
- " | 2 | \n",
+ " 3 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
@@ -3683,12 +3984,13 @@
],
"text/plain": [
" PROTEIN_ID SCORE PVALUE QVALUE PEP CONTEXT RUN_ID\n",
- "0 2 2 1 1 1 global 1\n",
- "1 1 1 1 1 1 global 1\n",
- "2 0 0 1 1 1 global 1"
+ "0 3 3 1 1 1 global 1\n",
+ "1 2 2 1 1 1 global 1\n",
+ "2 1 1 1 1 1 global 1\n",
+ "3 0 0 1 1 1 global 1"
]
},
- "execution_count": 52,
+ "execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
@@ -3706,7 +4008,7 @@
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 58,
"id": "51408b81-b650-4787-9050-59d63c0098c0",
"metadata": {},
"outputs": [],
@@ -3721,7 +4023,7 @@
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 59,
"id": "ab6a68b5-f5db-46e7-95e8-6e98a69eb062",
"metadata": {},
"outputs": [],
@@ -3746,7 +4048,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "version": "3.11.10"
}
},
"nbformat": 4,
diff --git a/tests/data/dummyOSWScoredData.osw b/tests/data/dummyOSWScoredData.osw
index c96832a1..82311a35 100644
Binary files a/tests/data/dummyOSWScoredData.osw and b/tests/data/dummyOSWScoredData.osw differ
diff --git a/tests/data/fakeLib.tsv b/tests/data/fakeLib.tsv
index 3831e0e3..d9189e14 100644
--- a/tests/data/fakeLib.tsv
+++ b/tests/data/fakeLib.tsv
@@ -1,21 +1,24 @@
-PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime
-100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10
-100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10
-100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10
-200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20
-200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20
-200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20
-220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20
-220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20
-400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40
-400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40
-400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40
-400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40
-500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50
-500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50
-500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50
-600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60
-600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60
-700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70
-700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70
-700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70
+PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime Decoy
+100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 0
+100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10 0
+100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 0
+200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 0
+200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 0
+200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 0
+220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 0
+220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 0
+400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 0
+400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 0
+400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 0
+400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 0
+500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 0
+500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 0
+500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 0
+600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 0
+600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 0
+700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 0
+700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 0
+700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 0
+800 801 808 80 Decoy_ProtT TTK TTK 3 b 1 3 Decoy_T 80 1
+800 802 808 80 Decoy_ProtT TTK TTK 3 y 2 3 Decoy_T 80 1
+800 803 808 80 Decoy_ProtT TTK TTK 3 b 3 3 Decoy_T 80 1
diff --git a/tests/fakeLib.tsv b/tests/fakeLib.tsv
deleted file mode 100644
index 3831e0e3..00000000
--- a/tests/fakeLib.tsv
+++ /dev/null
@@ -1,21 +0,0 @@
-PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime
-100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10
-100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10
-100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10
-200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20
-200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20
-200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20
-220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20
-220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20
-400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40
-400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40
-400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40
-400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40
-500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50
-500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50
-500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50
-600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60
-600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60
-700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70
-700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70
-700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70
diff --git a/tests/test_pyprophet_export_parquet.py b/tests/test_pyprophet_export_parquet.py
index 6c01696d..647560b6 100644
--- a/tests/test_pyprophet_export_parquet.py
+++ b/tests/test_pyprophet_export_parquet.py
@@ -27,7 +27,7 @@ def _run_cmdline(cmdline):
return stdout
-def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False):
+def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False, noDecoys=False):
os.chdir(temp_folder)
DATA_NAME="dummyOSWScoredData.osw"
data_path = os.path.join(DATA_FOLDER, DATA_NAME)
@@ -41,41 +41,68 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin
cmdline += " --transitionLevel"
if onlyFeatures:
cmdline += " --onlyFeatures"
+ if noDecoys:
+ cmdline += " --noDecoys"
stdout = _run_cmdline(cmdline)
### This file was configured in a way where the following tests should work
parquet = pd.read_parquet("dummyOSWScoredData.parquet") ## automatically with parquet ending of input file name
+ ### CHECK LENGTHS ###
if transitionLevel:
if onlyFeatures: # length of FEATURE_TRANSITION table
- expectedLength = len(pd.read_sql("select * from feature_transition", conn))
+ if not noDecoys:
+ expectedLength = len(pd.read_sql("select * from feature_transition", conn))
+ else:
+ expectedLength = len(pd.read_sql("select * from feature_transition inner join transition on transition.id = feature_transition.transition_id where DECOY == 0", conn))
else:
- featureTransition = pd.read_sql("select * from feature_transition", conn)
- precursorTransition = pd.read_sql("select * from transition_precursor_mapping", conn)
+ if not noDecoys:
+ featureTransition = pd.read_sql("select * from feature_transition", conn)
+ precursorTransition = pd.read_sql("select * from transition_precursor_mapping", conn)
+ else:
+ featureTransition = pd.read_sql("select * from feature_transition inner join transition on transition.id = feature_transition.transition_id where DECOY == 0", conn)
+ precursorTransition = pd.read_sql("select * from transition_precursor_mapping inner join transition on transition.id = transition_precursor_mapping.transition_id where DECOY=0", conn)
+
featureTable = pd.read_sql("select * from feature", conn)
numTransNoFeature = len(precursorTransition[~precursorTransition['PRECURSOR_ID'].isin(featureTable['PRECURSOR_ID'])])
expectedLength = numTransNoFeature + len(featureTransition)
- assert(expectedLength == len(parquet))
else:
if onlyFeatures: # expected length, length of feature table
- expectedLength = len(pd.read_sql("select * from feature", conn))
+ if noDecoys:
+ expectedLength = len(pd.read_sql("select * from feature inner join precursor on feature.precursor_id = precursor.id where decoy = 0", conn))
+ else:
+ expectedLength = len(pd.read_sql("select * from feature inner join precursor on precursor.id = feature.precursor_id", conn))
else:
# Expected length is number of features + number of precursors with no feature
- featureTable = pd.read_sql("select * from feature", conn)
- precTable = pd.read_sql("select * from precursor", conn)
+ if noDecoys:
+ featureTable = pd.read_sql("select * from feature inner join precursor on feature.precursor_id = precursor.id where decoy = 0", conn)
+ else:
+ featureTable = pd.read_sql("select * from feature", conn)
+
+ if noDecoys:
+ precTable = pd.read_sql("select * from precursor where decoy = 0", conn)
+ else:
+ precTable = pd.read_sql("select * from precursor", conn)
numPrecsNoFeature = len(precTable[~precTable['ID'].isin(featureTable['PRECURSOR_ID'])])
expectedLength = numPrecsNoFeature + len(featureTable)
- assert(expectedLength == len(parquet))
+ assert(expectedLength == len(parquet))
- ########### FEATURE LEVEL TESTS ########
+
+ ########### FEATURE LEVEL VALUE TESTS ########
# Tests that columns are equal across different sqlite3 tables to ensure joins occured correctly
+ # NaN values cannot be compared for equality, so drop any rows that contain a NaN
+ na_columns = ['PRECURSOR.LIBRARY_INTENSITY'] # this is a list of columns which expect to be NAN
+ parquet = parquet.drop(columns=na_columns).dropna()
+
+ assert(len(parquet) > 0) # assert that did not just drop everything (means that missed an na column)
+
if transitionLevel:
## check features and transitions joined properly for those all cases (including those with no features
## Way library was created precursor and transition m/z both are in the same 100s (e.g. if precursor m/z is 700 transition mz can be 701)
@@ -83,21 +110,26 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin
### Note: Current tests assume no na
parquet = parquet.dropna()
- proxy_feature_id = parquet['FEATURE_ID'].astype(str).apply(lambda x: x[0]).astype(int) # since id is complicated, dummy values created using a proxy id which is the first digit of the actual id
+ pseudo_feature_id = (parquet['FEATURE_ID'].astype(str).str.slice(start=0, stop=1)).astype(int)
pd.testing.assert_series_equal(parquet['FEATURE_MS1.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs)
pd.testing.assert_series_equal(parquet['FEATURE_MS2.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs)
pd.testing.assert_series_equal(parquet['FEATURE_MS1.EXP_IM'], parquet['FEATURE_MS2.EXP_IM'], **pd_testing_kwargs)
pd.testing.assert_series_equal(parquet['FEATURE_MS2.DELTA_IM'], parquet['FEATURE_MS1.DELTA_IM'], **pd_testing_kwargs)
- pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * (proxy_feature_id), **pd_testing_kwargs)
- print(parquet.columns)
+ pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * pseudo_feature_id, **pd_testing_kwargs)
pd.testing.assert_series_equal(parquet['SCORE_PEPTIDE.SCORE_GLOBAL'], parquet['PEPTIDE_ID'], **pd_testing_kwargs)
pd.testing.assert_series_equal(parquet['SCORE_PROTEIN.SCORE_GLOBAL'], parquet['PROTEIN_ID'], **pd_testing_kwargs)
+ # when decoys are excluded, check that no decoy rows remain
+ if noDecoys:
+ assert(parquet[parquet['DECOY'] == 1].shape[0] == 0)
+
+
+
############### TRANSTION LEVEL TESTS ################
if transitionLevel:
- pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * (proxy_feature_id), **pd_testing_kwargs)
+ pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * pseudo_feature_id, **pd_testing_kwargs)
def test_export_parquet_single_run(tmpdir):
_run_export_parquet_single_run(tmpdir, transitionLevel=False)
@@ -112,4 +144,10 @@ def test_export_parquet_single_run_onlyFeatures(tmpdir):
def test_export_parquet_single_run_transitionLevel_onlyFeatures(tmpdir):
- _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True)
\ No newline at end of file
+ _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True)
+
+def test_export_parquet_single_run_noDecoys(tmpdir):
+ _run_export_parquet_single_run(tmpdir, noDecoys=True)
+
+def test_export_parquet_single_run_transitionLevel_noDecoys(tmpdir):
+ _run_export_parquet_single_run(tmpdir, transitionLevel=True, noDecoys=True)
\ No newline at end of file