diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py index 24b723c2..1f89fa54 100644 --- a/pyprophet/export_parquet.py +++ b/pyprophet/export_parquet.py @@ -3,6 +3,7 @@ import pandas as pd from pyprophet.export import check_sqlite_table from duckdb_extensions import extension_importer +import re def getPeptideProteinScoreTable(conndb, level): if level == 'peptide': @@ -31,7 +32,7 @@ def getVarColumnNames(condb, tableName): # this method is only currently supported for combined output and not with ipf -def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): +def export_to_parquet(infile, outfile, transitionLevel=False, onlyFeatures=False, noDecoys=False): ''' Convert an OSW sqlite file to Parquet format @@ -66,6 +67,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): CREATE INDEX IF NOT EXISTS idx_protein_protein_id ON PROTEIN (ID); CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID); + CREATE INDEX IF NOT EXISTS idx_transition_id ON TRANSITION (ID); + CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_transition_id ON TRANSITION_PRECURSOR_MAPPING (TRANSITION_ID); + CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_precursor_id ON TRANSITION_PRECURSOR_MAPPING (PRECURSOR_ID); ''' if check_sqlite_table(con, "FEATURE_MS1"): @@ -200,19 +204,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): # create a list of all the columns columns_list = [col for c in columns.values() for col in c] + + # create a list of just aliases for groupby + pattern = re.compile(r"(.*)\sAS") + alias_list = [ pattern.search(col).group(1) for c in columns.values() for col in c] # join the list into a single string separated by a comma and a space columnsToSelect = ", ".join(columns_list) - - join_features = "LEFT JOIN" if onlyFeatures else "FULL JOIN" - - # First read feature data - # Feature Data - if not transitionLevel: - feature_query = f''' - SELECT {columnsToSelect} - FROM FEATURE - {join_features} PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID + aliasToSelect = ", ".join(alias_list) + + # For feature level group important transition level data into one row separated by ';' + featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else "" + featureLvlSuffix = f'GROUP BY {aliasToSelect}' if not transitionLevel else "" + + decoyExclude = "WHERE PRECURSOR.DECOY == 0" if noDecoys else "" + + if not onlyFeatures: + query = f''' + SELECT {columnsToSelect}, + {featureLvlPrefix} + FROM TRANSITION_PRECURSOR_MAPPING + LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID @@ -224,48 +239,30 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): {gene_table_joins} {pepJoin} {protJoin} + {decoyExclude} + {featureLvlSuffix} ''' - else: # is transition level - - # merge transition and precursor level data - if not onlyFeatures: - feature_query = f''' - SELECT {columnsToSelect} - FROM TRANSITION_PRECURSOR_MAPPING - LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - ''' - else: - feature_query = f''' - SELECT {columnsToSelect} - FROM FEATURE_TRANSITION - LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - ''' - condb.sql(feature_query).write_parquet(outfile) \ No newline at end of file + else: + query = f''' + SELECT {columnsToSelect}, + {featureLvlPrefix} + FROM FEATURE_TRANSITION + LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID + LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID + LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID + LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID + LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID + LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID + LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID + LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID + LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID + {gene_table_joins} + {pepJoin} + {protJoin} + {decoyExclude} + {featureLvlSuffix} + ''' + condb.sql(query).write_parquet(outfile) \ No newline at end of file diff --git a/pyprophet/main.py b/pyprophet/main.py index 24f97abe..eed24392 100644 --- a/pyprophet/main.py +++ b/pyprophet/main.py @@ -368,7 +368,8 @@ def export(infile, outfile, format, outcsv, transition_quantification, max_trans @click.option('--out', 'outfile', required=False, type=click.Path(exists=False), help='Output parquet file.') @click.option('--transitionLevel', 'transitionLevel', is_flag=True, help='Whether to export transition level data as well') @click.option('--onlyFeatures', 'onlyFeatures', is_flag=True, help='Only include precursors that have a corresponding feature') -def export_parquet(infile, outfile, transitionLevel, onlyFeatures): +@click.option('--noDecoys', 'noDecoys', is_flag=True, help='Do not include decoys in the exported data') +def export_parquet(infile, outfile, transitionLevel, onlyFeatures, noDecoys): """ Export all transition data to parquet file """ @@ -381,7 +382,7 @@ def export_parquet(infile, outfile, transitionLevel, onlyFeatures): if not overwrite: raise click.ClickException(f"Aborting: {outfile} already exists!") click.echo("Info: Parquet file will be written to {}".format(outfile)) - export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures) + export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures, noDecoys) # Export Compound TSV @cli.command() diff --git a/tests/Create_OSW_test.ipynb b/tests/Create_OSW_test.ipynb index 4a96d9df..0ba87a6f 100644 --- a/tests/Create_OSW_test.ipynb +++ b/tests/Create_OSW_test.ipynb @@ -8,6 +8,14 @@ "## **Create a fake .OSW file for testing**" ] }, + { + "cell_type": "markdown", + "id": "426d6f86-aaea-4372-b1fd-16234cdccb7f", + "metadata": {}, + "source": [ + "**Note:** Code cell 11 must be edited manually if new entries are added to the library" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -59,7 +67,7 @@ "metadata": {}, "outputs": [], "source": [ - "lib = pd.read_csv(\"fakeLib.tsv\", sep='\\t')" + "lib = pd.read_csv(\"data/fakeLib.tsv\", sep='\\t')" ] }, { @@ -102,6 +110,7 @@ " ProductCharge\n", " GeneName\n", " LibraryDriftTime\n", + " Decoy\n", " Annotation\n", " TransitionId\n", " \n", @@ -122,6 +131,7 @@ " 2\n", " Y\n", " 10\n", + " 0\n", " b1^2\n", " YYYYYYYYYYYK2_b1^2\n", " \n", @@ -140,6 +150,7 @@ " 2\n", " Y\n", " 10\n", + " 0\n", " y2^2\n", " YYYYYYYYYYYK2_y2^2\n", " \n", @@ -158,6 +169,7 @@ " 2\n", " Y\n", " 10\n", + " 0\n", " b3^2\n", " YYYYYYYYYYYK2_b3^2\n", " \n", @@ -176,6 +188,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " b1^2\n", " YYYYYR2_b1^2\n", " \n", @@ -194,6 +207,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " y2^2\n", " YYYYYR2_y2^2\n", " \n", @@ -212,6 +226,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " b3^2\n", " YYYYYR2_b3^2\n", " \n", @@ -230,6 +245,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " b1^2\n", " YYYYYR3_b1^2\n", " \n", @@ -248,6 +264,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " y2^2\n", " YYYYYR3_y2^2\n", " \n", @@ -266,6 +283,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " b1^2\n", " GGGGGGGGGGR4_b1^2\n", " \n", @@ -284,6 +302,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " y2^2\n", " GGGGGGGGGGR4_y2^2\n", " \n", @@ -302,6 +321,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " b3^2\n", " GGGGGGGGGGR4_b3^2\n", " \n", @@ -320,6 +340,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " y4^2\n", " GGGGGGGGGGR4_y4^2\n", " \n", @@ -338,6 +359,7 @@ " 2\n", " T\n", " 50\n", + " 0\n", " b1^2\n", " TTTTTTTR2_b1^2\n", " \n", @@ -356,6 +378,7 @@ " 2\n", " T\n", " 50\n", + " 0\n", " y2^2\n", " TTTTTTTR2_y2^2\n", " \n", @@ -374,6 +397,7 @@ " 2\n", " T\n", " 50\n", + " 0\n", " b3^2\n", " TTTTTTTR2_b3^2\n", " \n", @@ -392,6 +416,7 @@ " 2\n", " T\n", " 60\n", + " 0\n", " b1^2\n", " TTTTTTTTTTTTK2_b1^2\n", " \n", @@ -410,6 +435,7 @@ " 2\n", " T\n", " 60\n", + " 0\n", " y2^2\n", " TTTTTTTTTTTTK2_y2^2\n", " \n", @@ -428,6 +454,7 @@ " 3\n", " T\n", " 70\n", + " 0\n", " b1^3\n", " TTR3_b1^3\n", " \n", @@ -446,6 +473,7 @@ " 3\n", " T\n", " 70\n", + " 0\n", " y2^3\n", " TTR3_y2^3\n", " \n", @@ -464,9 +492,67 @@ " 3\n", " T\n", " 70\n", + " 0\n", " b3^3\n", " TTR3_b3^3\n", " \n", + " \n", + " 20\n", + " 800\n", + " 801\n", + " 808\n", + " 80\n", + " Decoy_ProtT\n", + " TTK\n", + " TTK\n", + " 3\n", + " b\n", + " 1\n", + " 3\n", + " Decoy_T\n", + " 80\n", + " 1\n", + " b1^3\n", + " TTK3_b1^3\n", + " \n", + " \n", + " 21\n", + " 800\n", + " 802\n", + " 808\n", + " 80\n", + " Decoy_ProtT\n", + " TTK\n", + " TTK\n", + " 3\n", + " y\n", + " 2\n", + " 3\n", + " Decoy_T\n", + " 80\n", + " 1\n", + " y2^3\n", + " TTK3_y2^3\n", + " \n", + " \n", + " 22\n", + " 800\n", + " 803\n", + " 808\n", + " 80\n", + " Decoy_ProtT\n", + " TTK\n", + " TTK\n", + " 3\n", + " b\n", + " 3\n", + " 3\n", + " Decoy_T\n", + " 80\n", + " 1\n", + " b3^3\n", + " TTK3_b3^3\n", + " \n", " \n", "\n", "" @@ -493,28 +579,34 @@ "17 700 701 107 70 \n", "18 700 702 207 70 \n", "19 700 703 307 70 \n", + "20 800 801 808 80 \n", + "21 800 802 808 80 \n", + "22 800 803 808 80 \n", "\n", - " ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge \\\n", - "0 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", - "1 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", - "2 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", - "3 ProtY YYYYYR YYYYYR 2 \n", - "4 ProtY YYYYYR YYYYYR 2 \n", - "5 ProtY YYYYYR YYYYYR 2 \n", - "6 ProtY YYYYYR YYYYYR 3 \n", - "7 ProtY YYYYYR YYYYYR 3 \n", - "8 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "9 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "10 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "11 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "12 ProtT TTTTTTTR TTTTTTTR 2 \n", - "13 ProtT TTTTTTTR TTTTTTTR 2 \n", - "14 ProtT TTTTTTTR TTTTTTTR 2 \n", - "15 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", - "16 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", - "17 ProtT TTR TTR 3 \n", - "18 ProtT TTR TTR 3 \n", - "19 ProtT TTR TTR 3 \n", + " ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge \\\n", + "0 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", + "1 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", + "2 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", + "3 ProtY YYYYYR YYYYYR 2 \n", + "4 ProtY YYYYYR YYYYYR 2 \n", + "5 ProtY YYYYYR YYYYYR 2 \n", + "6 ProtY YYYYYR YYYYYR 3 \n", + "7 ProtY YYYYYR YYYYYR 3 \n", + "8 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "9 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "10 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "11 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "12 ProtT TTTTTTTR TTTTTTTR 2 \n", + "13 ProtT TTTTTTTR TTTTTTTR 2 \n", + "14 ProtT TTTTTTTR TTTTTTTR 2 \n", + "15 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", + "16 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", + "17 ProtT TTR TTR 3 \n", + "18 ProtT TTR TTR 3 \n", + "19 ProtT TTR TTR 3 \n", + "20 Decoy_ProtT TTK TTK 3 \n", + "21 Decoy_ProtT TTK TTK 3 \n", + "22 Decoy_ProtT TTK TTK 3 \n", "\n", " FragmentType FragmentSeriesNumber ProductCharge GeneName \\\n", "0 b 1 2 Y \n", @@ -537,28 +629,34 @@ "17 b 1 3 T \n", "18 y 2 3 T \n", "19 b 3 3 T \n", + "20 b 1 3 Decoy_T \n", + "21 y 2 3 Decoy_T \n", + "22 b 3 3 Decoy_T \n", "\n", - " LibraryDriftTime Annotation TransitionId \n", - "0 10 b1^2 YYYYYYYYYYYK2_b1^2 \n", - "1 10 y2^2 YYYYYYYYYYYK2_y2^2 \n", - "2 10 b3^2 YYYYYYYYYYYK2_b3^2 \n", - "3 20 b1^2 YYYYYR2_b1^2 \n", - "4 20 y2^2 YYYYYR2_y2^2 \n", - "5 20 b3^2 YYYYYR2_b3^2 \n", - "6 20 b1^2 YYYYYR3_b1^2 \n", - "7 20 y2^2 YYYYYR3_y2^2 \n", - "8 40 b1^2 GGGGGGGGGGR4_b1^2 \n", - "9 40 y2^2 GGGGGGGGGGR4_y2^2 \n", - "10 40 b3^2 GGGGGGGGGGR4_b3^2 \n", - "11 40 y4^2 GGGGGGGGGGR4_y4^2 \n", - "12 50 b1^2 TTTTTTTR2_b1^2 \n", - "13 50 y2^2 TTTTTTTR2_y2^2 \n", - "14 50 b3^2 TTTTTTTR2_b3^2 \n", - "15 60 b1^2 TTTTTTTTTTTTK2_b1^2 \n", - "16 60 y2^2 TTTTTTTTTTTTK2_y2^2 \n", - "17 70 b1^3 TTR3_b1^3 \n", - "18 70 y2^3 TTR3_y2^3 \n", - "19 70 b3^3 TTR3_b3^3 " + " LibraryDriftTime Decoy Annotation TransitionId \n", + "0 10 0 b1^2 YYYYYYYYYYYK2_b1^2 \n", + "1 10 0 y2^2 YYYYYYYYYYYK2_y2^2 \n", + "2 10 0 b3^2 YYYYYYYYYYYK2_b3^2 \n", + "3 20 0 b1^2 YYYYYR2_b1^2 \n", + "4 20 0 y2^2 YYYYYR2_y2^2 \n", + "5 20 0 b3^2 YYYYYR2_b3^2 \n", + "6 20 0 b1^2 YYYYYR3_b1^2 \n", + "7 20 0 y2^2 YYYYYR3_y2^2 \n", + "8 40 0 b1^2 GGGGGGGGGGR4_b1^2 \n", + "9 40 0 y2^2 GGGGGGGGGGR4_y2^2 \n", + "10 40 0 b3^2 GGGGGGGGGGR4_b3^2 \n", + "11 40 0 y4^2 GGGGGGGGGGR4_y4^2 \n", + "12 50 0 b1^2 TTTTTTTR2_b1^2 \n", + "13 50 0 y2^2 TTTTTTTR2_y2^2 \n", + "14 50 0 b3^2 TTTTTTTR2_b3^2 \n", + "15 60 0 b1^2 TTTTTTTTTTTTK2_b1^2 \n", + "16 60 0 y2^2 TTTTTTTTTTTTK2_y2^2 \n", + "17 70 0 b1^3 TTR3_b1^3 \n", + "18 70 0 y2^3 TTR3_y2^3 \n", + "19 70 0 b3^3 TTR3_b3^3 \n", + "20 80 1 b1^3 TTK3_b1^3 \n", + "21 80 1 y2^3 TTK3_y2^3 \n", + "22 80 1 b3^3 TTK3_b3^3 " ] }, "execution_count": 3, @@ -579,7 +677,7 @@ "metadata": {}, "outputs": [], "source": [ - "lib.to_csv(\"fakeLib_appended.tsv\", sep='\\t', index=False)" + "lib.to_csv(\"data/fakeLib_appended.tsv\", sep='\\t', index=False)" ] }, { @@ -619,17 +717,39 @@ { "cell_type": "code", "execution_count": 5, + "id": "d8a28a22-9f82-4914-a123-e631509e6ab8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'data/dummyOSWScoredData.osw'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import shutil\n", + "shutil.copyfile(\"data/fakeLib.pqp\", \"data/dummyOSWScoredData.osw\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "1afe24b1-cf28-44e0-84e4-f9194428e18f", "metadata": {}, "outputs": [], "source": [ - "conn = sqlite3.connect(\"fakeLib.pqp\")\n", + "conn = sqlite3.connect(\"data/dummyOSWScoredData.osw\")\n", "cur = conn.cursor()" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "2b50c3b4-e436-4edb-8a80-cc3bf67e73d3", "metadata": {}, "outputs": [], @@ -654,17 +774,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "584dc6c7-9c7a-4ad6-91c2-12a0fffa4c08", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -676,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "98401ee9-5153-477b-928a-1c66cdbb8e5d", "metadata": {}, "outputs": [], @@ -686,7 +806,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 10, "id": "008b6e0f-28b8-4f22-89fe-54cdad6dca02", "metadata": { "tags": [] @@ -720,17 +840,17 @@ " \n", " \n", " 0\n", - " 6\n", + " 7\n", " 100.0\n", " \n", " \n", " 1\n", - " 4\n", + " 5\n", " 200.0\n", " \n", " \n", " 2\n", - " 5\n", + " 6\n", " 220.0\n", " \n", " \n", @@ -740,35 +860,41 @@ " \n", " \n", " 4\n", - " 2\n", + " 3\n", " 500.0\n", " \n", " \n", " 5\n", - " 3\n", + " 4\n", " 600.0\n", " \n", " \n", " 6\n", - " 1\n", + " 2\n", " 700.0\n", " \n", + " \n", + " 7\n", + " 1\n", + " 800.0\n", + " \n", " \n", "\n", "" ], "text/plain": [ " ID PRECURSOR_MZ\n", - "0 6 100.0\n", - "1 4 200.0\n", - "2 5 220.0\n", + "0 7 100.0\n", + "1 5 200.0\n", + "2 6 220.0\n", "3 0 400.0\n", - "4 2 500.0\n", - "5 3 600.0\n", - "6 1 700.0" + "4 3 500.0\n", + "5 4 600.0\n", + "6 2 700.0\n", + "7 1 800.0" ] }, - "execution_count": 56, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -779,17 +905,18 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "8f2b41fa-efb2-4f92-ad20-1737c16b3b8b", "metadata": {}, "outputs": [], "source": [ - "features = pd.DataFrame(np.column_stack([np.arange(0,7), np.array([5,5,3,4,0,1,2])]), columns=['id', 'precursor_id'])" + "## Note: The second numpy array must be edited manually if add new precursors to the library\n", + "features = pd.DataFrame(np.column_stack([np.arange(1,9), np.array([6,6,4,5,0,2,3,1])]), columns=['id', 'precursor_id'])" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "4b6472b3-a603-492c-b6da-bca4ae0c1ae9", "metadata": {}, "outputs": [], @@ -799,11 +926,36 @@ }, { "cell_type": "code", - "execution_count": 55, - "id": "23ea2e05-f030-41f4-aa98-1fc78da43303", - "metadata": { - "tags": [] - }, + "execution_count": 13, + "id": "a251b758-df07-4d44-ab84-f8f9dd6911af", + "metadata": {}, + "outputs": [], + "source": [ + "# feature table\n", + "features['run_id'] = np.array([1] * len(features), dtype=int)\n", + "features['exp_rt'] = features['PRECURSOR_MZ'] + round((features['id'] + 1) / 100, 2)\n", + "features['exp_im'] = features['exp_rt']\n", + "features['norm_rt'] = [ int(i) for i in features['exp_rt'] ]\n", + "features['delta_rt'] = [0.01] * len(features)\n", + "features['left_width'] = [5] * len(features)\n", + "features['right_width'] = [5] * len(features)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "feeb09d3-17c4-4dd3-9325-5af12d22842b", + "metadata": {}, + "outputs": [], + "source": [ + "features = features.drop(columns=['PRECURSOR_MZ'])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c9c07727-5351-4501-8f1f-23df31b0a4e9", + "metadata": {}, "outputs": [ { "data": { @@ -841,12 +993,12 @@ " \n", " \n", " 0\n", - " 0\n", - " 5\n", - " 5\n", " 1\n", - " 220.01\n", - " 220.01\n", + " 6\n", + " 6\n", + " 1\n", + " 220.02\n", + " 220.02\n", " 220\n", " 0.01\n", " 5\n", @@ -854,12 +1006,12 @@ " \n", " \n", " 1\n", + " 2\n", + " 6\n", + " 6\n", " 1\n", - " 5\n", - " 5\n", - " 1\n", - " 220.02\n", - " 220.02\n", + " 220.03\n", + " 220.03\n", " 220\n", " 0.01\n", " 5\n", @@ -867,12 +1019,12 @@ " \n", " \n", " 2\n", - " 2\n", - " 3\n", " 3\n", + " 4\n", + " 4\n", " 1\n", - " 600.03\n", - " 600.03\n", + " 600.04\n", + " 600.04\n", " 600\n", " 0.01\n", " 5\n", @@ -880,12 +1032,12 @@ " \n", " \n", " 3\n", - " 3\n", - " 4\n", " 4\n", + " 5\n", + " 5\n", " 1\n", - " 200.04\n", - " 200.04\n", + " 200.05\n", + " 200.05\n", " 200\n", " 0.01\n", " 5\n", @@ -893,12 +1045,12 @@ " \n", " \n", " 4\n", - " 4\n", + " 5\n", " 0\n", " 0\n", " 1\n", - " 400.05\n", - " 400.05\n", + " 400.06\n", + " 400.06\n", " 400\n", " 0.01\n", " 5\n", @@ -906,12 +1058,12 @@ " \n", " \n", " 5\n", - " 5\n", - " 1\n", - " 1\n", + " 6\n", + " 2\n", + " 2\n", " 1\n", - " 700.06\n", - " 700.06\n", + " 700.07\n", + " 700.07\n", " 700\n", " 0.01\n", " 5\n", @@ -919,30 +1071,44 @@ " \n", " \n", " 6\n", - " 6\n", - " 2\n", - " 2\n", + " 7\n", + " 3\n", + " 3\n", " 1\n", - " 500.07\n", - " 500.07\n", + " 500.08\n", + " 500.08\n", " 500\n", " 0.01\n", " 5\n", " 5\n", " \n", + " \n", + " 7\n", + " 8\n", + " 1\n", + " 1\n", + " 1\n", + " 800.09\n", + " 800.09\n", + " 800\n", + " 0.01\n", + " 5\n", + " 5\n", + " \n", " \n", "\n", "" ], "text/plain": [ " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n", - "0 0 5 5 1 220.01 220.01 220 0.01 \n", - "1 1 5 5 1 220.02 220.02 220 0.01 \n", - "2 2 3 3 1 600.03 600.03 600 0.01 \n", - "3 3 4 4 1 200.04 200.04 200 0.01 \n", - "4 4 0 0 1 400.05 400.05 400 0.01 \n", - "5 5 1 1 1 700.06 700.06 700 0.01 \n", - "6 6 2 2 1 500.07 500.07 500 0.01 \n", + "0 1 6 6 1 220.02 220.02 220 0.01 \n", + "1 2 6 6 1 220.03 220.03 220 0.01 \n", + "2 3 4 4 1 600.04 600.04 600 0.01 \n", + "3 4 5 5 1 200.05 200.05 200 0.01 \n", + "4 5 0 0 1 400.06 400.06 400 0.01 \n", + "5 6 2 2 1 700.07 700.07 700 0.01 \n", + "6 7 3 3 1 500.08 500.08 500 0.01 \n", + "7 8 1 1 1 800.09 800.09 800 0.01 \n", "\n", " left_width right_width \n", "0 5 5 \n", @@ -951,10 +1117,11 @@ "3 5 5 \n", "4 5 5 \n", "5 5 5 \n", - "6 5 5 " + "6 5 5 \n", + "7 5 5 " ] }, - "execution_count": 55, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -965,126 +1132,19 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "a5473d3c-793d-4d9d-b722-4aab486e0fa5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDPRECURSOR_MZ
06100.0
14200.0
25220.0
30400.0
42500.0
53600.0
61700.0
\n", - "
" - ], - "text/plain": [ - " ID PRECURSOR_MZ\n", - "0 6 100.0\n", - "1 4 200.0\n", - "2 5 220.0\n", - "3 0 400.0\n", - "4 2 500.0\n", - "5 3 600.0\n", - "6 1 700.0" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "precursor_table" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a251b758-df07-4d44-ab84-f8f9dd6911af", - "metadata": {}, - "outputs": [], - "source": [ - "# feature table\n", - "features['run_id'] = np.array([1] * len(features), dtype=int)\n", - "features['exp_rt'] = features['PRECURSOR_MZ'] + round((features['id'] + 1) / 100, 2)\n", - "features['exp_im'] = features['exp_rt']\n", - "features['norm_rt'] = [ int(i) for i in features['exp_rt'] ]\n", - "features['delta_rt'] = [0.01] * len(features)\n", - "features['left_width'] = [5] * len(features)\n", - "features['right_width'] = [5] * len(features)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "feeb09d3-17c4-4dd3-9325-5af12d22842b", + "execution_count": 16, + "id": "cbb86464-d1d6-4de9-92b0-a48a787a6895", "metadata": {}, "outputs": [], "source": [ - "features = features.drop(columns=['PRECURSOR_MZ'])" + "# make id a long string so more realistic\n", + "features['id'] = (features['id'].astype(str) * 19).astype(int)" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "c9c07727-5351-4501-8f1f-23df31b0a4e9", + "execution_count": 17, + "id": "70d20530-d775-4b3f-896d-76a345d42e5e", "metadata": {}, "outputs": [ { @@ -1123,12 +1183,12 @@ " \n", " \n", " 0\n", - " 0\n", - " 5\n", - " 5\n", + " 1111111111111111111\n", + " 6\n", + " 6\n", " 1\n", - " 220.01\n", - " 220.01\n", + " 220.02\n", + " 220.02\n", " 220\n", " 0.01\n", " 5\n", @@ -1136,12 +1196,12 @@ " \n", " \n", " 1\n", + " 2222222222222222222\n", + " 6\n", + " 6\n", " 1\n", - " 5\n", - " 5\n", - " 1\n", - " 220.02\n", - " 220.02\n", + " 220.03\n", + " 220.03\n", " 220\n", " 0.01\n", " 5\n", @@ -1149,12 +1209,12 @@ " \n", " \n", " 2\n", - " 2\n", - " 3\n", - " 3\n", + " 3333333333333333333\n", + " 4\n", + " 4\n", " 1\n", - " 600.03\n", - " 600.03\n", + " 600.04\n", + " 600.04\n", " 600\n", " 0.01\n", " 5\n", @@ -1162,12 +1222,12 @@ " \n", " \n", " 3\n", - " 3\n", - " 4\n", - " 4\n", + " 4444444444444444444\n", + " 5\n", + " 5\n", " 1\n", - " 200.04\n", - " 200.04\n", + " 200.05\n", + " 200.05\n", " 200\n", " 0.01\n", " 5\n", @@ -1175,12 +1235,12 @@ " \n", " \n", " 4\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 0\n", " 1\n", - " 400.05\n", - " 400.05\n", + " 400.06\n", + " 400.06\n", " 400\n", " 0.01\n", " 5\n", @@ -1188,12 +1248,12 @@ " \n", " \n", " 5\n", - " 5\n", - " 1\n", - " 1\n", + " 6666666666666666666\n", + " 2\n", + " 2\n", " 1\n", - " 700.06\n", - " 700.06\n", + " 700.07\n", + " 700.07\n", " 700\n", " 0.01\n", " 5\n", @@ -1201,42 +1261,57 @@ " \n", " \n", " 6\n", - " 6\n", - " 2\n", - " 2\n", + " 7777777777777777777\n", + " 3\n", + " 3\n", " 1\n", - " 500.07\n", - " 500.07\n", + " 500.08\n", + " 500.08\n", " 500\n", " 0.01\n", " 5\n", " 5\n", " \n", + " \n", + " 7\n", + " 8888888888888888888\n", + " 1\n", + " 1\n", + " 1\n", + " 800.09\n", + " 800.09\n", + " 800\n", + " 0.01\n", + " 5\n", + " 5\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n", - "0 0 5 5 1 220.01 220.01 220 0.01 \n", - "1 1 5 5 1 220.02 220.02 220 0.01 \n", - "2 2 3 3 1 600.03 600.03 600 0.01 \n", - "3 3 4 4 1 200.04 200.04 200 0.01 \n", - "4 4 0 0 1 400.05 400.05 400 0.01 \n", - "5 5 1 1 1 700.06 700.06 700 0.01 \n", - "6 6 2 2 1 500.07 500.07 500 0.01 \n", + " id precursor_id ID run_id exp_rt exp_im norm_rt \\\n", + "0 1111111111111111111 6 6 1 220.02 220.02 220 \n", + "1 2222222222222222222 6 6 1 220.03 220.03 220 \n", + "2 3333333333333333333 4 4 1 600.04 600.04 600 \n", + "3 4444444444444444444 5 5 1 200.05 200.05 200 \n", + "4 5555555555555555555 0 0 1 400.06 400.06 400 \n", + "5 6666666666666666666 2 2 1 700.07 700.07 700 \n", + "6 7777777777777777777 3 3 1 500.08 500.08 500 \n", + "7 8888888888888888888 1 1 1 800.09 800.09 800 \n", "\n", - " left_width right_width \n", - "0 5 5 \n", - "1 5 5 \n", - "2 5 5 \n", - "3 5 5 \n", - "4 5 5 \n", - "5 5 5 \n", - "6 5 5 " + " delta_rt left_width right_width \n", + "0 0.01 5 5 \n", + "1 0.01 5 5 \n", + "2 0.01 5 5 \n", + "3 0.01 5 5 \n", + "4 0.01 5 5 \n", + "5 0.01 5 5 \n", + "6 0.01 5 5 \n", + "7 0.01 5 5 " ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1247,7 +1322,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "9b2a1cfc-dcca-45f3-9aa4-e7f75382de3c", "metadata": {}, "outputs": [], @@ -1262,7 +1337,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "4806f254-1c29-4aa5-9822-6cb8c6ea730c", "metadata": {}, "outputs": [], @@ -1288,17 +1363,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "8d03f3ca-a4b0-48e9-bd7f-3c1c7b8d6874", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1310,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "aae782e1-c9f9-4c2c-84f7-a7741e385a5e", "metadata": {}, "outputs": [ @@ -1358,10 +1433,10 @@ " \n", " \n", " 0\n", - " 0\n", - " 220010.0\n", - " 5\n", - " 220.01\n", + " 1111111111111111111\n", + " 220020.0\n", + " 6\n", + " 220.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1379,10 +1454,10 @@ " \n", " \n", " 1\n", - " 1\n", - " 220020.0\n", - " 5\n", - " 220.02\n", + " 2222222222222222222\n", + " 220030.0\n", + " 6\n", + " 220.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1400,10 +1475,10 @@ " \n", " \n", " 2\n", - " 2\n", - " 600030.0\n", - " 3\n", - " 600.03\n", + " 3333333333333333333\n", + " 600040.0\n", + " 4\n", + " 600.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1421,10 +1496,10 @@ " \n", " \n", " 3\n", - " 3\n", - " 200040.0\n", - " 4\n", - " 200.04\n", + " 4444444444444444444\n", + " 200050.0\n", + " 5\n", + " 200.05\n", " 0.01\n", " 1\n", " 1\n", @@ -1442,10 +1517,10 @@ " \n", " \n", " 4\n", - " 4\n", - " 400050.0\n", + " 5555555555555555555\n", + " 400060.0\n", " 0\n", - " 400.05\n", + " 400.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1463,10 +1538,10 @@ " \n", " \n", " 5\n", - " 5\n", - " 700060.0\n", - " 1\n", - " 700.06\n", + " 6666666666666666666\n", + " 700070.0\n", + " 2\n", + " 700.07\n", " 0.01\n", " 1\n", " 1\n", @@ -1484,10 +1559,31 @@ " \n", " \n", " 6\n", - " 6\n", - " 500070.0\n", - " 2\n", - " 500.07\n", + " 7777777777777777777\n", + " 500080.0\n", + " 3\n", + " 500.08\n", + " 0.01\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 7\n", + " 8888888888888888888\n", + " 800090.0\n", + " 1\n", + " 800.09\n", " 0.01\n", " 1\n", " 1\n", @@ -1508,14 +1604,15 @@ "" ], "text/plain": [ - " feature_id area_intensity apex_intensity exp_im delta_im \\\n", - "0 0 220010.0 5 220.01 0.01 \n", - "1 1 220020.0 5 220.02 0.01 \n", - "2 2 600030.0 3 600.03 0.01 \n", - "3 3 200040.0 4 200.04 0.01 \n", - "4 4 400050.0 0 400.05 0.01 \n", - "5 5 700060.0 1 700.06 0.01 \n", - "6 6 500070.0 2 500.07 0.01 \n", + " feature_id area_intensity apex_intensity exp_im delta_im \\\n", + "0 1111111111111111111 220020.0 6 220.02 0.01 \n", + "1 2222222222222222222 220030.0 6 220.03 0.01 \n", + "2 3333333333333333333 600040.0 4 600.04 0.01 \n", + "3 4444444444444444444 200050.0 5 200.05 0.01 \n", + "4 5555555555555555555 400060.0 0 400.06 0.01 \n", + "5 6666666666666666666 700070.0 2 700.07 0.01 \n", + "6 7777777777777777777 500080.0 3 500.08 0.01 \n", + "7 8888888888888888888 800090.0 1 800.09 0.01 \n", "\n", " var_massdev_score var_mi_score var_mi_contrast_score \\\n", "0 1 1 1 \n", @@ -1525,6 +1622,7 @@ "4 1 1 1 \n", "5 1 1 1 \n", "6 1 1 1 \n", + "7 1 1 1 \n", "\n", " var_mi_combined_score var_isotope_correlation_score \\\n", "0 1 1 \n", @@ -1534,6 +1632,7 @@ "4 1 1 \n", "5 1 1 \n", "6 1 1 \n", + "7 1 1 \n", "\n", " var_isotope_overlap_score var_im_ms1_delta_score var_xcorr_coelution \\\n", "0 1 1 1 \n", @@ -1543,6 +1642,7 @@ "4 1 1 1 \n", "5 1 1 1 \n", "6 1 1 1 \n", + "7 1 1 1 \n", "\n", " var_xcorr_coelution_contrast var_xcorr_coelution_combined \\\n", "0 1 1 \n", @@ -1552,6 +1652,7 @@ "4 1 1 \n", "5 1 1 \n", "6 1 1 \n", + "7 1 1 \n", "\n", " var_xcorr_shape var_xcorr_shape_contrast var_xcorr_shape_combined \n", "0 1 1 1 \n", @@ -1560,41 +1661,43 @@ "3 1 1 1 \n", "4 1 1 1 \n", "5 1 1 1 \n", - "6 1 1 1 " + "6 1 1 1 \n", + "7 1 1 1 " ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "length = 8\n", "feature_ms1 = features[['id']].copy().rename(columns={'id':'feature_id'})\n", "\n", "feature_ms1['area_intensity'] = features['exp_rt'] * 1000 \n", "feature_ms1['apex_intensity'] = features['precursor_id']\n", "feature_ms1['exp_im'] = features['exp_im']\n", "feature_ms1['delta_im'] = features['delta_rt']\n", - "feature_ms1['var_massdev_score'] = [1] *7\n", - "feature_ms1['var_mi_score'] = [1] *7\n", - "feature_ms1['var_mi_contrast_score'] = [1] *7\n", - "feature_ms1['var_mi_combined_score'] = [1] *7\n", - "feature_ms1['var_isotope_correlation_score'] = [1] *7\n", - "feature_ms1['var_isotope_overlap_score'] = [1] *7\n", - "feature_ms1['var_im_ms1_delta_score'] = [1] *7\n", - "feature_ms1['var_xcorr_coelution'] = [1] *7\n", - "feature_ms1['var_xcorr_coelution_contrast'] = [1] *7\n", - "feature_ms1['var_xcorr_coelution_combined'] = [1] *7\n", - "feature_ms1['var_xcorr_shape'] = [1] *7\n", - "feature_ms1['var_xcorr_shape_contrast'] = [1] *7\n", - "feature_ms1['var_xcorr_shape_combined'] = [1] *7\n", + "feature_ms1['var_massdev_score'] = [1] * length\n", + "feature_ms1['var_mi_score'] = [1] * length\n", + "feature_ms1['var_mi_contrast_score'] = [1] * length\n", + "feature_ms1['var_mi_combined_score'] = [1] * length\n", + "feature_ms1['var_isotope_correlation_score'] = [1] * length\n", + "feature_ms1['var_isotope_overlap_score'] = [1] * length\n", + "feature_ms1['var_im_ms1_delta_score'] = [1] * length\n", + "feature_ms1['var_xcorr_coelution'] = [1] * length\n", + "feature_ms1['var_xcorr_coelution_contrast'] = [1] * length\n", + "feature_ms1['var_xcorr_coelution_combined'] = [1] * length\n", + "feature_ms1['var_xcorr_shape'] = [1] *length\n", + "feature_ms1['var_xcorr_shape_contrast'] = [1] * length\n", + "feature_ms1['var_xcorr_shape_combined'] = [1] * length\n", "\n", "feature_ms1" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "8480c6a7-23ab-4a22-9da9-998cbc8606ac", "metadata": {}, "outputs": [], @@ -1609,7 +1712,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "98ff8eac-5acd-435a-b292-fe64d590ea51", "metadata": {}, "outputs": [], @@ -1635,17 +1738,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "7691c149-7f67-48de-9bff-2856a44d40eb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1657,7 +1760,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "863fcd87-d051-4bc9-b88c-8535cbc90c4a", "metadata": { "scrolled": true, @@ -1719,7 +1822,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "id": "30f29e9b-b345-4a02-ba63-74dabf69555b", "metadata": {}, "outputs": [ @@ -1770,11 +1873,11 @@ " \n", " \n", " 0\n", - " 0\n", - " 220010.0\n", - " 220010.0\n", - " 5\n", - " 220.01\n", + " 1111111111111111111\n", + " 220020.0\n", + " 220020.0\n", + " 6\n", + " 220.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1794,11 +1897,11 @@ " \n", " \n", " 1\n", - " 1\n", - " 220020.0\n", - " 220020.0\n", - " 5\n", - " 220.02\n", + " 2222222222222222222\n", + " 220030.0\n", + " 220030.0\n", + " 6\n", + " 220.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1818,11 +1921,11 @@ " \n", " \n", " 2\n", - " 2\n", - " 600030.0\n", - " 600030.0\n", - " 3\n", - " 600.03\n", + " 3333333333333333333\n", + " 600040.0\n", + " 600040.0\n", + " 4\n", + " 600.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1842,11 +1945,11 @@ " \n", " \n", " 3\n", - " 3\n", - " 200040.0\n", - " 200040.0\n", - " 4\n", - " 200.04\n", + " 4444444444444444444\n", + " 200050.0\n", + " 200050.0\n", + " 5\n", + " 200.05\n", " 0.01\n", " 1\n", " 1\n", @@ -1866,11 +1969,11 @@ " \n", " \n", " 4\n", - " 4\n", - " 400050.0\n", - " 400050.0\n", + " 5555555555555555555\n", + " 400060.0\n", + " 400060.0\n", " 0\n", - " 400.05\n", + " 400.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1890,11 +1993,11 @@ " \n", " \n", " 5\n", - " 5\n", - " 700060.0\n", - " 700060.0\n", - " 1\n", - " 700.06\n", + " 6666666666666666666\n", + " 700070.0\n", + " 700070.0\n", + " 2\n", + " 700.07\n", " 0.01\n", " 1\n", " 1\n", @@ -1914,11 +2017,35 @@ " \n", " \n", " 6\n", - " 6\n", - " 500070.0\n", - " 500070.0\n", - " 2\n", - " 500.07\n", + " 7777777777777777777\n", + " 500080.0\n", + " 500080.0\n", + " 3\n", + " 500.08\n", + " 0.01\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " ...\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 7\n", + " 8888888888888888888\n", + " 800090.0\n", + " 800090.0\n", + " 1\n", + " 800.09\n", " 0.01\n", " 1\n", " 1\n", @@ -1938,27 +2065,29 @@ " \n", " \n", "\n", - "

7 rows × 41 columns

\n", + "

8 rows × 41 columns

\n", "" ], "text/plain": [ - " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY EXP_IM \\\n", - "0 0 220010.0 220010.0 5 220.01 \n", - "1 1 220020.0 220020.0 5 220.02 \n", - "2 2 600030.0 600030.0 3 600.03 \n", - "3 3 200040.0 200040.0 4 200.04 \n", - "4 4 400050.0 400050.0 0 400.05 \n", - "5 5 700060.0 700060.0 1 700.06 \n", - "6 6 500070.0 500070.0 2 500.07 \n", + " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY \\\n", + "0 1111111111111111111 220020.0 220020.0 6 \n", + "1 2222222222222222222 220030.0 220030.0 6 \n", + "2 3333333333333333333 600040.0 600040.0 4 \n", + "3 4444444444444444444 200050.0 200050.0 5 \n", + "4 5555555555555555555 400060.0 400060.0 0 \n", + "5 6666666666666666666 700070.0 700070.0 2 \n", + "6 7777777777777777777 500080.0 500080.0 3 \n", + "7 8888888888888888888 800090.0 800090.0 1 \n", "\n", - " DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n", - "0 0.01 1 1 1 \n", - "1 0.01 1 1 1 \n", - "2 0.01 1 1 1 \n", - "3 0.01 1 1 1 \n", - "4 0.01 1 1 1 \n", - "5 0.01 1 1 1 \n", - "6 0.01 1 1 1 \n", + " EXP_IM DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n", + "0 220.02 0.01 1 1 1 \n", + "1 220.03 0.01 1 1 1 \n", + "2 600.04 0.01 1 1 1 \n", + "3 200.05 0.01 1 1 1 \n", + "4 400.06 0.01 1 1 1 \n", + "5 700.07 0.01 1 1 1 \n", + "6 500.08 0.01 1 1 1 \n", + "7 800.09 0.01 1 1 1 \n", "\n", " VAR_INTENSITY_SCORE ... VAR_ELUTION_MODEL_FIT_SCORE VAR_IM_XCORR_SHAPE \\\n", "0 1 ... 1 1 \n", @@ -1968,6 +2097,7 @@ "4 1 ... 1 1 \n", "5 1 ... 1 1 \n", "6 1 ... 1 1 \n", + "7 1 ... 1 1 \n", "\n", " VAR_IM_XCORR_COELUTION VAR_IM_DELTA_SCORE VAR_SONAR_LAG VAR_SONAR_SHAPE \\\n", "0 1 1 1 1 \n", @@ -1977,6 +2107,7 @@ "4 1 1 1 1 \n", "5 1 1 1 1 \n", "6 1 1 1 1 \n", + "7 1 1 1 1 \n", "\n", " VAR_SONAR_LOG_SN VAR_SONAR_LOG_DIFF VAR_SONAR_LOG_TREND VAR_SONAR_RSQ \n", "0 1 1 1 1 \n", @@ -1986,63 +2117,65 @@ "4 1 1 1 1 \n", "5 1 1 1 1 \n", "6 1 1 1 1 \n", + "7 1 1 1 1 \n", "\n", - "[7 rows x 41 columns]" + "[8 rows x 41 columns]" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "length = 8\n", "feature_ms2 = feature_ms1[['feature_id']].copy()\n", "feature_ms2['AREA_INTENSITY'] = feature_ms1['area_intensity']\n", "feature_ms2['TOTAL_AREA_INTENSITY'] = feature_ms2['AREA_INTENSITY']\n", "feature_ms2['APEX_INTENSITY'] = feature_ms1['apex_intensity']\n", "feature_ms2['EXP_IM'] = feature_ms1['exp_im']\n", "feature_ms2['DELTA_IM'] = feature_ms1['delta_im']\n", - "feature_ms2['TOTAL_MI'] = [1] *7 \n", - "feature_ms2['VAR_BSERIES_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_DOTPROD_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_INTENSITY_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_CORR'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_DOTPROD'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_MANHATTAN'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_RMSD'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_ROOTMEANSQUARE'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_SANGLE'] = [1] *7 \n", - "feature_ms2['VAR_LOG_SN_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MANHATTAN_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MASSDEV_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MASSDEV_SCORE_WEIGHTED'] = [1] *7 \n", - "feature_ms2['VAR_MI_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MI_WEIGHTED_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MI_RATIO_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_NORM_RT_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_COELUTION'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_COELUTION_WEIGHTED'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_SHAPE'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_SHAPE_WEIGHTED'] = [1] *7 \n", - "feature_ms2['VAR_YSERIES_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_ELUTION_MODEL_FIT_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_IM_XCORR_SHAPE'] = [1] *7 \n", - "feature_ms2['VAR_IM_XCORR_COELUTION'] = [1] *7 \n", - "feature_ms2['VAR_IM_DELTA_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LAG'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_SHAPE'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LOG_SN'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LOG_DIFF'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LOG_TREND'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_RSQ'] = [1] *7 \n", + "feature_ms2['TOTAL_MI'] = [1] * length \n", + "feature_ms2['VAR_BSERIES_SCORE'] = [1] * length \n", + "feature_ms2['VAR_DOTPROD_SCORE'] = [1] * length \n", + "feature_ms2['VAR_INTENSITY_SCORE'] = [1] * length \n", + "feature_ms2['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * length \n", + "feature_ms2['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_CORR'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_DOTPROD'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_MANHATTAN'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_RMSD'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_ROOTMEANSQUARE'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_SANGLE'] = [1] * length \n", + "feature_ms2['VAR_LOG_SN_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MANHATTAN_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MASSDEV_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MASSDEV_SCORE_WEIGHTED'] = [1] * length \n", + "feature_ms2['VAR_MI_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MI_WEIGHTED_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MI_RATIO_SCORE'] = [1] * length \n", + "feature_ms2['VAR_NORM_RT_SCORE'] = [1] * length \n", + "feature_ms2['VAR_XCORR_COELUTION'] = [1] * length \n", + "feature_ms2['VAR_XCORR_COELUTION_WEIGHTED'] = [1] * length \n", + "feature_ms2['VAR_XCORR_SHAPE'] = [1] * length \n", + "feature_ms2['VAR_XCORR_SHAPE_WEIGHTED'] = [1] * length \n", + "feature_ms2['VAR_YSERIES_SCORE'] = [1] * length \n", + "feature_ms2['VAR_ELUTION_MODEL_FIT_SCORE'] = [1] * length \n", + "feature_ms2['VAR_IM_XCORR_SHAPE'] = [1] * length \n", + "feature_ms2['VAR_IM_XCORR_COELUTION'] = [1] * length \n", + "feature_ms2['VAR_IM_DELTA_SCORE'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LAG'] = [1] * length \n", + "feature_ms2['VAR_SONAR_SHAPE'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LOG_SN'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LOG_DIFF'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LOG_TREND'] = [1] * length \n", + "feature_ms2['VAR_SONAR_RSQ'] = [1] * length \n", "feature_ms2" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "id": "d93163c0-20a1-4d98-86de-71c6d265d418", "metadata": {}, "outputs": [], @@ -2057,7 +2190,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "db1cbc3f-e463-43a6-895a-979c7aafe393", "metadata": {}, "outputs": [], @@ -2083,17 +2216,17 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "id": "bca8bfba-86e9-497c-ae94-4c0f679b45f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -2105,7 +2238,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "id": "4678179b-aea7-460f-ad71-d157a1e3ce38", "metadata": {}, "outputs": [], @@ -2115,7 +2248,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "id": "231cf0c6-01ac-4061-b1a9-d68404f793b3", "metadata": {}, "outputs": [], @@ -2125,7 +2258,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "id": "7f730c1c-73a3-4ce1-a2b3-35b064edd558", "metadata": {}, "outputs": [], @@ -2136,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 33, "id": "46a4405d-c5c9-48a6-aee5-abb2a3b9b047", "metadata": {}, "outputs": [ @@ -2172,88 +2305,88 @@ " \n", " \n", " 0\n", - " 0\n", - " 5\n", + " 1111111111111111111\n", + " 6\n", + " 6\n", " 6\n", - " 5\n", " YYYYYR3_b1^2\n", " 221.0\n", " \n", " \n", " 1\n", - " 1\n", - " 5\n", + " 1111111111111111111\n", " 6\n", - " 5\n", - " YYYYYR3_b1^2\n", - " 221.0\n", - " \n", - " \n", - " 2\n", - " 0\n", - " 5\n", " 7\n", - " 5\n", + " 6\n", " YYYYYR3_y2^2\n", " 222.0\n", " \n", " \n", + " 2\n", + " 2222222222222222222\n", + " 6\n", + " 6\n", + " 6\n", + " YYYYYR3_b1^2\n", + " 221.0\n", + " \n", + " \n", " 3\n", - " 1\n", - " 5\n", + " 2222222222222222222\n", + " 6\n", " 7\n", - " 5\n", + " 6\n", " YYYYYR3_y2^2\n", " 222.0\n", " \n", " \n", " 4\n", - " 2\n", - " 3\n", + " 3333333333333333333\n", + " 4\n", " 15\n", - " 3\n", + " 4\n", " TTTTTTTTTTTTK2_b1^2\n", " 601.0\n", " \n", " \n", " 5\n", - " 2\n", - " 3\n", + " 3333333333333333333\n", + " 4\n", " 16\n", - " 3\n", + " 4\n", " TTTTTTTTTTTTK2_y2^2\n", " 602.0\n", " \n", " \n", " 6\n", + " 4444444444444444444\n", + " 5\n", " 3\n", - " 4\n", - " 3\n", - " 4\n", + " 5\n", " YYYYYR2_b1^2\n", " 201.0\n", " \n", " \n", " 7\n", - " 3\n", - " 4\n", - " 4\n", + " 4444444444444444444\n", + " 5\n", " 4\n", + " 5\n", " YYYYYR2_y2^2\n", " 202.0\n", " \n", " \n", " 8\n", - " 3\n", - " 4\n", + " 4444444444444444444\n", + " 5\n", + " 5\n", " 5\n", - " 4\n", " YYYYYR2_b3^2\n", " 203.0\n", " \n", " \n", " 9\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 8\n", " 0\n", @@ -2262,7 +2395,7 @@ " \n", " \n", " 10\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 9\n", " 0\n", @@ -2271,7 +2404,7 @@ " \n", " \n", " 11\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 10\n", " 0\n", @@ -2280,7 +2413,7 @@ " \n", " \n", " 12\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 11\n", " 0\n", @@ -2289,88 +2422,118 @@ " \n", " \n", " 13\n", - " 5\n", - " 1\n", + " 6666666666666666666\n", + " 2\n", " 17\n", - " 1\n", + " 2\n", " TTR3_b1^3\n", " 701.0\n", " \n", " \n", " 14\n", - " 5\n", - " 1\n", + " 6666666666666666666\n", + " 2\n", " 18\n", - " 1\n", + " 2\n", " TTR3_y2^3\n", " 702.0\n", " \n", " \n", " 15\n", - " 5\n", - " 1\n", + " 6666666666666666666\n", + " 2\n", " 19\n", - " 1\n", + " 2\n", " TTR3_b3^3\n", " 703.0\n", " \n", " \n", " 16\n", - " 6\n", - " 2\n", + " 7777777777777777777\n", + " 3\n", " 12\n", - " 2\n", + " 3\n", " TTTTTTTR2_b1^2\n", " 501.0\n", " \n", " \n", " 17\n", - " 6\n", - " 2\n", + " 7777777777777777777\n", + " 3\n", " 13\n", - " 2\n", + " 3\n", " TTTTTTTR2_y2^2\n", " 502.0\n", " \n", " \n", " 18\n", - " 6\n", - " 2\n", + " 7777777777777777777\n", + " 3\n", " 14\n", - " 2\n", + " 3\n", " TTTTTTTR2_b3^2\n", " 503.0\n", " \n", + " \n", + " 19\n", + " 8888888888888888888\n", + " 1\n", + " 20\n", + " 1\n", + " TTK3_b1^3\n", + " 801.0\n", + " \n", + " \n", + " 20\n", + " 8888888888888888888\n", + " 1\n", + " 21\n", + " 1\n", + " TTK3_y2^3\n", + " 802.0\n", + " \n", + " \n", + " 21\n", + " 8888888888888888888\n", + " 1\n", + " 22\n", + " 1\n", + " TTK3_b3^3\n", + " 803.0\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n", - "0 0 5 6 5 \n", - "1 1 5 6 5 \n", - "2 0 5 7 5 \n", - "3 1 5 7 5 \n", - "4 2 3 15 3 \n", - "5 2 3 16 3 \n", - "6 3 4 3 4 \n", - "7 3 4 4 4 \n", - "8 3 4 5 4 \n", - "9 4 0 8 0 \n", - "10 4 0 9 0 \n", - "11 4 0 10 0 \n", - "12 4 0 11 0 \n", - "13 5 1 17 1 \n", - "14 5 1 18 1 \n", - "15 5 1 19 1 \n", - "16 6 2 12 2 \n", - "17 6 2 13 2 \n", - "18 6 2 14 2 \n", + " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n", + "0 1111111111111111111 6 6 6 \n", + "1 1111111111111111111 6 7 6 \n", + "2 2222222222222222222 6 6 6 \n", + "3 2222222222222222222 6 7 6 \n", + "4 3333333333333333333 4 15 4 \n", + "5 3333333333333333333 4 16 4 \n", + "6 4444444444444444444 5 3 5 \n", + "7 4444444444444444444 5 4 5 \n", + "8 4444444444444444444 5 5 5 \n", + "9 5555555555555555555 0 8 0 \n", + "10 5555555555555555555 0 9 0 \n", + "11 5555555555555555555 0 10 0 \n", + "12 5555555555555555555 0 11 0 \n", + "13 6666666666666666666 2 17 2 \n", + "14 6666666666666666666 2 18 2 \n", + "15 6666666666666666666 2 19 2 \n", + "16 7777777777777777777 3 12 3 \n", + "17 7777777777777777777 3 13 3 \n", + "18 7777777777777777777 3 14 3 \n", + "19 8888888888888888888 1 20 1 \n", + "20 8888888888888888888 1 21 1 \n", + "21 8888888888888888888 1 22 1 \n", "\n", " TRAML_ID PRODUCT_MZ \n", "0 YYYYYR3_b1^2 221.0 \n", - "1 YYYYYR3_b1^2 221.0 \n", - "2 YYYYYR3_y2^2 222.0 \n", + "1 YYYYYR3_y2^2 222.0 \n", + "2 YYYYYR3_b1^2 221.0 \n", "3 YYYYYR3_y2^2 222.0 \n", "4 TTTTTTTTTTTTK2_b1^2 601.0 \n", "5 TTTTTTTTTTTTK2_y2^2 602.0 \n", @@ -2386,10 +2549,13 @@ "15 TTR3_b3^3 703.0 \n", "16 TTTTTTTR2_b1^2 501.0 \n", "17 TTTTTTTR2_y2^2 502.0 \n", - "18 TTTTTTTR2_b3^2 503.0 " + "18 TTTTTTTR2_b3^2 503.0 \n", + "19 TTK3_b1^3 801.0 \n", + "20 TTK3_y2^3 802.0 \n", + "21 TTK3_b3^3 803.0 " ] }, - "execution_count": 30, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2400,7 +2566,18 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, + "id": "f0b98de2-dead-43c2-85ff-40706699a9ee", + "metadata": {}, + "outputs": [], + "source": [ + "# calculations with new feature id would result in overflow so just take first digit for calculations\n", + "psuedo_feature_id = (feature_transition['feature_id'].astype(str).str.slice(start=0, stop=1)).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "id": "cf55c2d6-fa9f-433a-bb8f-931984f48bbe", "metadata": {}, "outputs": [ @@ -2447,7 +2624,7 @@ " \n", " \n", " 0\n", - " 0\n", + " 1111111111111111111\n", " 6\n", " 221.0\n", " 221.0\n", @@ -2467,10 +2644,10 @@ " \n", " \n", " 1\n", - " 1\n", - " 6\n", - " 442.0\n", - " 442.0\n", + " 1111111111111111111\n", + " 7\n", + " 222.0\n", + " 222.0\n", " 1\n", " 1\n", " 1\n", @@ -2487,10 +2664,10 @@ " \n", " \n", " 2\n", - " 0\n", - " 7\n", - " 222.0\n", - " 222.0\n", + " 2222222222222222222\n", + " 6\n", + " 442.0\n", + " 442.0\n", " 1\n", " 1\n", " 1\n", @@ -2507,7 +2684,7 @@ " \n", " \n", " 3\n", - " 1\n", + " 2222222222222222222\n", " 7\n", " 444.0\n", " 444.0\n", @@ -2527,7 +2704,7 @@ " \n", " \n", " 4\n", - " 2\n", + " 3333333333333333333\n", " 15\n", " 1803.0\n", " 1803.0\n", @@ -2547,7 +2724,7 @@ " \n", " \n", " 5\n", - " 2\n", + " 3333333333333333333\n", " 16\n", " 1806.0\n", " 1806.0\n", @@ -2567,7 +2744,7 @@ " \n", " \n", " 6\n", - " 3\n", + " 4444444444444444444\n", " 3\n", " 804.0\n", " 804.0\n", @@ -2587,7 +2764,7 @@ " \n", " \n", " 7\n", - " 3\n", + " 4444444444444444444\n", " 4\n", " 808.0\n", " 808.0\n", @@ -2607,7 +2784,7 @@ " \n", " \n", " 8\n", - " 3\n", + " 4444444444444444444\n", " 5\n", " 812.0\n", " 812.0\n", @@ -2627,7 +2804,7 @@ " \n", " \n", " 9\n", - " 4\n", + " 5555555555555555555\n", " 8\n", " 2005.0\n", " 2005.0\n", @@ -2647,7 +2824,7 @@ " \n", " \n", " 10\n", - " 4\n", + " 5555555555555555555\n", " 9\n", " 2010.0\n", " 2010.0\n", @@ -2667,7 +2844,7 @@ " \n", " \n", " 11\n", - " 4\n", + " 5555555555555555555\n", " 10\n", " 2015.0\n", " 2015.0\n", @@ -2687,7 +2864,7 @@ " \n", " \n", " 12\n", - " 4\n", + " 5555555555555555555\n", " 11\n", " 2020.0\n", " 2020.0\n", @@ -2707,7 +2884,7 @@ " \n", " \n", " 13\n", - " 5\n", + " 6666666666666666666\n", " 17\n", " 4206.0\n", " 4206.0\n", @@ -2727,7 +2904,7 @@ " \n", " \n", " 14\n", - " 5\n", + " 6666666666666666666\n", " 18\n", " 4212.0\n", " 4212.0\n", @@ -2747,7 +2924,7 @@ " \n", " \n", " 15\n", - " 5\n", + " 6666666666666666666\n", " 19\n", " 4218.0\n", " 4218.0\n", @@ -2767,7 +2944,7 @@ " \n", " \n", " 16\n", - " 6\n", + " 7777777777777777777\n", " 12\n", " 3507.0\n", " 3507.0\n", @@ -2787,7 +2964,7 @@ " \n", " \n", " 17\n", - " 6\n", + " 7777777777777777777\n", " 13\n", " 3514.0\n", " 3514.0\n", @@ -2807,7 +2984,7 @@ " \n", " \n", " 18\n", - " 6\n", + " 7777777777777777777\n", " 14\n", " 3521.0\n", " 3521.0\n", @@ -2825,31 +3002,94 @@ " 1\n", " 1\n", " \n", + " \n", + " 19\n", + " 8888888888888888888\n", + " 20\n", + " 6408.0\n", + " 6408.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 20\n", + " 8888888888888888888\n", + " 21\n", + " 6416.0\n", + " 6416.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 21\n", + " 8888888888888888888\n", + " 22\n", + " 6424.0\n", + " 6424.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n", - "0 0 6 221.0 221.0 \n", - "1 1 6 442.0 442.0 \n", - "2 0 7 222.0 222.0 \n", - "3 1 7 444.0 444.0 \n", - "4 2 15 1803.0 1803.0 \n", - "5 2 16 1806.0 1806.0 \n", - "6 3 3 804.0 804.0 \n", - "7 3 4 808.0 808.0 \n", - "8 3 5 812.0 812.0 \n", - "9 4 8 2005.0 2005.0 \n", - "10 4 9 2010.0 2010.0 \n", - "11 4 10 2015.0 2015.0 \n", - "12 4 11 2020.0 2020.0 \n", - "13 5 17 4206.0 4206.0 \n", - "14 5 18 4212.0 4212.0 \n", - "15 5 19 4218.0 4218.0 \n", - "16 6 12 3507.0 3507.0 \n", - "17 6 13 3514.0 3514.0 \n", - "18 6 14 3521.0 3521.0 \n", + " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n", + "0 1111111111111111111 6 221.0 221.0 \n", + "1 1111111111111111111 7 222.0 222.0 \n", + "2 2222222222222222222 6 442.0 442.0 \n", + "3 2222222222222222222 7 444.0 444.0 \n", + "4 3333333333333333333 15 1803.0 1803.0 \n", + "5 3333333333333333333 16 1806.0 1806.0 \n", + "6 4444444444444444444 3 804.0 804.0 \n", + "7 4444444444444444444 4 808.0 808.0 \n", + "8 4444444444444444444 5 812.0 812.0 \n", + "9 5555555555555555555 8 2005.0 2005.0 \n", + "10 5555555555555555555 9 2010.0 2010.0 \n", + "11 5555555555555555555 10 2015.0 2015.0 \n", + "12 5555555555555555555 11 2020.0 2020.0 \n", + "13 6666666666666666666 17 4206.0 4206.0 \n", + "14 6666666666666666666 18 4212.0 4212.0 \n", + "15 6666666666666666666 19 4218.0 4218.0 \n", + "16 7777777777777777777 12 3507.0 3507.0 \n", + "17 7777777777777777777 13 3514.0 3514.0 \n", + "18 7777777777777777777 14 3521.0 3521.0 \n", + "19 8888888888888888888 20 6408.0 6408.0 \n", + "20 8888888888888888888 21 6416.0 6416.0 \n", + "21 8888888888888888888 22 6424.0 6424.0 \n", "\n", " APEX_INTENSITY TOTAL_MI VAR_INTENSITY_SCORE VAR_INTENSITY_RATIO_SCORE \\\n", "0 1 1 1 1 \n", @@ -2871,6 +3111,9 @@ "16 1 1 1 1 \n", "17 1 1 1 1 \n", "18 1 1 1 1 \n", + "19 1 1 1 1 \n", + "20 1 1 1 1 \n", + "21 1 1 1 1 \n", "\n", " VAR_LOG_INTENSITY VAR_XCORR_COELUTION VAR_XCORR_SHAPE VAR_LOG_SN_SCORE \\\n", "0 1 1 1 1 \n", @@ -2892,6 +3135,9 @@ "16 1 1 1 1 \n", "17 1 1 1 1 \n", "18 1 1 1 1 \n", + "19 1 1 1 1 \n", + "20 1 1 1 1 \n", + "21 1 1 1 1 \n", "\n", " VAR_MASSDEV_SCORE VAR_MI_SCORE VAR_MI_RATIO_SCORE \\\n", "0 1 1 1 \n", @@ -2913,6 +3159,9 @@ "16 1 1 1 \n", "17 1 1 1 \n", "18 1 1 1 \n", + "19 1 1 1 \n", + "20 1 1 1 \n", + "21 1 1 1 \n", "\n", " VAR_ISOTOPE_CORRELATION_SCORE VAR_ISOTOPE_OVERLAP_SCORE \n", "0 1 1 \n", @@ -2933,30 +3182,35 @@ "15 1 1 \n", "16 1 1 \n", "17 1 1 \n", - "18 1 1 " + "18 1 1 \n", + "19 1 1 \n", + "20 1 1 \n", + "21 1 1 " ] }, - "execution_count": 31, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (feature_transition['feature_id'] + 1) #should be equal to product_mz * (feature_id + 1)\n", + "length = 22\n", + "\n", + "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (psuedo_feature_id) #should be equal to product_mz * (feature_id + 1)\n", "feature_transition['TOTAL_AREA_INTENSITY'] = feature_transition['AREA_INTENSITY']\n", - "feature_transition['APEX_INTENSITY'] = [1] * 19\n", - "feature_transition['TOTAL_MI'] = [1] * 19\n", - "feature_transition['VAR_INTENSITY_SCORE'] = [1] * 19\n", - "feature_transition['VAR_INTENSITY_RATIO_SCORE'] = [1] * 19\n", - "feature_transition['VAR_LOG_INTENSITY'] = [1] * 19\n", - "feature_transition['VAR_XCORR_COELUTION'] = [1] * 19\n", - "feature_transition['VAR_XCORR_SHAPE'] = [1] * 19\n", - "feature_transition['VAR_LOG_SN_SCORE'] = [1] * 19\n", - "feature_transition['VAR_MASSDEV_SCORE'] = [1] * 19\n", - "feature_transition['VAR_MI_SCORE'] = [1] * 19\n", - "feature_transition['VAR_MI_RATIO_SCORE'] = [1] * 19\n", - "feature_transition['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * 19\n", - "feature_transition['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * 19\n", + "feature_transition['APEX_INTENSITY'] = [1] * length\n", + "feature_transition['TOTAL_MI'] = [1] * length\n", + "feature_transition['VAR_INTENSITY_SCORE'] = [1] * length\n", + "feature_transition['VAR_INTENSITY_RATIO_SCORE'] = [1] * length\n", + "feature_transition['VAR_LOG_INTENSITY'] = [1] * length\n", + "feature_transition['VAR_XCORR_COELUTION'] = [1] * length\n", + "feature_transition['VAR_XCORR_SHAPE'] = [1] * length\n", + "feature_transition['VAR_LOG_SN_SCORE'] = [1] * length\n", + "feature_transition['VAR_MASSDEV_SCORE'] = [1] * length\n", + "feature_transition['VAR_MI_SCORE'] = [1] * length\n", + "feature_transition['VAR_MI_RATIO_SCORE'] = [1] * length\n", + "feature_transition['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * length\n", + "feature_transition['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * length\n", "\n", "feature_transition = feature_transition.drop(columns=['precursor_id', 'PRECURSOR_ID', 'TRAML_ID', 'PRODUCT_MZ'])\n", "\n", @@ -2965,7 +3219,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 36, "id": "588ee10a-7b00-4f61-b305-2a392b5bbd1b", "metadata": {}, "outputs": [], @@ -2982,7 +3236,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "id": "78f13b81-7db1-46ee-947b-723e8b0b340b", "metadata": {}, "outputs": [], @@ -3008,7 +3262,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 38, "id": "8c143ee6-928c-4404-85cc-5fc7b9b1ce85", "metadata": {}, "outputs": [], @@ -3036,17 +3290,17 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "id": "9a24a430-d994-4012-9ac3-37c792e30026", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 35, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -3058,7 +3312,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 40, "id": "563c3e64-e528-457c-ba42-f00fabcff0f0", "metadata": { "tags": [] @@ -3084,7 +3338,18 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 41, + "id": "d2f151a4-a735-4034-b573-d98faf5c7d76", + "metadata": {}, + "outputs": [], + "source": [ + "# calculations with new feature id would result in overflow so just take first digit for calculations\n", + "psuedo_feature_id = (feature_ms1['feature_id'].astype(str).str.slice(start=0, stop=1)).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "id": "c8539aa2-f946-4be8-a891-0b55d6957322", "metadata": {}, "outputs": [ @@ -3120,8 +3385,8 @@ " \n", " \n", " 0\n", - " 0\n", - " 1320\n", + " 1111111111111111111\n", + " 1540\n", " 1\n", " 1\n", " 1\n", @@ -3129,8 +3394,8 @@ " \n", " \n", " 1\n", - " 1\n", - " 2640\n", + " 2222222222222222222\n", + " 3080\n", " 1\n", " 1\n", " 1\n", @@ -3138,8 +3403,8 @@ " \n", " \n", " 2\n", - " 2\n", - " 7200\n", + " 3333333333333333333\n", + " 9000\n", " 1\n", " 1\n", " 1\n", @@ -3147,8 +3412,8 @@ " \n", " \n", " 3\n", - " 3\n", - " 4000\n", + " 4444444444444444444\n", + " 4800\n", " 1\n", " 1\n", " 1\n", @@ -3156,7 +3421,7 @@ " \n", " \n", " 4\n", - " 4\n", + " 5555555555555555555\n", " 2000\n", " 1\n", " 1\n", @@ -3165,8 +3430,8 @@ " \n", " \n", " 5\n", - " 5\n", - " 8400\n", + " 6666666666666666666\n", + " 12600\n", " 1\n", " 1\n", " 1\n", @@ -3174,8 +3439,17 @@ " \n", " \n", " 6\n", - " 6\n", - " 10500\n", + " 7777777777777777777\n", + " 14000\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 7\n", + " 8888888888888888888\n", + " 12800\n", " 1\n", " 1\n", " 1\n", @@ -3186,34 +3460,36 @@ "" ], "text/plain": [ - " feature_id SCORE RANK PVALUE QVALUE PEP\n", - "0 0 1320 1 1 1 1\n", - "1 1 2640 1 1 1 1\n", - "2 2 7200 1 1 1 1\n", - "3 3 4000 1 1 1 1\n", - "4 4 2000 1 1 1 1\n", - "5 5 8400 1 1 1 1\n", - "6 6 10500 1 1 1 1" + " feature_id SCORE RANK PVALUE QVALUE PEP\n", + "0 1111111111111111111 1540 1 1 1 1\n", + "1 2222222222222222222 3080 1 1 1 1\n", + "2 3333333333333333333 9000 1 1 1 1\n", + "3 4444444444444444444 4800 1 1 1 1\n", + "4 5555555555555555555 2000 1 1 1 1\n", + "5 6666666666666666666 12600 1 1 1 1\n", + "6 7777777777777777777 14000 1 1 1 1\n", + "7 8888888888888888888 12800 1 1 1 1" ] }, - "execution_count": 37, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "length = 8 \n", "score_ms2 = feature_ms1[['feature_id']].copy()\n", - "score_ms2['SCORE'] = (features['id'] + 1) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n", - "score_ms2['RANK'] = [1] *7\n", - "score_ms2['PVALUE'] = [1] * 7\n", - "score_ms2['QVALUE'] = [1] *7 \n", - "score_ms2['PEP'] = [1] *7\n", + "score_ms2['SCORE'] = (psuedo_feature_id) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n", + "score_ms2['RANK'] = [1] * length\n", + "score_ms2['PVALUE'] = [1] * length\n", + "score_ms2['QVALUE'] = [1] * length \n", + "score_ms2['PEP'] = [1] * length\n", "score_ms2" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "id": "4cbda7cf-0535-4292-bfed-739a5f1bd2b8", "metadata": {}, "outputs": [], @@ -3228,7 +3504,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 44, "id": "adb88443-6d34-4173-8b37-9f52dba9f5e7", "metadata": {}, "outputs": [], @@ -3254,17 +3530,17 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "id": "e0094b3a-5a80-48e4-8041-a537ce409480", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 40, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -3276,7 +3552,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 46, "id": "acf865f3-3353-4baa-b83e-91be2abed776", "metadata": { "tags": [] @@ -3303,7 +3579,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 47, "id": "f95142b9-612b-43a8-bb42-356b71839ea6", "metadata": {}, "outputs": [], @@ -3313,7 +3589,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 48, "id": "43828588-c1ff-4943-a7b7-24a968562c4e", "metadata": {}, "outputs": [ @@ -3347,41 +3623,48 @@ " \n", " \n", " 0\n", - " 3\n", + " 4\n", " TTTTTTTTTTTTK\n", " TTTTTTTTTTTTK\n", " 0\n", " \n", " \n", " 1\n", - " 2\n", + " 3\n", " TTTTTTTR\n", " TTTTTTTR\n", " 0\n", " \n", " \n", " 2\n", - " 4\n", + " 5\n", " YYYYYR\n", " YYYYYR\n", " 0\n", " \n", " \n", " 3\n", - " 1\n", + " 2\n", " TTR\n", " TTR\n", " 0\n", " \n", " \n", " 4\n", - " 5\n", + " 1\n", + " TTK\n", + " TTK\n", + " 1\n", + " \n", + " \n", + " 5\n", + " 6\n", " YYYYYYYYYYYK\n", " YYYYYYYYYYYK\n", " 0\n", " \n", " \n", - " 5\n", + " 6\n", " 0\n", " GGGGGGGGGGR\n", " GGGGGGGGGGR\n", @@ -3393,15 +3676,16 @@ ], "text/plain": [ " ID UNMODIFIED_SEQUENCE MODIFIED_SEQUENCE DECOY\n", - "0 3 TTTTTTTTTTTTK TTTTTTTTTTTTK 0\n", - "1 2 TTTTTTTR TTTTTTTR 0\n", - "2 4 YYYYYR YYYYYR 0\n", - "3 1 TTR TTR 0\n", - "4 5 YYYYYYYYYYYK YYYYYYYYYYYK 0\n", - "5 0 GGGGGGGGGGR GGGGGGGGGGR 0" + "0 4 TTTTTTTTTTTTK TTTTTTTTTTTTK 0\n", + "1 3 TTTTTTTR TTTTTTTR 0\n", + "2 5 YYYYYR YYYYYR 0\n", + "3 2 TTR TTR 0\n", + "4 1 TTK TTK 1\n", + "5 6 YYYYYYYYYYYK YYYYYYYYYYYK 0\n", + "6 0 GGGGGGGGGGR GGGGGGGGGGR 0" ] }, - "execution_count": 43, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -3412,7 +3696,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 49, "id": "f416e48b-6bb6-4cb7-8d81-597cfd52320c", "metadata": { "tags": [] @@ -3425,7 +3709,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 50, "id": "0e1eb1b9-730e-45d4-9618-fd532c1ccc25", "metadata": {}, "outputs": [], @@ -3441,7 +3725,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 51, "id": "e9692e06-ddf2-4f74-bb80-f2a92728767b", "metadata": {}, "outputs": [], @@ -3456,7 +3740,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 52, "id": "f720c22b-e6fa-4ac0-8402-bdcd2e74840b", "metadata": {}, "outputs": [], @@ -3482,17 +3766,17 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 53, "id": "94c860e0-880a-4091-afb8-af368ed72b26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 48, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -3504,7 +3788,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 54, "id": "1c053178-b8a5-44ad-876b-49b5fd8afa23", "metadata": { "tags": [] @@ -3531,7 +3815,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 55, "id": "d70ba894-55bf-4a36-b306-45b7c5e9d1bd", "metadata": {}, "outputs": [], @@ -3541,7 +3825,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 56, "id": "3e1bbbeb-7cc9-4b9f-b898-5148abff911d", "metadata": {}, "outputs": [ @@ -3574,34 +3858,41 @@ " \n", " \n", " 0\n", - " 2\n", + " 3\n", " ProtY\n", " 0\n", " \n", " \n", " 1\n", - " 1\n", + " 2\n", " ProtT\n", " 0\n", " \n", " \n", " 2\n", - " 0\n", + " 1\n", " ProtG\n", " 0\n", " \n", + " \n", + " 3\n", + " 0\n", + " Decoy_ProtT\n", + " 1\n", + " \n", " \n", "\n", "" ], "text/plain": [ " ID PROTEIN_ACCESSION DECOY\n", - "0 2 ProtY 0\n", - "1 1 ProtT 0\n", - "2 0 ProtG 0" + "0 3 ProtY 0\n", + "1 2 ProtT 0\n", + "2 1 ProtG 0\n", + "3 0 Decoy_ProtT 1" ] }, - "execution_count": 51, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -3612,7 +3903,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 57, "id": "7b3410b1-5d6a-4e85-838c-5ccb3b15f1c5", "metadata": {}, "outputs": [ @@ -3649,6 +3940,16 @@ " \n", " \n", " 0\n", + " 3\n", + " 3\n", + " 1\n", + " 1\n", + " 1\n", + " global\n", + " 1\n", + " \n", + " \n", + " 1\n", " 2\n", " 2\n", " 1\n", @@ -3658,7 +3959,7 @@ " 1\n", " \n", " \n", - " 1\n", + " 2\n", " 1\n", " 1\n", " 1\n", @@ -3668,7 +3969,7 @@ " 1\n", " \n", " \n", - " 2\n", + " 3\n", " 0\n", " 0\n", " 1\n", @@ -3683,12 +3984,13 @@ ], "text/plain": [ " PROTEIN_ID SCORE PVALUE QVALUE PEP CONTEXT RUN_ID\n", - "0 2 2 1 1 1 global 1\n", - "1 1 1 1 1 1 global 1\n", - "2 0 0 1 1 1 global 1" + "0 3 3 1 1 1 global 1\n", + "1 2 2 1 1 1 global 1\n", + "2 1 1 1 1 1 global 1\n", + "3 0 0 1 1 1 global 1" ] }, - "execution_count": 52, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -3706,7 +4008,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 58, "id": "51408b81-b650-4787-9050-59d63c0098c0", "metadata": {}, "outputs": [], @@ -3721,7 +4023,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 59, "id": "ab6a68b5-f5db-46e7-95e8-6e98a69eb062", "metadata": {}, "outputs": [], @@ -3746,7 +4048,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/tests/data/dummyOSWScoredData.osw b/tests/data/dummyOSWScoredData.osw index c96832a1..82311a35 100644 Binary files a/tests/data/dummyOSWScoredData.osw and b/tests/data/dummyOSWScoredData.osw differ diff --git a/tests/data/fakeLib.tsv b/tests/data/fakeLib.tsv index 3831e0e3..d9189e14 100644 --- a/tests/data/fakeLib.tsv +++ b/tests/data/fakeLib.tsv @@ -1,21 +1,24 @@ -PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime -100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 -100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10 -100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 -200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 -200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 -200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 -220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 -220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 -400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 -400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 -400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 -400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 -500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 -500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 -500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 -600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 -600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 -700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 -700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 -700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 +PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime Decoy +100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 0 +100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10 0 +100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 0 +200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 0 +200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 0 +200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 0 +220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 0 +220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 0 +400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 0 +400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 0 +400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 0 +400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 0 +500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 0 +500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 0 +500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 0 +600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 0 +600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 0 +700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 0 +700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 0 +700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 0 +800 801 808 80 Decoy_ProtT TTK TTK 3 b 1 3 Decoy_T 80 1 +800 802 808 80 Decoy_ProtT TTK TTK 3 y 2 3 Decoy_T 80 1 +800 803 808 80 Decoy_ProtT TTK TTK 3 b 3 3 Decoy_T 80 1 diff --git a/tests/fakeLib.tsv b/tests/fakeLib.tsv deleted file mode 100644 index 3831e0e3..00000000 --- a/tests/fakeLib.tsv +++ /dev/null @@ -1,21 +0,0 @@ -PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime -100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 -100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10 -100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 -200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 -200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 -200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 -220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 -220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 -400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 -400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 -400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 -400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 -500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 -500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 -500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 -600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 -600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 -700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 -700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 -700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 diff --git a/tests/test_pyprophet_export_parquet.py b/tests/test_pyprophet_export_parquet.py index 6c01696d..647560b6 100644 --- a/tests/test_pyprophet_export_parquet.py +++ b/tests/test_pyprophet_export_parquet.py @@ -27,7 +27,7 @@ def _run_cmdline(cmdline): return stdout -def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False): +def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False, noDecoys=False): os.chdir(temp_folder) DATA_NAME="dummyOSWScoredData.osw" data_path = os.path.join(DATA_FOLDER, DATA_NAME) @@ -41,41 +41,68 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin cmdline += " --transitionLevel" if onlyFeatures: cmdline += " --onlyFeatures" + if noDecoys: + cmdline += " --noDecoys" stdout = _run_cmdline(cmdline) ### This file was configured in a way where the following tests should work parquet = pd.read_parquet("dummyOSWScoredData.parquet") ## automatically with parquet ending of input file name + ### CHECK LENGTHS ### if transitionLevel: if onlyFeatures: # length of FEATURE_TRANSITION table - expectedLength = len(pd.read_sql("select * from feature_transition", conn)) + if not noDecoys: + expectedLength = len(pd.read_sql("select * from feature_transition", conn)) + else: + expectedLength = len(pd.read_sql("select * from feature_transition inner join transition on transition.id = feature_transition.transition_id where DECOY == 0", conn)) else: - featureTransition = pd.read_sql("select * from feature_transition", conn) - precursorTransition = pd.read_sql("select * from transition_precursor_mapping", conn) + if not noDecoys: + featureTransition = pd.read_sql("select * from feature_transition", conn) + precursorTransition = pd.read_sql("select * from transition_precursor_mapping", conn) + else: + featureTransition = pd.read_sql("select * from feature_transition inner join transition on transition.id = feature_transition.transition_id where DECOY == 0", conn) + precursorTransition = pd.read_sql("select * from transition_precursor_mapping inner join transition on transition.id = transition_precursor_mapping.transition_id where DECOY=0", conn) + featureTable = pd.read_sql("select * from feature", conn) numTransNoFeature = len(precursorTransition[~precursorTransition['PRECURSOR_ID'].isin(featureTable['PRECURSOR_ID'])]) expectedLength = numTransNoFeature + len(featureTransition) - assert(expectedLength == len(parquet)) else: if onlyFeatures: # expected length, length of feature table - expectedLength = len(pd.read_sql("select * from feature", conn)) + if noDecoys: + expectedLength = len(pd.read_sql("select * from feature inner join precursor on feature.precursor_id = precursor.id where decoy = 0", conn)) + else: + expectedLength = len(pd.read_sql("select * from feature inner join precursor on precursor.id = feature.precursor_id", conn)) else: # Expected length is number of features + number of precursors with no feature - featureTable = pd.read_sql("select * from feature", conn) - precTable = pd.read_sql("select * from precursor", conn) + if noDecoys: + featureTable = pd.read_sql("select * from feature inner join precursor on feature.precursor_id = precursor.id where decoy = 0", conn) + else: + featureTable = pd.read_sql("select * from feature", conn) + + if noDecoys: + precTable = pd.read_sql("select * from precursor where decoy = 0", conn) + else: + precTable = pd.read_sql("select * from precursor", conn) numPrecsNoFeature = len(precTable[~precTable['ID'].isin(featureTable['PRECURSOR_ID'])]) expectedLength = numPrecsNoFeature + len(featureTable) - assert(expectedLength == len(parquet)) + assert(expectedLength == len(parquet)) - ########### FEATURE LEVEL TESTS ######## + + ########### FEATURE LEVEL VALUE TESTS ######## # Tests that columns are equal across different sqlite3 tables to ensure joins occured correctly + # since cannot compare NAN drop rows which contain an NAN + na_columns = ['PRECURSOR.LIBRARY_INTENSITY'] # this is a list of columns which expect to be NAN + parquet = parquet.drop(columns=na_columns).dropna() + + assert(len(parquet) > 0) # assert that did not just drop everything (means that missed an na column) + if transitionLevel: ## check features and transitions joined properly for those all cases (including those with no features ## Way library was created precursor and transition m/z both are in the same 100s (e.g. if precursor m/z is 700 transition mz can be 701) @@ -83,21 +110,26 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin ### Note: Current tests assume no na parquet = parquet.dropna() - proxy_feature_id = parquet['FEATURE_ID'].astype(str).apply(lambda x: x[0]).astype(int) # since id is complicated, dummy values created using a proxy id which is the first digit of the actual id + pseudo_feature_id = (parquet['FEATURE_ID'].astype(str).str.slice(start=0, stop=1)).astype(int) pd.testing.assert_series_equal(parquet['FEATURE_MS1.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS2.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS1.EXP_IM'], parquet['FEATURE_MS2.EXP_IM'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS2.DELTA_IM'], parquet['FEATURE_MS1.DELTA_IM'], **pd_testing_kwargs) - pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * (proxy_feature_id), **pd_testing_kwargs) - print(parquet.columns) + pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * pseudo_feature_id, **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PEPTIDE.SCORE_GLOBAL'], parquet['PEPTIDE_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PROTEIN.SCORE_GLOBAL'], parquet['PROTEIN_ID'], **pd_testing_kwargs) + # check is/no decoys + if noDecoys: + assert(parquet[parquet['DECOY'] == 1].shape[0] == 0) + + + ############### TRANSTION LEVEL TESTS ################ if transitionLevel: - pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * (proxy_feature_id), **pd_testing_kwargs) + pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * pseudo_feature_id, **pd_testing_kwargs) def test_export_parquet_single_run(tmpdir): _run_export_parquet_single_run(tmpdir, transitionLevel=False) @@ -112,4 +144,10 @@ def test_export_parquet_single_run_onlyFeatures(tmpdir): def test_export_parquet_single_run_transitionLevel_onlyFeatures(tmpdir): - _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True) \ No newline at end of file + _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True) + +def test_export_parquet_single_run_noDecoys(tmpdir): + _run_export_parquet_single_run(tmpdir, noDecoys=True) + +def test_export_parquet_single_run_transitionLevel_noDecoys(tmpdir): + _run_export_parquet_single_run(tmpdir, transitionLevel=True, noDecoys=True) \ No newline at end of file