From d1262a65889636700c913bbb3e55c3916718fc9f Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 6 Feb 2025 13:32:30 -0500 Subject: [PATCH 1/8] feature: transition lvl string in prec level data - export a list of transition_ids and annotations separated by semicolons in the feature level data. This is useful to get some important transition level info in the precursor dataframe. --- pyprophet/export_parquet.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py index 24b723c2..afaf6da1 100644 --- a/pyprophet/export_parquet.py +++ b/pyprophet/export_parquet.py @@ -3,6 +3,7 @@ import pandas as pd from pyprophet.export import check_sqlite_table from duckdb_extensions import extension_importer +import re def getPeptideProteinScoreTable(conndb, level): if level == 'peptide': @@ -66,6 +67,9 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): CREATE INDEX IF NOT EXISTS idx_protein_protein_id ON PROTEIN (ID); CREATE INDEX IF NOT EXISTS idx_peptide_protein_mapping_peptide_id ON PEPTIDE_PROTEIN_MAPPING (PEPTIDE_ID); + CREATE INDEX IF NOT EXISTS idx_transition_id ON TRANSITION (ID); + CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_transition_id ON TRANSITION_PRECURSOR_MAPPING (TRANSITION_ID); + CREATE INDEX IF NOT EXISTS idx_transition_precursor_mapping_precursor_id ON TRANSITION_PRECURSOR_MAPPING (PRECURSOR_ID); ''' if check_sqlite_table(con, "FEATURE_MS1"): @@ -200,19 +204,27 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): # create a list of all the columns columns_list = [col for c in columns.values() for col in c] + + # create a list of just aliases for groupby + pattern = re.compile(r"(.*)\sAS") + alias_list = [ pattern.search(col).group(1) for c in columns.values() for col in c] # join the list into a single string separated by a comma and a space columnsToSelect = ", ".join(columns_list) - -
join_features = "LEFT JOIN" if onlyFeatures else "FULL JOIN" + aliasToSelect = ", ".join(alias_list) # First read feature data # Feature Data if not transitionLevel: feature_query = f''' - SELECT {columnsToSelect} - FROM FEATURE - {join_features} PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID + SELECT {columnsToSelect}, + GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', + GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION' + FROM TRANSITION_PRECURSOR_MAPPING + LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID @@ -224,6 +236,7 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): {gene_table_joins} {pepJoin} {protJoin} + GROUP BY {aliasToSelect} ''' else: # is transition level From 1206b232a8a17c6ca327263b98b41497655e5296 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 6 Feb 2025 13:54:10 -0500 Subject: [PATCH 2/8] fix: failing tests --- pyprophet/export_parquet.py | 69 +++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py index afaf6da1..2f44ab18 100644 --- a/pyprophet/export_parquet.py +++ b/pyprophet/export_parquet.py @@ -216,28 +216,53 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): # First read feature data # Feature Data if not transitionLevel: - feature_query = f''' - SELECT {columnsToSelect}, - GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', - GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION' - FROM 
TRANSITION_PRECURSOR_MAPPING - LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - GROUP BY {aliasToSelect} - ''' + if not onlyFeatures: + feature_query = f''' + SELECT {columnsToSelect}, + GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', + GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION' + FROM TRANSITION_PRECURSOR_MAPPING + LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID + LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID + LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID + LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID + LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID + LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID + LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID + LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = 
PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID + LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID + {gene_table_joins} + {pepJoin} + {protJoin} + GROUP BY {aliasToSelect} + ''' + else: + feature_query = f''' + SELECT {columnsToSelect}, + GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', + GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION' + FROM FEATURE_TRANSITION + LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID + LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID + LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID + LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID + LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID + LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID + LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID + LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID + LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID + {gene_table_joins} + {pepJoin} + {protJoin} + GROUP BY {aliasToSelect} + ''' + else: # is transition level # merge transition and precursor level data From 990a0b8ad2ef7dd734941bdc99b7a134b722fb7b Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 6 Feb 2025 17:47:35 -0500 Subject: [PATCH 3/8] refactor: combine queries into one queries for transition level and precursor level quite similar now so refactor code so less redundancy --- pyprophet/export_parquet.py | 143 ++++++++++++------------------------ 1 file changed, 49 insertions(+), 94 deletions(-) diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py index 2f44ab18..ddb6084b 100644 --- a/pyprophet/export_parquet.py +++ b/pyprophet/export_parquet.py @@ 
-213,97 +213,52 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): columnsToSelect = ", ".join(columns_list) aliasToSelect = ", ".join(alias_list) - # First read feature data - # Feature Data - if not transitionLevel: - if not onlyFeatures: - feature_query = f''' - SELECT {columnsToSelect}, - GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', - GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION' - FROM TRANSITION_PRECURSOR_MAPPING - LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - GROUP BY {aliasToSelect} - ''' - else: - feature_query = f''' - SELECT {columnsToSelect}, - GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', - GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION' - FROM FEATURE_TRANSITION - LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT 
JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - GROUP BY {aliasToSelect} - ''' - - else: # is transition level - - # merge transition and precursor level data - if not onlyFeatures: - feature_query = f''' - SELECT {columnsToSelect} - FROM TRANSITION_PRECURSOR_MAPPING - LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - ''' - else: - feature_query = f''' - SELECT {columnsToSelect} - FROM FEATURE_TRANSITION - LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID - LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID - LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID - LEFT JOIN 
TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID - LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID - LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID - LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID - LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID - LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID - LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID - LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID - LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID - {gene_table_joins} - {pepJoin} - {protJoin} - ''' - condb.sql(feature_query).write_parquet(outfile) \ No newline at end of file + # For feature level group important transition level data into one row separated by ';' + featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else "" + featureLvlSuffix = f'GROUP BY {aliasToSelect}' if not transitionLevel else "" + + if not onlyFeatures: + query = f''' + SELECT {columnsToSelect}, + {featureLvlPrefix} + FROM TRANSITION_PRECURSOR_MAPPING + LEFT JOIN TRANSITION ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN PRECURSOR ON TRANSITION_PRECURSOR_MAPPING.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN FEATURE_TRANSITION ON TRANSITION.ID = FEATURE_TRANSITION.TRANSITION_ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID + LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID + LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID + LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID + LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID + LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID + LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID + LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON 
PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID + LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID + {gene_table_joins} + {pepJoin} + {protJoin} + {featureLvlSuffix} + ''' + else: + query = f''' + SELECT {columnsToSelect}, + {featureLvlPrefix} + FROM FEATURE_TRANSITION + LEFT JOIN TRANSITION ON FEATURE_TRANSITION.TRANSITION_ID = TRANSITION.ID + LEFT JOIN FEATURE ON FEATURE_TRANSITION.FEATURE_ID = FEATURE.ID + LEFT JOIN PRECURSOR ON FEATURE.PRECURSOR_ID = PRECURSOR.ID + LEFT JOIN TRANSITION_PRECURSOR_MAPPING ON TRANSITION_PRECURSOR_MAPPING.TRANSITION_ID = TRANSITION.ID + LEFT JOIN RUN ON FEATURE.RUN_ID = RUN.ID + LEFT JOIN FEATURE_MS1 ON FEATURE.ID = FEATURE_MS1.FEATURE_ID + LEFT JOIN FEATURE_MS2 ON FEATURE.ID = FEATURE_MS2.FEATURE_ID + LEFT JOIN SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID + LEFT JOIN PRECURSOR_PEPTIDE_MAPPING ON PRECURSOR.ID = PRECURSOR_PEPTIDE_MAPPING.PRECURSOR_ID + LEFT JOIN PEPTIDE ON PRECURSOR_PEPTIDE_MAPPING.PEPTIDE_ID = PEPTIDE.ID + LEFT JOIN PEPTIDE_PROTEIN_MAPPING ON PEPTIDE.ID = PEPTIDE_PROTEIN_MAPPING.PEPTIDE_ID + LEFT JOIN PROTEIN ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = PROTEIN.ID + {gene_table_joins} + {pepJoin} + {protJoin} + {featureLvlSuffix} + ''' + condb.sql(query).write_parquet(outfile) \ No newline at end of file From c4b6dd6e260059a87ec0fc39ad65ac4d55d8ca4d Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 7 Feb 2025 07:52:58 -0500 Subject: [PATCH 4/8] feature: option to exclude decoys --- pyprophet/export_parquet.py | 6 +++++- pyprophet/main.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pyprophet/export_parquet.py b/pyprophet/export_parquet.py index ddb6084b..1f89fa54 100644 --- a/pyprophet/export_parquet.py +++ b/pyprophet/export_parquet.py @@ -32,7 +32,7 @@ def getVarColumnNames(condb, tableName): # this method is only currently supported for combined output and not with ipf -def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): +def
export_to_parquet(infile, outfile, transitionLevel=False, onlyFeatures=False, noDecoys=False): ''' Convert an OSW sqlite file to Parquet format @@ -217,6 +217,8 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): featureLvlPrefix = "GROUP_CONCAT(TRANSITION.ID, ';') AS 'TRANSITION_ID', GROUP_CONCAT(TRANSITION.ANNOTATION, ';') AS 'TRANSITION_ANNOTATION'" if not transitionLevel else "" featureLvlSuffix = f'GROUP BY {aliasToSelect}' if not transitionLevel else "" + decoyExclude = "WHERE PRECURSOR.DECOY == 0" if noDecoys else "" + if not onlyFeatures: query = f''' SELECT {columnsToSelect}, @@ -237,6 +239,7 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): {gene_table_joins} {pepJoin} {protJoin} + {decoyExclude} {featureLvlSuffix} ''' else: @@ -259,6 +262,7 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False): {gene_table_joins} {pepJoin} {protJoin} + {decoyExclude} {featureLvlSuffix} ''' condb.sql(query).write_parquet(outfile) \ No newline at end of file diff --git a/pyprophet/main.py b/pyprophet/main.py index 24f97abe..eed24392 100644 --- a/pyprophet/main.py +++ b/pyprophet/main.py @@ -368,7 +368,8 @@ def export(infile, outfile, format, outcsv, transition_quantification, max_trans @click.option('--out', 'outfile', required=False, type=click.Path(exists=False), help='Output parquet file.') @click.option('--transitionLevel', 'transitionLevel', is_flag=True, help='Whether to export transition level data as well') @click.option('--onlyFeatures', 'onlyFeatures', is_flag=True, help='Only include precursors that have a corresponding feature') -def export_parquet(infile, outfile, transitionLevel, onlyFeatures): +@click.option('--noDecoys', 'noDecoys', is_flag=True, help='Do not include decoys in the exported data') +def export_parquet(infile, outfile, transitionLevel, onlyFeatures, noDecoys): """ Export all transition data to parquet file """ @@ -381,7 +382,7 @@ def export_parquet(infile, 
outfile, transitionLevel, onlyFeatures): if not overwrite: raise click.ClickException(f"Aborting: {outfile} already exists!") click.echo("Info: Parquet file will be written to {}".format(outfile)) - export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures) + export_to_parquet(os.path.abspath(infile), os.path.abspath(outfile), transitionLevel, onlyFeatures, noDecoys) # Export Compound TSV @cli.command() From cf2387f51961571702b8c7292aa50ee2c980db81 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 7 Feb 2025 12:17:27 -0500 Subject: [PATCH 5/8] remove fake_lib duplicate --- tests/fakeLib.tsv | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 tests/fakeLib.tsv diff --git a/tests/fakeLib.tsv b/tests/fakeLib.tsv deleted file mode 100644 index 3831e0e3..00000000 --- a/tests/fakeLib.tsv +++ /dev/null @@ -1,21 +0,0 @@ -PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime -100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 -100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10 -100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 -200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 -200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 -200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 -220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 -220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 -400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 -400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 -400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 -400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 -500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 -500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 -500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 -600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 -600 602 
206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 -700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 -700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 -700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 From 45a58c97e23f52e557a52af41ab4541d663903eb Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Fri, 7 Feb 2025 12:18:20 -0500 Subject: [PATCH 6/8] test: add tests for no-decoys flag --- tests/Create_OSW_test.ipynb | 1490 +++++++++--------------- tests/data/dummyOSWScoredData.osw | Bin 131072 -> 237568 bytes tests/data/fakeLib.tsv | 45 +- tests/test_pyprophet_export_parquet.py | 21 +- 4 files changed, 574 insertions(+), 982 deletions(-) diff --git a/tests/Create_OSW_test.ipynb b/tests/Create_OSW_test.ipynb index 4a96d9df..fda08b68 100644 --- a/tests/Create_OSW_test.ipynb +++ b/tests/Create_OSW_test.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "dc80ebc4-f822-4853-af17-d2ccd4f10e3c", "metadata": {}, "outputs": [], @@ -54,17 +54,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 65, "id": "41ff70f2-2102-4be8-b17f-c5c51254b196", "metadata": {}, "outputs": [], "source": [ - "lib = pd.read_csv(\"fakeLib.tsv\", sep='\\t')" + "lib = pd.read_csv(\"data/fakeLib.tsv\", sep='\\t')" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 66, "id": "4325d9ba-722e-4120-8e58-803a86f87b1e", "metadata": {}, "outputs": [ @@ -102,6 +102,7 @@ " ProductCharge\n", " GeneName\n", " LibraryDriftTime\n", + " Decoy\n", " Annotation\n", " TransitionId\n", " \n", @@ -122,6 +123,7 @@ " 2\n", " Y\n", " 10\n", + " 0\n", " b1^2\n", " YYYYYYYYYYYK2_b1^2\n", " \n", @@ -140,6 +142,7 @@ " 2\n", " Y\n", " 10\n", + " 0\n", " y2^2\n", " YYYYYYYYYYYK2_y2^2\n", " \n", @@ -158,6 +161,7 @@ " 2\n", " Y\n", " 10\n", + " 0\n", " b3^2\n", " YYYYYYYYYYYK2_b3^2\n", " \n", @@ -176,6 +180,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " b1^2\n", " YYYYYR2_b1^2\n", " \n", @@ -194,6 +199,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " y2^2\n", " 
YYYYYR2_y2^2\n", " \n", @@ -212,6 +218,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " b3^2\n", " YYYYYR2_b3^2\n", " \n", @@ -230,6 +237,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " b1^2\n", " YYYYYR3_b1^2\n", " \n", @@ -248,6 +256,7 @@ " 2\n", " Y\n", " 20\n", + " 0\n", " y2^2\n", " YYYYYR3_y2^2\n", " \n", @@ -266,6 +275,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " b1^2\n", " GGGGGGGGGGR4_b1^2\n", " \n", @@ -284,6 +294,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " y2^2\n", " GGGGGGGGGGR4_y2^2\n", " \n", @@ -302,6 +313,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " b3^2\n", " GGGGGGGGGGR4_b3^2\n", " \n", @@ -320,6 +332,7 @@ " 2\n", " G\n", " 40\n", + " 0\n", " y4^2\n", " GGGGGGGGGGR4_y4^2\n", " \n", @@ -338,6 +351,7 @@ " 2\n", " T\n", " 50\n", + " 0\n", " b1^2\n", " TTTTTTTR2_b1^2\n", " \n", @@ -356,6 +370,7 @@ " 2\n", " T\n", " 50\n", + " 0\n", " y2^2\n", " TTTTTTTR2_y2^2\n", " \n", @@ -374,6 +389,7 @@ " 2\n", " T\n", " 50\n", + " 0\n", " b3^2\n", " TTTTTTTR2_b3^2\n", " \n", @@ -392,6 +408,7 @@ " 2\n", " T\n", " 60\n", + " 0\n", " b1^2\n", " TTTTTTTTTTTTK2_b1^2\n", " \n", @@ -410,6 +427,7 @@ " 2\n", " T\n", " 60\n", + " 0\n", " y2^2\n", " TTTTTTTTTTTTK2_y2^2\n", " \n", @@ -428,6 +446,7 @@ " 3\n", " T\n", " 70\n", + " 0\n", " b1^3\n", " TTR3_b1^3\n", " \n", @@ -446,6 +465,7 @@ " 3\n", " T\n", " 70\n", + " 0\n", " y2^3\n", " TTR3_y2^3\n", " \n", @@ -464,9 +484,67 @@ " 3\n", " T\n", " 70\n", + " 0\n", " b3^3\n", " TTR3_b3^3\n", " \n", + " \n", + " 20\n", + " 800\n", + " 801\n", + " 808\n", + " 80\n", + " Decoy_ProtT\n", + " TTK\n", + " TTK\n", + " 3\n", + " b\n", + " 1\n", + " 3\n", + " Decoy_T\n", + " 80\n", + " 1\n", + " b1^3\n", + " TTK3_b1^3\n", + " \n", + " \n", + " 21\n", + " 800\n", + " 802\n", + " 808\n", + " 80\n", + " Decoy_ProtT\n", + " TTK\n", + " TTK\n", + " 3\n", + " y\n", + " 2\n", + " 3\n", + " Decoy_T\n", + " 80\n", + " 1\n", + " y2^3\n", + " TTK3_y2^3\n", + " \n", + " \n", + " 22\n", + " 800\n", + " 803\n", + " 808\n", + " 80\n", + " Decoy_ProtT\n", + " TTK\n", + " 
TTK\n", + " 3\n", + " b\n", + " 3\n", + " 3\n", + " Decoy_T\n", + " 80\n", + " 1\n", + " b3^3\n", + " TTK3_b3^3\n", + " \n", " \n", "\n", "" @@ -493,28 +571,34 @@ "17 700 701 107 70 \n", "18 700 702 207 70 \n", "19 700 703 307 70 \n", + "20 800 801 808 80 \n", + "21 800 802 808 80 \n", + "22 800 803 808 80 \n", "\n", - " ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge \\\n", - "0 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", - "1 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", - "2 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", - "3 ProtY YYYYYR YYYYYR 2 \n", - "4 ProtY YYYYYR YYYYYR 2 \n", - "5 ProtY YYYYYR YYYYYR 2 \n", - "6 ProtY YYYYYR YYYYYR 3 \n", - "7 ProtY YYYYYR YYYYYR 3 \n", - "8 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "9 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "10 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "11 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", - "12 ProtT TTTTTTTR TTTTTTTR 2 \n", - "13 ProtT TTTTTTTR TTTTTTTR 2 \n", - "14 ProtT TTTTTTTR TTTTTTTR 2 \n", - "15 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", - "16 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", - "17 ProtT TTR TTR 3 \n", - "18 ProtT TTR TTR 3 \n", - "19 ProtT TTR TTR 3 \n", + " ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge \\\n", + "0 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", + "1 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", + "2 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 \n", + "3 ProtY YYYYYR YYYYYR 2 \n", + "4 ProtY YYYYYR YYYYYR 2 \n", + "5 ProtY YYYYYR YYYYYR 2 \n", + "6 ProtY YYYYYR YYYYYR 3 \n", + "7 ProtY YYYYYR YYYYYR 3 \n", + "8 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "9 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "10 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "11 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 \n", + "12 ProtT TTTTTTTR TTTTTTTR 2 \n", + "13 ProtT TTTTTTTR TTTTTTTR 2 \n", + "14 ProtT TTTTTTTR TTTTTTTR 2 \n", + "15 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", + "16 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 \n", + "17 ProtT TTR TTR 3 \n", + "18 ProtT TTR TTR 3 \n", + "19 ProtT TTR TTR 3 \n", + "20 Decoy_ProtT TTK TTK 3 
\n", + "21 Decoy_ProtT TTK TTK 3 \n", + "22 Decoy_ProtT TTK TTK 3 \n", "\n", " FragmentType FragmentSeriesNumber ProductCharge GeneName \\\n", "0 b 1 2 Y \n", @@ -537,31 +621,37 @@ "17 b 1 3 T \n", "18 y 2 3 T \n", "19 b 3 3 T \n", + "20 b 1 3 Decoy_T \n", + "21 y 2 3 Decoy_T \n", + "22 b 3 3 Decoy_T \n", "\n", - " LibraryDriftTime Annotation TransitionId \n", - "0 10 b1^2 YYYYYYYYYYYK2_b1^2 \n", - "1 10 y2^2 YYYYYYYYYYYK2_y2^2 \n", - "2 10 b3^2 YYYYYYYYYYYK2_b3^2 \n", - "3 20 b1^2 YYYYYR2_b1^2 \n", - "4 20 y2^2 YYYYYR2_y2^2 \n", - "5 20 b3^2 YYYYYR2_b3^2 \n", - "6 20 b1^2 YYYYYR3_b1^2 \n", - "7 20 y2^2 YYYYYR3_y2^2 \n", - "8 40 b1^2 GGGGGGGGGGR4_b1^2 \n", - "9 40 y2^2 GGGGGGGGGGR4_y2^2 \n", - "10 40 b3^2 GGGGGGGGGGR4_b3^2 \n", - "11 40 y4^2 GGGGGGGGGGR4_y4^2 \n", - "12 50 b1^2 TTTTTTTR2_b1^2 \n", - "13 50 y2^2 TTTTTTTR2_y2^2 \n", - "14 50 b3^2 TTTTTTTR2_b3^2 \n", - "15 60 b1^2 TTTTTTTTTTTTK2_b1^2 \n", - "16 60 y2^2 TTTTTTTTTTTTK2_y2^2 \n", - "17 70 b1^3 TTR3_b1^3 \n", - "18 70 y2^3 TTR3_y2^3 \n", - "19 70 b3^3 TTR3_b3^3 " + " LibraryDriftTime Decoy Annotation TransitionId \n", + "0 10 0 b1^2 YYYYYYYYYYYK2_b1^2 \n", + "1 10 0 y2^2 YYYYYYYYYYYK2_y2^2 \n", + "2 10 0 b3^2 YYYYYYYYYYYK2_b3^2 \n", + "3 20 0 b1^2 YYYYYR2_b1^2 \n", + "4 20 0 y2^2 YYYYYR2_y2^2 \n", + "5 20 0 b3^2 YYYYYR2_b3^2 \n", + "6 20 0 b1^2 YYYYYR3_b1^2 \n", + "7 20 0 y2^2 YYYYYR3_y2^2 \n", + "8 40 0 b1^2 GGGGGGGGGGR4_b1^2 \n", + "9 40 0 y2^2 GGGGGGGGGGR4_y2^2 \n", + "10 40 0 b3^2 GGGGGGGGGGR4_b3^2 \n", + "11 40 0 y4^2 GGGGGGGGGGR4_y4^2 \n", + "12 50 0 b1^2 TTTTTTTR2_b1^2 \n", + "13 50 0 y2^2 TTTTTTTR2_y2^2 \n", + "14 50 0 b3^2 TTTTTTTR2_b3^2 \n", + "15 60 0 b1^2 TTTTTTTTTTTTK2_b1^2 \n", + "16 60 0 y2^2 TTTTTTTTTTTTK2_y2^2 \n", + "17 70 0 b1^3 TTR3_b1^3 \n", + "18 70 0 y2^3 TTR3_y2^3 \n", + "19 70 0 b3^3 TTR3_b3^3 \n", + "20 80 1 b1^3 TTK3_b1^3 \n", + "21 80 1 y2^3 TTK3_y2^3 \n", + "22 80 1 b3^3 TTK3_b3^3 " ] }, - "execution_count": 3, + "execution_count": 66, "metadata": {}, "output_type": 
"execute_result" } @@ -574,12 +664,12 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 67, "id": "6a057baa-d129-4347-bd86-920770bc0a26", "metadata": {}, "outputs": [], "source": [ - "lib.to_csv(\"fakeLib_appended.tsv\", sep='\\t', index=False)" + "lib.to_csv(\"data/fakeLib_appended.tsv\", sep='\\t', index=False)" ] }, { @@ -618,18 +708,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 76, "id": "1afe24b1-cf28-44e0-84e4-f9194428e18f", "metadata": {}, "outputs": [], "source": [ - "conn = sqlite3.connect(\"fakeLib.pqp\")\n", + "conn = sqlite3.connect(\"data/fakeLib.pqp\")\n", "cur = conn.cursor()" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 77, "id": "2b50c3b4-e436-4edb-8a80-cc3bf67e73d3", "metadata": {}, "outputs": [], @@ -654,17 +744,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 78, "id": "584dc6c7-9c7a-4ad6-91c2-12a0fffa4c08", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 7, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -676,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 79, "id": "98401ee9-5153-477b-928a-1c66cdbb8e5d", "metadata": {}, "outputs": [], @@ -686,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 80, "id": "008b6e0f-28b8-4f22-89fe-54cdad6dca02", "metadata": { "tags": [] @@ -720,17 +810,17 @@ " \n", " \n", " 0\n", - " 6\n", + " 7\n", " 100.0\n", " \n", " \n", " 1\n", - " 4\n", + " 5\n", " 200.0\n", " \n", " \n", " 2\n", - " 5\n", + " 6\n", " 220.0\n", " \n", " \n", @@ -740,35 +830,41 @@ " \n", " \n", " 4\n", - " 2\n", + " 3\n", " 500.0\n", " \n", " \n", " 5\n", - " 3\n", + " 4\n", " 600.0\n", " \n", " \n", " 6\n", - " 1\n", + " 2\n", " 700.0\n", " \n", + " \n", + " 7\n", + " 1\n", + " 800.0\n", + " \n", " \n", "\n", "" ], "text/plain": [ " ID PRECURSOR_MZ\n", - "0 6 100.0\n", - "1 4 200.0\n", - "2 5 
220.0\n", + "0 7 100.0\n", + "1 5 200.0\n", + "2 6 220.0\n", "3 0 400.0\n", - "4 2 500.0\n", - "5 3 600.0\n", - "6 1 700.0" + "4 3 500.0\n", + "5 4 600.0\n", + "6 2 700.0\n", + "7 1 800.0" ] }, - "execution_count": 56, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -779,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 81, "id": "8f2b41fa-efb2-4f92-ad20-1737c16b3b8b", "metadata": {}, "outputs": [], @@ -789,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 82, "id": "4b6472b3-a603-492c-b6da-bca4ae0c1ae9", "metadata": {}, "outputs": [], @@ -799,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 83, "id": "23ea2e05-f030-41f4-aa98-1fc78da43303", "metadata": { "tags": [] @@ -829,13 +925,7 @@ " id\n", " precursor_id\n", " ID\n", - " run_id\n", - " exp_rt\n", - " exp_im\n", - " norm_rt\n", - " delta_rt\n", - " left_width\n", - " right_width\n", + " PRECURSOR_MZ\n", " \n", " \n", " \n", @@ -844,117 +934,66 @@ " 0\n", " 5\n", " 5\n", - " 1\n", - " 220.01\n", - " 220.01\n", - " 220\n", - " 0.01\n", - " 5\n", - " 5\n", + " 200.0\n", " \n", " \n", " 1\n", " 1\n", " 5\n", " 5\n", - " 1\n", - " 220.02\n", - " 220.02\n", - " 220\n", - " 0.01\n", - " 5\n", - " 5\n", + " 200.0\n", " \n", " \n", " 2\n", " 2\n", " 3\n", " 3\n", - " 1\n", - " 600.03\n", - " 600.03\n", - " 600\n", - " 0.01\n", - " 5\n", - " 5\n", + " 500.0\n", " \n", " \n", " 3\n", " 3\n", " 4\n", " 4\n", - " 1\n", - " 200.04\n", - " 200.04\n", - " 200\n", - " 0.01\n", - " 5\n", - " 5\n", + " 600.0\n", " \n", " \n", " 4\n", " 4\n", " 0\n", " 0\n", - " 1\n", - " 400.05\n", - " 400.05\n", - " 400\n", - " 0.01\n", - " 5\n", - " 5\n", + " 400.0\n", " \n", " \n", " 5\n", " 5\n", " 1\n", " 1\n", - " 1\n", - " 700.06\n", - " 700.06\n", - " 700\n", - " 0.01\n", - " 5\n", - " 5\n", + " 800.0\n", " \n", " \n", " 6\n", " 6\n", " 2\n", " 2\n", - " 1\n", - " 500.07\n", - " 500.07\n", - " 500\n", - " 
0.01\n", - " 5\n", - " 5\n", + " 700.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n", - "0 0 5 5 1 220.01 220.01 220 0.01 \n", - "1 1 5 5 1 220.02 220.02 220 0.01 \n", - "2 2 3 3 1 600.03 600.03 600 0.01 \n", - "3 3 4 4 1 200.04 200.04 200 0.01 \n", - "4 4 0 0 1 400.05 400.05 400 0.01 \n", - "5 5 1 1 1 700.06 700.06 700 0.01 \n", - "6 6 2 2 1 500.07 500.07 500 0.01 \n", - "\n", - " left_width right_width \n", - "0 5 5 \n", - "1 5 5 \n", - "2 5 5 \n", - "3 5 5 \n", - "4 5 5 \n", - "5 5 5 \n", - "6 5 5 " + " id precursor_id ID PRECURSOR_MZ\n", + "0 0 5 5 200.0\n", + "1 1 5 5 200.0\n", + "2 2 3 3 500.0\n", + "3 3 4 4 600.0\n", + "4 4 0 0 400.0\n", + "5 5 1 1 800.0\n", + "6 6 2 2 700.0" ] }, - "execution_count": 55, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -965,7 +1004,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 84, "id": "a5473d3c-793d-4d9d-b722-4aab486e0fa5", "metadata": {}, "outputs": [ @@ -997,17 +1036,17 @@ " \n", " \n", " 0\n", - " 6\n", + " 7\n", " 100.0\n", " \n", " \n", " 1\n", - " 4\n", + " 5\n", " 200.0\n", " \n", " \n", " 2\n", - " 5\n", + " 6\n", " 220.0\n", " \n", " \n", @@ -1017,35 +1056,41 @@ " \n", " \n", " 4\n", - " 2\n", + " 3\n", " 500.0\n", " \n", " \n", " 5\n", - " 3\n", + " 4\n", " 600.0\n", " \n", " \n", " 6\n", - " 1\n", + " 2\n", " 700.0\n", " \n", + " \n", + " 7\n", + " 1\n", + " 800.0\n", + " \n", " \n", "\n", "" ], "text/plain": [ " ID PRECURSOR_MZ\n", - "0 6 100.0\n", - "1 4 200.0\n", - "2 5 220.0\n", + "0 7 100.0\n", + "1 5 200.0\n", + "2 6 220.0\n", "3 0 400.0\n", - "4 2 500.0\n", - "5 3 600.0\n", - "6 1 700.0" + "4 3 500.0\n", + "5 4 600.0\n", + "6 2 700.0\n", + "7 1 800.0" ] }, - "execution_count": 11, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -1056,7 +1101,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 85, "id": 
"a251b758-df07-4d44-ab84-f8f9dd6911af", "metadata": {}, "outputs": [], @@ -1073,7 +1118,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 86, "id": "feeb09d3-17c4-4dd3-9325-5af12d22842b", "metadata": {}, "outputs": [], @@ -1083,7 +1128,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 87, "id": "c9c07727-5351-4501-8f1f-23df31b0a4e9", "metadata": {}, "outputs": [ @@ -1127,9 +1172,9 @@ " 5\n", " 5\n", " 1\n", - " 220.01\n", - " 220.01\n", - " 220\n", + " 200.01\n", + " 200.01\n", + " 200\n", " 0.01\n", " 5\n", " 5\n", @@ -1140,9 +1185,9 @@ " 5\n", " 5\n", " 1\n", - " 220.02\n", - " 220.02\n", - " 220\n", + " 200.02\n", + " 200.02\n", + " 200\n", " 0.01\n", " 5\n", " 5\n", @@ -1153,9 +1198,9 @@ " 3\n", " 3\n", " 1\n", - " 600.03\n", - " 600.03\n", - " 600\n", + " 500.03\n", + " 500.03\n", + " 500\n", " 0.01\n", " 5\n", " 5\n", @@ -1166,9 +1211,9 @@ " 4\n", " 4\n", " 1\n", - " 200.04\n", - " 200.04\n", - " 200\n", + " 600.04\n", + " 600.04\n", + " 600\n", " 0.01\n", " 5\n", " 5\n", @@ -1192,9 +1237,9 @@ " 1\n", " 1\n", " 1\n", - " 700.06\n", - " 700.06\n", - " 700\n", + " 800.06\n", + " 800.06\n", + " 800\n", " 0.01\n", " 5\n", " 5\n", @@ -1205,9 +1250,9 @@ " 2\n", " 2\n", " 1\n", - " 500.07\n", - " 500.07\n", - " 500\n", + " 700.07\n", + " 700.07\n", + " 700\n", " 0.01\n", " 5\n", " 5\n", @@ -1218,13 +1263,13 @@ ], "text/plain": [ " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n", - "0 0 5 5 1 220.01 220.01 220 0.01 \n", - "1 1 5 5 1 220.02 220.02 220 0.01 \n", - "2 2 3 3 1 600.03 600.03 600 0.01 \n", - "3 3 4 4 1 200.04 200.04 200 0.01 \n", + "0 0 5 5 1 200.01 200.01 200 0.01 \n", + "1 1 5 5 1 200.02 200.02 200 0.01 \n", + "2 2 3 3 1 500.03 500.03 500 0.01 \n", + "3 3 4 4 1 600.04 600.04 600 0.01 \n", "4 4 0 0 1 400.05 400.05 400 0.01 \n", - "5 5 1 1 1 700.06 700.06 700 0.01 \n", - "6 6 2 2 1 500.07 500.07 500 0.01 \n", + "5 5 1 1 1 800.06 800.06 800 0.01 \n", + "6 6 2 2 1 700.07 700.07 700 0.01 
\n", "\n", " left_width right_width \n", "0 5 5 \n", @@ -1236,7 +1281,7 @@ "6 5 5 " ] }, - "execution_count": 14, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -1247,7 +1292,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 88, "id": "9b2a1cfc-dcca-45f3-9aa4-e7f75382de3c", "metadata": {}, "outputs": [], @@ -1262,7 +1307,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 89, "id": "4806f254-1c29-4aa5-9822-6cb8c6ea730c", "metadata": {}, "outputs": [], @@ -1288,17 +1333,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 90, "id": "8d03f3ca-a4b0-48e9-bd7f-3c1c7b8d6874", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 17, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -1310,7 +1355,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 91, "id": "aae782e1-c9f9-4c2c-84f7-a7741e385a5e", "metadata": {}, "outputs": [ @@ -1359,9 +1404,9 @@ " \n", " 0\n", " 0\n", - " 220010.0\n", + " 200010.0\n", " 5\n", - " 220.01\n", + " 200.01\n", " 0.01\n", " 1\n", " 1\n", @@ -1380,9 +1425,9 @@ " \n", " 1\n", " 1\n", - " 220020.0\n", + " 200020.0\n", " 5\n", - " 220.02\n", + " 200.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1401,9 +1446,9 @@ " \n", " 2\n", " 2\n", - " 600030.0\n", + " 500030.0\n", " 3\n", - " 600.03\n", + " 500.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1422,9 +1467,9 @@ " \n", " 3\n", " 3\n", - " 200040.0\n", + " 600040.0\n", " 4\n", - " 200.04\n", + " 600.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1464,9 +1509,9 @@ " \n", " 5\n", " 5\n", - " 700060.0\n", + " 800060.0\n", " 1\n", - " 700.06\n", + " 800.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1485,9 +1530,9 @@ " \n", " 6\n", " 6\n", - " 500070.0\n", + " 700070.0\n", " 2\n", - " 500.07\n", + " 700.07\n", " 0.01\n", " 1\n", " 1\n", @@ -1509,13 +1554,13 @@ ], "text/plain": [ " feature_id area_intensity apex_intensity exp_im delta_im 
\\\n", - "0 0 220010.0 5 220.01 0.01 \n", - "1 1 220020.0 5 220.02 0.01 \n", - "2 2 600030.0 3 600.03 0.01 \n", - "3 3 200040.0 4 200.04 0.01 \n", + "0 0 200010.0 5 200.01 0.01 \n", + "1 1 200020.0 5 200.02 0.01 \n", + "2 2 500030.0 3 500.03 0.01 \n", + "3 3 600040.0 4 600.04 0.01 \n", "4 4 400050.0 0 400.05 0.01 \n", - "5 5 700060.0 1 700.06 0.01 \n", - "6 6 500070.0 2 500.07 0.01 \n", + "5 5 800060.0 1 800.06 0.01 \n", + "6 6 700070.0 2 700.07 0.01 \n", "\n", " var_massdev_score var_mi_score var_mi_contrast_score \\\n", "0 1 1 1 \n", @@ -1563,7 +1608,7 @@ "6 1 1 1 " ] }, - "execution_count": 18, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } @@ -1594,7 +1639,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 92, "id": "8480c6a7-23ab-4a22-9da9-998cbc8606ac", "metadata": {}, "outputs": [], @@ -1609,7 +1654,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 93, "id": "98ff8eac-5acd-435a-b292-fe64d590ea51", "metadata": {}, "outputs": [], @@ -1635,17 +1680,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 94, "id": "7691c149-7f67-48de-9bff-2856a44d40eb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 21, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -1657,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 95, "id": "863fcd87-d051-4bc9-b88c-8535cbc90c4a", "metadata": { "scrolled": true, @@ -1719,7 +1764,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 96, "id": "30f29e9b-b345-4a02-ba63-74dabf69555b", "metadata": {}, "outputs": [ @@ -1771,10 +1816,10 @@ " \n", " 0\n", " 0\n", - " 220010.0\n", - " 220010.0\n", + " 200010.0\n", + " 200010.0\n", " 5\n", - " 220.01\n", + " 200.01\n", " 0.01\n", " 1\n", " 1\n", @@ -1795,10 +1840,10 @@ " \n", " 1\n", " 1\n", - " 220020.0\n", - " 220020.0\n", + " 200020.0\n", + " 200020.0\n", " 5\n", 
- " 220.02\n", + " 200.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1819,10 +1864,10 @@ " \n", " 2\n", " 2\n", - " 600030.0\n", - " 600030.0\n", + " 500030.0\n", + " 500030.0\n", " 3\n", - " 600.03\n", + " 500.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1843,10 +1888,10 @@ " \n", " 3\n", " 3\n", - " 200040.0\n", - " 200040.0\n", + " 600040.0\n", + " 600040.0\n", " 4\n", - " 200.04\n", + " 600.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1891,10 +1936,10 @@ " \n", " 5\n", " 5\n", - " 700060.0\n", - " 700060.0\n", + " 800060.0\n", + " 800060.0\n", " 1\n", - " 700.06\n", + " 800.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1915,10 +1960,10 @@ " \n", " 6\n", " 6\n", - " 500070.0\n", - " 500070.0\n", + " 700070.0\n", + " 700070.0\n", " 2\n", - " 500.07\n", + " 700.07\n", " 0.01\n", " 1\n", " 1\n", @@ -1943,13 +1988,13 @@ ], "text/plain": [ " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY EXP_IM \\\n", - "0 0 220010.0 220010.0 5 220.01 \n", - "1 1 220020.0 220020.0 5 220.02 \n", - "2 2 600030.0 600030.0 3 600.03 \n", - "3 3 200040.0 200040.0 4 200.04 \n", + "0 0 200010.0 200010.0 5 200.01 \n", + "1 1 200020.0 200020.0 5 200.02 \n", + "2 2 500030.0 500030.0 3 500.03 \n", + "3 3 600040.0 600040.0 4 600.04 \n", "4 4 400050.0 400050.0 0 400.05 \n", - "5 5 700060.0 700060.0 1 700.06 \n", - "6 6 500070.0 500070.0 2 500.07 \n", + "5 5 800060.0 800060.0 1 800.06 \n", + "6 6 700070.0 700070.0 2 700.07 \n", "\n", " DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n", "0 0.01 1 1 1 \n", @@ -1990,7 +2035,7 @@ "[7 rows x 41 columns]" ] }, - "execution_count": 23, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -2042,7 +2087,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 97, "id": "d93163c0-20a1-4d98-86de-71c6d265d418", "metadata": {}, "outputs": [], @@ -2057,7 +2102,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 98, "id": "db1cbc3f-e463-43a6-895a-979c7aafe393", "metadata": {}, "outputs": [], @@ 
-2083,17 +2128,17 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 99, "id": "bca8bfba-86e9-497c-ae94-4c0f679b45f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 26, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -2105,7 +2150,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 100, "id": "4678179b-aea7-460f-ad71-d157a1e3ce38", "metadata": {}, "outputs": [], @@ -2115,7 +2160,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 101, "id": "231cf0c6-01ac-4061-b1a9-d68404f793b3", "metadata": {}, "outputs": [], @@ -2125,7 +2170,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 102, "id": "7f730c1c-73a3-4ce1-a2b3-35b064edd558", "metadata": {}, "outputs": [], @@ -2136,7 +2181,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 103, "id": "46a4405d-c5c9-48a6-aee5-abb2a3b9b047", "metadata": {}, "outputs": [ @@ -2174,85 +2219,103 @@ " 0\n", " 0\n", " 5\n", - " 6\n", + " 3\n", " 5\n", - " YYYYYR3_b1^2\n", - " 221.0\n", + " YYYYYR2_b1^2\n", + " 201.0\n", " \n", " \n", " 1\n", - " 1\n", + " 0\n", " 5\n", - " 6\n", + " 4\n", " 5\n", - " YYYYYR3_b1^2\n", - " 221.0\n", + " YYYYYR2_y2^2\n", + " 202.0\n", " \n", " \n", " 2\n", " 0\n", " 5\n", - " 7\n", " 5\n", - " YYYYYR3_y2^2\n", - " 222.0\n", + " 5\n", + " YYYYYR2_b3^2\n", + " 203.0\n", " \n", " \n", " 3\n", " 1\n", " 5\n", - " 7\n", + " 3\n", " 5\n", - " YYYYYR3_y2^2\n", - " 222.0\n", + " YYYYYR2_b1^2\n", + " 201.0\n", " \n", " \n", " 4\n", + " 1\n", + " 5\n", + " 4\n", + " 5\n", + " YYYYYR2_y2^2\n", + " 202.0\n", + " \n", + " \n", + " 5\n", + " 1\n", + " 5\n", + " 5\n", + " 5\n", + " YYYYYR2_b3^2\n", + " 203.0\n", + " \n", + " \n", + " 6\n", " 2\n", " 3\n", - " 15\n", + " 12\n", " 3\n", - " TTTTTTTTTTTTK2_b1^2\n", - " 601.0\n", + " TTTTTTTR2_b1^2\n", + " 501.0\n", " \n", " \n", - " 5\n", + " 7\n", " 2\n", " 3\n", - " 16\n", + 
" 13\n", " 3\n", - " TTTTTTTTTTTTK2_y2^2\n", - " 602.0\n", + " TTTTTTTR2_y2^2\n", + " 502.0\n", " \n", " \n", - " 6\n", + " 8\n", + " 2\n", " 3\n", - " 4\n", + " 14\n", " 3\n", - " 4\n", - " YYYYYR2_b1^2\n", - " 201.0\n", + " TTTTTTTR2_b3^2\n", + " 503.0\n", " \n", " \n", - " 7\n", + " 9\n", " 3\n", " 4\n", + " 15\n", " 4\n", - " 4\n", - " YYYYYR2_y2^2\n", - " 202.0\n", + " TTTTTTTTTTTTK2_b1^2\n", + " 601.0\n", " \n", " \n", - " 8\n", + " 10\n", " 3\n", " 4\n", - " 5\n", + " 16\n", " 4\n", - " YYYYYR2_b3^2\n", - " 203.0\n", + " TTTTTTTTTTTTK2_y2^2\n", + " 602.0\n", " \n", " \n", - " 9\n", + " 11\n", " 4\n", " 0\n", " 8\n", @@ -2261,7 +2324,7 @@ " 401.0\n", " \n", " \n", - " 10\n", + " 12\n", " 4\n", " 0\n", " 9\n", @@ -2270,7 +2333,7 @@ " 402.0\n", " \n", " \n", - " 11\n", + " 13\n", " 4\n", " 0\n", " 10\n", @@ -2279,7 +2342,7 @@ " 403.0\n", " \n", " \n", - " 12\n", + " 14\n", " 4\n", " 0\n", " 11\n", @@ -2288,58 +2351,58 @@ " 404.0\n", " \n", " \n", - " 13\n", + " 15\n", " 5\n", " 1\n", - " 17\n", + " 20\n", " 1\n", - " TTR3_b1^3\n", - " 701.0\n", + " TTK3_b1^3\n", + " 801.0\n", " \n", " \n", - " 14\n", + " 16\n", " 5\n", " 1\n", - " 18\n", + " 21\n", " 1\n", - " TTR3_y2^3\n", - " 702.0\n", + " TTK3_y2^3\n", + " 802.0\n", " \n", " \n", - " 15\n", + " 17\n", " 5\n", " 1\n", - " 19\n", + " 22\n", " 1\n", - " TTR3_b3^3\n", - " 703.0\n", + " TTK3_b3^3\n", + " 803.0\n", " \n", " \n", - " 16\n", + " 18\n", " 6\n", " 2\n", - " 12\n", + " 17\n", " 2\n", - " TTTTTTTR2_b1^2\n", - " 501.0\n", + " TTR3_b1^3\n", + " 701.0\n", " \n", " \n", - " 17\n", + " 19\n", " 6\n", " 2\n", - " 13\n", + " 18\n", " 2\n", - " TTTTTTTR2_y2^2\n", - " 502.0\n", + " TTR3_y2^3\n", + " 702.0\n", " \n", " \n", - " 18\n", + " 20\n", " 6\n", " 2\n", - " 14\n", + " 19\n", " 2\n", - " TTTTTTTR2_b3^2\n", - " 503.0\n", + " TTR3_b3^3\n", + " 703.0\n", " \n", " \n", "\n", @@ -2347,49 +2410,53 @@ ], "text/plain": [ " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n", - "0 0 5 6 5 \n", - "1 1 5 6 5 
\n", - "2 0 5 7 5 \n", - "3 1 5 7 5 \n", - "4 2 3 15 3 \n", - "5 2 3 16 3 \n", - "6 3 4 3 4 \n", - "7 3 4 4 4 \n", - "8 3 4 5 4 \n", - "9 4 0 8 0 \n", - "10 4 0 9 0 \n", - "11 4 0 10 0 \n", - "12 4 0 11 0 \n", - "13 5 1 17 1 \n", - "14 5 1 18 1 \n", - "15 5 1 19 1 \n", - "16 6 2 12 2 \n", - "17 6 2 13 2 \n", - "18 6 2 14 2 \n", + "0 0 5 3 5 \n", + "1 0 5 4 5 \n", + "2 0 5 5 5 \n", + "3 1 5 3 5 \n", + "4 1 5 4 5 \n", + "5 1 5 5 5 \n", + "6 2 3 12 3 \n", + "7 2 3 13 3 \n", + "8 2 3 14 3 \n", + "9 3 4 15 4 \n", + "10 3 4 16 4 \n", + "11 4 0 8 0 \n", + "12 4 0 9 0 \n", + "13 4 0 10 0 \n", + "14 4 0 11 0 \n", + "15 5 1 20 1 \n", + "16 5 1 21 1 \n", + "17 5 1 22 1 \n", + "18 6 2 17 2 \n", + "19 6 2 18 2 \n", + "20 6 2 19 2 \n", "\n", " TRAML_ID PRODUCT_MZ \n", - "0 YYYYYR3_b1^2 221.0 \n", - "1 YYYYYR3_b1^2 221.0 \n", - "2 YYYYYR3_y2^2 222.0 \n", - "3 YYYYYR3_y2^2 222.0 \n", - "4 TTTTTTTTTTTTK2_b1^2 601.0 \n", - "5 TTTTTTTTTTTTK2_y2^2 602.0 \n", - "6 YYYYYR2_b1^2 201.0 \n", - "7 YYYYYR2_y2^2 202.0 \n", - "8 YYYYYR2_b3^2 203.0 \n", - "9 GGGGGGGGGGR4_b1^2 401.0 \n", - "10 GGGGGGGGGGR4_y2^2 402.0 \n", - "11 GGGGGGGGGGR4_b3^2 403.0 \n", - "12 GGGGGGGGGGR4_y4^2 404.0 \n", - "13 TTR3_b1^3 701.0 \n", - "14 TTR3_y2^3 702.0 \n", - "15 TTR3_b3^3 703.0 \n", - "16 TTTTTTTR2_b1^2 501.0 \n", - "17 TTTTTTTR2_y2^2 502.0 \n", - "18 TTTTTTTR2_b3^2 503.0 " + "0 YYYYYR2_b1^2 201.0 \n", + "1 YYYYYR2_y2^2 202.0 \n", + "2 YYYYYR2_b3^2 203.0 \n", + "3 YYYYYR2_b1^2 201.0 \n", + "4 YYYYYR2_y2^2 202.0 \n", + "5 YYYYYR2_b3^2 203.0 \n", + "6 TTTTTTTR2_b1^2 501.0 \n", + "7 TTTTTTTR2_y2^2 502.0 \n", + "8 TTTTTTTR2_b3^2 503.0 \n", + "9 TTTTTTTTTTTTK2_b1^2 601.0 \n", + "10 TTTTTTTTTTTTK2_y2^2 602.0 \n", + "11 GGGGGGGGGGR4_b1^2 401.0 \n", + "12 GGGGGGGGGGR4_y2^2 402.0 \n", + "13 GGGGGGGGGGR4_b3^2 403.0 \n", + "14 GGGGGGGGGGR4_y4^2 404.0 \n", + "15 TTK3_b1^3 801.0 \n", + "16 TTK3_y2^3 802.0 \n", + "17 TTK3_b3^3 803.0 \n", + "18 TTR3_b1^3 701.0 \n", + "19 TTR3_y2^3 702.0 \n", + "20 TTR3_b3^3 703.0 " ] }, - 
"execution_count": 30, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -2400,545 +2467,24 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 104, "id": "cf55c2d6-fa9f-433a-bb8f-931984f48bbe", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
feature_idTRANSITION_IDAREA_INTENSITYTOTAL_AREA_INTENSITYAPEX_INTENSITYTOTAL_MIVAR_INTENSITY_SCOREVAR_INTENSITY_RATIO_SCOREVAR_LOG_INTENSITYVAR_XCORR_COELUTIONVAR_XCORR_SHAPEVAR_LOG_SN_SCOREVAR_MASSDEV_SCOREVAR_MI_SCOREVAR_MI_RATIO_SCOREVAR_ISOTOPE_CORRELATION_SCOREVAR_ISOTOPE_OVERLAP_SCORE
006221.0221.01111111111111
116442.0442.01111111111111
207222.0222.01111111111111
317444.0444.01111111111111
42151803.01803.01111111111111
52161806.01806.01111111111111
633804.0804.01111111111111
734808.0808.01111111111111
835812.0812.01111111111111
9482005.02005.01111111111111
10492010.02010.01111111111111
114102015.02015.01111111111111
124112020.02020.01111111111111
135174206.04206.01111111111111
145184212.04212.01111111111111
155194218.04218.01111111111111
166123507.03507.01111111111111
176133514.03514.01111111111111
186143521.03521.01111111111111
\n", - "
" - ], - "text/plain": [ - " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n", - "0 0 6 221.0 221.0 \n", - "1 1 6 442.0 442.0 \n", - "2 0 7 222.0 222.0 \n", - "3 1 7 444.0 444.0 \n", - "4 2 15 1803.0 1803.0 \n", - "5 2 16 1806.0 1806.0 \n", - "6 3 3 804.0 804.0 \n", - "7 3 4 808.0 808.0 \n", - "8 3 5 812.0 812.0 \n", - "9 4 8 2005.0 2005.0 \n", - "10 4 9 2010.0 2010.0 \n", - "11 4 10 2015.0 2015.0 \n", - "12 4 11 2020.0 2020.0 \n", - "13 5 17 4206.0 4206.0 \n", - "14 5 18 4212.0 4212.0 \n", - "15 5 19 4218.0 4218.0 \n", - "16 6 12 3507.0 3507.0 \n", - "17 6 13 3514.0 3514.0 \n", - "18 6 14 3521.0 3521.0 \n", - "\n", - " APEX_INTENSITY TOTAL_MI VAR_INTENSITY_SCORE VAR_INTENSITY_RATIO_SCORE \\\n", - "0 1 1 1 1 \n", - "1 1 1 1 1 \n", - "2 1 1 1 1 \n", - "3 1 1 1 1 \n", - "4 1 1 1 1 \n", - "5 1 1 1 1 \n", - "6 1 1 1 1 \n", - "7 1 1 1 1 \n", - "8 1 1 1 1 \n", - "9 1 1 1 1 \n", - "10 1 1 1 1 \n", - "11 1 1 1 1 \n", - "12 1 1 1 1 \n", - "13 1 1 1 1 \n", - "14 1 1 1 1 \n", - "15 1 1 1 1 \n", - "16 1 1 1 1 \n", - "17 1 1 1 1 \n", - "18 1 1 1 1 \n", - "\n", - " VAR_LOG_INTENSITY VAR_XCORR_COELUTION VAR_XCORR_SHAPE VAR_LOG_SN_SCORE \\\n", - "0 1 1 1 1 \n", - "1 1 1 1 1 \n", - "2 1 1 1 1 \n", - "3 1 1 1 1 \n", - "4 1 1 1 1 \n", - "5 1 1 1 1 \n", - "6 1 1 1 1 \n", - "7 1 1 1 1 \n", - "8 1 1 1 1 \n", - "9 1 1 1 1 \n", - "10 1 1 1 1 \n", - "11 1 1 1 1 \n", - "12 1 1 1 1 \n", - "13 1 1 1 1 \n", - "14 1 1 1 1 \n", - "15 1 1 1 1 \n", - "16 1 1 1 1 \n", - "17 1 1 1 1 \n", - "18 1 1 1 1 \n", - "\n", - " VAR_MASSDEV_SCORE VAR_MI_SCORE VAR_MI_RATIO_SCORE \\\n", - "0 1 1 1 \n", - "1 1 1 1 \n", - "2 1 1 1 \n", - "3 1 1 1 \n", - "4 1 1 1 \n", - "5 1 1 1 \n", - "6 1 1 1 \n", - "7 1 1 1 \n", - "8 1 1 1 \n", - "9 1 1 1 \n", - "10 1 1 1 \n", - "11 1 1 1 \n", - "12 1 1 1 \n", - "13 1 1 1 \n", - "14 1 1 1 \n", - "15 1 1 1 \n", - "16 1 1 1 \n", - "17 1 1 1 \n", - "18 1 1 1 \n", - "\n", - " VAR_ISOTOPE_CORRELATION_SCORE VAR_ISOTOPE_OVERLAP_SCORE \n", - "0 1 1 \n", - "1 1 1 \n", - 
"2 1 1 \n", - "3 1 1 \n", - "4 1 1 \n", - "5 1 1 \n", - "6 1 1 \n", - "7 1 1 \n", - "8 1 1 \n", - "9 1 1 \n", - "10 1 1 \n", - "11 1 1 \n", - "12 1 1 \n", - "13 1 1 \n", - "14 1 1 \n", - "15 1 1 \n", - "16 1 1 \n", - "17 1 1 \n", - "18 1 1 " - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" + "ename": "ValueError", + "evalue": "Length of values (19) does not match length of index (21)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[104], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAREA_INTENSITY\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPRODUCT_MZ\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m*\u001b[39m (feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfeature_id\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m#should be equal to product_mz * (feature_id + 1)\u001b[39;00m\n\u001b[1;32m 2\u001b[0m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTOTAL_AREA_INTENSITY\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAREA_INTENSITY\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 3\u001b[0m \u001b[43mfeature_transition\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mAPEX_INTENSITY\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m19\u001b[39m\n\u001b[1;32m 4\u001b[0m 
feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTOTAL_MI\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m19\u001b[39m\n\u001b[1;32m 5\u001b[0m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVAR_INTENSITY_SCORE\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m19\u001b[39m\n", + "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/frame.py:4311\u001b[0m, in \u001b[0;36mDataFrame.__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 4308\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_setitem_array([key], value)\n\u001b[1;32m 4309\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 4310\u001b[0m \u001b[38;5;66;03m# set column\u001b[39;00m\n\u001b[0;32m-> 4311\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_set_item\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/frame.py:4524\u001b[0m, in \u001b[0;36mDataFrame._set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 4514\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_set_item\u001b[39m(\u001b[38;5;28mself\u001b[39m, key, value) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 4515\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 4516\u001b[0m \u001b[38;5;124;03m Add series to DataFrame in specified column.\u001b[39;00m\n\u001b[1;32m 4517\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4522\u001b[0m \u001b[38;5;124;03m ensure homogeneity.\u001b[39;00m\n\u001b[1;32m 4523\u001b[0m 
\u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 4524\u001b[0m value, refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sanitize_column\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4526\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 4527\u001b[0m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 4528\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m value\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 4529\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value\u001b[38;5;241m.\u001b[39mdtype, ExtensionDtype)\n\u001b[1;32m 4530\u001b[0m ):\n\u001b[1;32m 4531\u001b[0m \u001b[38;5;66;03m# broadcast across multiple columns if necessary\u001b[39;00m\n\u001b[1;32m 4532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mis_unique \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns, MultiIndex):\n", + "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/frame.py:5266\u001b[0m, in \u001b[0;36mDataFrame._sanitize_column\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 5263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _reindex_for_setitem(value, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex)\n\u001b[1;32m 5265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_list_like(value):\n\u001b[0;32m-> 5266\u001b[0m \u001b[43mcom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequire_length_match\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5267\u001b[0m arr \u001b[38;5;241m=\u001b[39m sanitize_array(value, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, allow_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 5268\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 5269\u001b[0m \u001b[38;5;28misinstance\u001b[39m(value, Index)\n\u001b[1;32m 5270\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m value\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobject\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 5273\u001b[0m \u001b[38;5;66;03m# TODO: Remove kludge in sanitize_array for string mode when enforcing\u001b[39;00m\n\u001b[1;32m 5274\u001b[0m \u001b[38;5;66;03m# this deprecation\u001b[39;00m\n", + "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/common.py:573\u001b[0m, in \u001b[0;36mrequire_length_match\u001b[0;34m(data, index)\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 570\u001b[0m \u001b[38;5;124;03mCheck the length of data matches the length of the index.\u001b[39;00m\n\u001b[1;32m 571\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 572\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(index):\n\u001b[0;32m--> 573\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 574\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLength of values \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 575\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 576\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not match length of index \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 577\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(index)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 578\u001b[0m )\n", + "\u001b[0;31mValueError\u001b[0m: Length of values (19) does not match length of index (21)" + ] } ], "source": [ @@ -2965,7 +2511,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "588ee10a-7b00-4f61-b305-2a392b5bbd1b", "metadata": {}, "outputs": [], @@ -2982,7 +2528,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "78f13b81-7db1-46ee-947b-723e8b0b340b", "metadata": {}, "outputs": [], @@ -3008,7 +2554,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "8c143ee6-928c-4404-85cc-5fc7b9b1ce85", "metadata": {}, "outputs": [], @@ -3036,17 +2582,17 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 105, "id": "9a24a430-d994-4012-9ac3-37c792e30026", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 35, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -3058,7 +2604,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 106, "id": "563c3e64-e528-457c-ba42-f00fabcff0f0", "metadata": { "tags": [] @@ -3084,7 +2630,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 107, "id": "c8539aa2-f946-4be8-a891-0b55d6957322", "metadata": {}, "outputs": [ @@ 
-3121,7 +2667,7 @@ " \n", " 0\n", " 0\n", - " 1320\n", + " 1200\n", " 1\n", " 1\n", " 1\n", @@ -3130,7 +2676,7 @@ " \n", " 1\n", " 1\n", - " 2640\n", + " 2400\n", " 1\n", " 1\n", " 1\n", @@ -3139,7 +2685,7 @@ " \n", " 2\n", " 2\n", - " 7200\n", + " 6000\n", " 1\n", " 1\n", " 1\n", @@ -3148,7 +2694,7 @@ " \n", " 3\n", " 3\n", - " 4000\n", + " 12000\n", " 1\n", " 1\n", " 1\n", @@ -3166,7 +2712,7 @@ " \n", " 5\n", " 5\n", - " 8400\n", + " 9600\n", " 1\n", " 1\n", " 1\n", @@ -3175,7 +2721,7 @@ " \n", " 6\n", " 6\n", - " 10500\n", + " 14700\n", " 1\n", " 1\n", " 1\n", @@ -3187,16 +2733,16 @@ ], "text/plain": [ " feature_id SCORE RANK PVALUE QVALUE PEP\n", - "0 0 1320 1 1 1 1\n", - "1 1 2640 1 1 1 1\n", - "2 2 7200 1 1 1 1\n", - "3 3 4000 1 1 1 1\n", + "0 0 1200 1 1 1 1\n", + "1 1 2400 1 1 1 1\n", + "2 2 6000 1 1 1 1\n", + "3 3 12000 1 1 1 1\n", "4 4 2000 1 1 1 1\n", - "5 5 8400 1 1 1 1\n", - "6 6 10500 1 1 1 1" + "5 5 9600 1 1 1 1\n", + "6 6 14700 1 1 1 1" ] }, - "execution_count": 37, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -3213,7 +2759,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 108, "id": "4cbda7cf-0535-4292-bfed-739a5f1bd2b8", "metadata": {}, "outputs": [], @@ -3228,7 +2774,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 109, "id": "adb88443-6d34-4173-8b37-9f52dba9f5e7", "metadata": {}, "outputs": [], @@ -3254,17 +2800,17 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 110, "id": "e0094b3a-5a80-48e4-8041-a537ce409480", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 40, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -3276,7 +2822,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 111, "id": "acf865f3-3353-4baa-b83e-91be2abed776", "metadata": { "tags": [] @@ -3303,7 +2849,7 @@ }, { "cell_type": "code", - "execution_count": 42, + 
"execution_count": 112, "id": "f95142b9-612b-43a8-bb42-356b71839ea6", "metadata": {}, "outputs": [], @@ -3313,7 +2859,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 113, "id": "43828588-c1ff-4943-a7b7-24a968562c4e", "metadata": {}, "outputs": [ @@ -3347,41 +2893,48 @@ " \n", " \n", " 0\n", - " 3\n", + " 4\n", " TTTTTTTTTTTTK\n", " TTTTTTTTTTTTK\n", " 0\n", " \n", " \n", " 1\n", - " 2\n", + " 3\n", " TTTTTTTR\n", " TTTTTTTR\n", " 0\n", " \n", " \n", " 2\n", - " 4\n", + " 5\n", " YYYYYR\n", " YYYYYR\n", " 0\n", " \n", " \n", " 3\n", - " 1\n", + " 2\n", " TTR\n", " TTR\n", " 0\n", " \n", " \n", " 4\n", - " 5\n", + " 1\n", + " TTK\n", + " TTK\n", + " 1\n", + " \n", + " \n", + " 5\n", + " 6\n", " YYYYYYYYYYYK\n", " YYYYYYYYYYYK\n", " 0\n", " \n", " \n", - " 5\n", + " 6\n", " 0\n", " GGGGGGGGGGR\n", " GGGGGGGGGGR\n", @@ -3393,15 +2946,16 @@ ], "text/plain": [ " ID UNMODIFIED_SEQUENCE MODIFIED_SEQUENCE DECOY\n", - "0 3 TTTTTTTTTTTTK TTTTTTTTTTTTK 0\n", - "1 2 TTTTTTTR TTTTTTTR 0\n", - "2 4 YYYYYR YYYYYR 0\n", - "3 1 TTR TTR 0\n", - "4 5 YYYYYYYYYYYK YYYYYYYYYYYK 0\n", - "5 0 GGGGGGGGGGR GGGGGGGGGGR 0" + "0 4 TTTTTTTTTTTTK TTTTTTTTTTTTK 0\n", + "1 3 TTTTTTTR TTTTTTTR 0\n", + "2 5 YYYYYR YYYYYR 0\n", + "3 2 TTR TTR 0\n", + "4 1 TTK TTK 1\n", + "5 6 YYYYYYYYYYYK YYYYYYYYYYYK 0\n", + "6 0 GGGGGGGGGGR GGGGGGGGGGR 0" ] }, - "execution_count": 43, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -3412,7 +2966,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 114, "id": "f416e48b-6bb6-4cb7-8d81-597cfd52320c", "metadata": { "tags": [] @@ -3425,7 +2979,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 115, "id": "0e1eb1b9-730e-45d4-9618-fd532c1ccc25", "metadata": {}, "outputs": [], @@ -3441,7 +2995,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 116, "id": "e9692e06-ddf2-4f74-bb80-f2a92728767b", "metadata": {}, "outputs": [], @@ -3456,7 
+3010,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 117, "id": "f720c22b-e6fa-4ac0-8402-bdcd2e74840b", "metadata": {}, "outputs": [], @@ -3482,17 +3036,17 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 118, "id": "94c860e0-880a-4091-afb8-af368ed72b26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 48, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } @@ -3504,7 +3058,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 119, "id": "1c053178-b8a5-44ad-876b-49b5fd8afa23", "metadata": { "tags": [] @@ -3531,7 +3085,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 120, "id": "d70ba894-55bf-4a36-b306-45b7c5e9d1bd", "metadata": {}, "outputs": [], @@ -3541,7 +3095,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 121, "id": "3e1bbbeb-7cc9-4b9f-b898-5148abff911d", "metadata": {}, "outputs": [ @@ -3574,34 +3128,41 @@ " \n", " \n", " 0\n", - " 2\n", + " 3\n", " ProtY\n", " 0\n", " \n", " \n", " 1\n", - " 1\n", + " 2\n", " ProtT\n", " 0\n", " \n", " \n", " 2\n", - " 0\n", + " 1\n", " ProtG\n", " 0\n", " \n", + " \n", + " 3\n", + " 0\n", + " Decoy_ProtT\n", + " 1\n", + " \n", " \n", "\n", "" ], "text/plain": [ " ID PROTEIN_ACCESSION DECOY\n", - "0 2 ProtY 0\n", - "1 1 ProtT 0\n", - "2 0 ProtG 0" + "0 3 ProtY 0\n", + "1 2 ProtT 0\n", + "2 1 ProtG 0\n", + "3 0 Decoy_ProtT 1" ] }, - "execution_count": 51, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } @@ -3612,7 +3173,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 122, "id": "7b3410b1-5d6a-4e85-838c-5ccb3b15f1c5", "metadata": {}, "outputs": [ @@ -3649,6 +3210,16 @@ " \n", " \n", " 0\n", + " 3\n", + " 3\n", + " 1\n", + " 1\n", + " 1\n", + " global\n", + " 1\n", + " \n", + " \n", + " 1\n", " 2\n", " 2\n", " 1\n", @@ -3658,7 +3229,7 @@ " 1\n", " \n", " \n", - " 1\n", + 
" 2\n", " 1\n", " 1\n", " 1\n", @@ -3668,7 +3239,7 @@ " 1\n", " \n", " \n", - " 2\n", + " 3\n", " 0\n", " 0\n", " 1\n", @@ -3683,12 +3254,13 @@ ], "text/plain": [ " PROTEIN_ID SCORE PVALUE QVALUE PEP CONTEXT RUN_ID\n", - "0 2 2 1 1 1 global 1\n", - "1 1 1 1 1 1 global 1\n", - "2 0 0 1 1 1 global 1" + "0 3 3 1 1 1 global 1\n", + "1 2 2 1 1 1 global 1\n", + "2 1 1 1 1 1 global 1\n", + "3 0 0 1 1 1 global 1" ] }, - "execution_count": 52, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } @@ -3706,7 +3278,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 123, "id": "51408b81-b650-4787-9050-59d63c0098c0", "metadata": {}, "outputs": [], @@ -3721,7 +3293,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 124, "id": "ab6a68b5-f5db-46e7-95e8-6e98a69eb062", "metadata": {}, "outputs": [], @@ -3746,7 +3318,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/tests/data/dummyOSWScoredData.osw b/tests/data/dummyOSWScoredData.osw index c96832a161d6b444f4fd43e1c6cb5d67a932c82b..84e43d5c856e7d65c3100771311fd97510683068 100644 GIT binary patch delta 5107 zcmb_geQXrR6~DRty3c**uWrE*&M}xX7~>BJ0Rjl@L$V0AbGCz{Iyx-wnrspFaki!W z(Pr6xq83LqP2!|=p{-h1fhN%@QckT@l_sr7g;3N=At6dI&RZXPM%(*JHe_?u&`n5(pJ!C<|SMmL&lpNgz@T5-HFpK}i(CND+k)68~=|-X$#l1^Tv0VNMJ@-WRv-#1eb)PYTKa_|xf(vdo0srw=@GRGzHRQr|IxMuV$K&P^<3T#*|5o{ z`v-fv)0-CZ?a|@NR>h-+6V2 zS5D9Cw`F0!?Odd;etVnOFLSHoRd8g2*YWyo_xLq;tyi?i(#Xs@0dq^bP|P2iDo%_S zvqwe?1%2!>yuR2IJvZ~q<7masjXoN8Z2%LSId>ebd@z~n&!oGCG6Q`XQgn0AIz`hl zsY7>1igReQ+O^?-W96~HXc@_ryC)J}!_Bg`u>p-&Vx{NYu`P=^ue{l=Nh#VR+mYGj z*e>Lg*0N^XgHr-$$<3UeM%tG3K5uMsEv!jGJT;9zjYH%X+sIm3+ggt%Tn=ACxssz8D+WHezc#F***d7@Ph6mek~9v_1f&h9hh4h>@*t zc6EM`qAWo<@(OHqv6$!b!UefxqMSd2R0 z7vyiBIa5txoWUvzV+V2MGdL&VUCh58^zC5nyG`z?pQ>`4tfYSf0W>2=RcPz_1k26Bq{Q Xp$h{+qQg?>Ihzse6By|%Inenpqe$7s delta 202 
zcmZoTz}L{gF+o~Tfq{VmgkeB*qK>h!0)t*M11nI7k^eVHLW=+QW@ygI*g~eWH!y0jtFfsvo#)@av2i973;PV_){Tvg%-ciS7+n}y__s5dPHZ&X zezu*_oI{G6X(|K%X}&Xjb9vm@Q&<&PX0Uu@VP^JWn!2&EoN4={drWzZ+nJ{@iZU`W uFl?8a%BaJ>edS!na3(fgQ8sZ$>FM*@86~#6v@!}XPM^`vXu50xV*&t#Y%$;f diff --git a/tests/data/fakeLib.tsv b/tests/data/fakeLib.tsv index 3831e0e3..d9189e14 100644 --- a/tests/data/fakeLib.tsv +++ b/tests/data/fakeLib.tsv @@ -1,21 +1,24 @@ -PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime -100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 -100 102 201 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 y 2 2 Y 10 -100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 -200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 -200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 -200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 -220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 -220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 -400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 -400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 -400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 -400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 -500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 -500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 -500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 -600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 -600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 -700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 -700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 -700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 +PrecursorMz ProductMz LibraryIntensity NormalizedRetentionTime ProteinId PeptideSequence ModifiedPeptideSequence PrecursorCharge FragmentType FragmentSeriesNumber ProductCharge GeneName LibraryDriftTime Decoy +100 101 101 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 1 2 Y 10 0 +100 102 201 10 ProtY YYYYYYYYYYYK 
YYYYYYYYYYYK 2 y 2 2 Y 10 0 +100 103 301 10 ProtY YYYYYYYYYYYK YYYYYYYYYYYK 2 b 3 2 Y 10 0 +200 201 102 20 ProtY YYYYYR YYYYYR 2 b 1 2 Y 20 0 +200 202 202 20 ProtY YYYYYR YYYYYR 2 y 2 2 Y 20 0 +200 203 302 20 ProtY YYYYYR YYYYYR 2 b 3 2 Y 20 0 +220 221 122 20 ProtY YYYYYR YYYYYR 3 b 1 2 Y 20 0 +220 222 222 20 ProtY YYYYYR YYYYYR 3 y 2 2 Y 20 0 +400 401 104 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 1 2 G 40 0 +400 402 204 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 2 2 G 40 0 +400 403 403 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 b 3 2 G 40 0 +400 404 404 40 ProtG GGGGGGGGGGR GGGGGGGGGGR 4 y 4 2 G 40 0 +500 501 105 50 ProtT TTTTTTTR TTTTTTTR 2 b 1 2 T 50 0 +500 502 205 50 ProtT TTTTTTTR TTTTTTTR 2 y 2 2 T 50 0 +500 503 305 50 ProtT TTTTTTTR TTTTTTTR 2 b 3 2 T 50 0 +600 601 106 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 b 1 2 T 60 0 +600 602 206 60 ProtT TTTTTTTTTTTTK TTTTTTTTTTTTK 2 y 2 2 T 60 0 +700 701 107 70 ProtT TTR TTR 3 b 1 3 T 70 0 +700 702 207 70 ProtT TTR TTR 3 y 2 3 T 70 0 +700 703 307 70 ProtT TTR TTR 3 b 3 3 T 70 0 +800 801 808 80 Decoy_ProtT TTK TTK 3 b 1 3 Decoy_T 80 1 +800 802 808 80 Decoy_ProtT TTK TTK 3 y 2 3 Decoy_T 80 1 +800 803 808 80 Decoy_ProtT TTK TTK 3 b 3 3 Decoy_T 80 1 diff --git a/tests/test_pyprophet_export_parquet.py b/tests/test_pyprophet_export_parquet.py index 6c01696d..4e2b37df 100644 --- a/tests/test_pyprophet_export_parquet.py +++ b/tests/test_pyprophet_export_parquet.py @@ -27,7 +27,7 @@ def _run_cmdline(cmdline): return stdout -def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False): +def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testing_kwargs=dict(check_dtype=False, check_names=False), onlyFeatures=False, noDecoys=False): os.chdir(temp_folder) DATA_NAME="dummyOSWScoredData.osw" data_path = os.path.join(DATA_FOLDER, DATA_NAME) @@ -41,6 +41,8 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin 
cmdline += " --transitionLevel" if onlyFeatures: cmdline += " --onlyFeatures" + if noDecoys: + cmdline += " --noDecoys" stdout = _run_cmdline(cmdline) @@ -73,6 +75,7 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin assert(expectedLength == len(parquet)) + ########### FEATURE LEVEL TESTS ######## # Tests that columns are equal across different sqlite3 tables to ensure joins occured correctly @@ -95,6 +98,14 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin pd.testing.assert_series_equal(parquet['SCORE_PEPTIDE.SCORE_GLOBAL'], parquet['PEPTIDE_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PROTEIN.SCORE_GLOBAL'], parquet['PROTEIN_ID'], **pd_testing_kwargs) + print(parquet) + # check is/no decoys + if noDecoys: + assert(parquet[parquet['DECOY'] == 1].shape[0] == 0) + else: + assert(parquet[parquet['DECOY'] == 1].shape[0] != 0) + + ############### TRANSTION LEVEL TESTS ################ if transitionLevel: pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * (proxy_feature_id), **pd_testing_kwargs) @@ -112,4 +123,10 @@ def test_export_parquet_single_run_onlyFeatures(tmpdir): def test_export_parquet_single_run_transitionLevel_onlyFeatures(tmpdir): - _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True) \ No newline at end of file + _run_export_parquet_single_run(tmpdir, transitionLevel=True, onlyFeatures=True) + +def test_export_parquet_single_run_noDecoys(tmpdir): + _run_export_parquet_single_run(tmpdir, noDecoys=True) + +def test_export_parquet_single_run_transitionLevel_noDecoys(tmpdir): + _run_export_parquet_single_run(tmpdir, transitionLevel=True, noDecoys=True) \ No newline at end of file From 5798fd322f98435efa2f479999b24dac4921ee83 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Sat, 8 Feb 2025 13:18:36 -0500 Subject: [PATCH 7/8] test: fix parquet tests parquet tests were previously 
broken because all rows were dropped because one conlumn contained NAN. Because of this some of the tests were broken. --- tests/Create_OSW_test.ipynb | 1747 +++++++++++++++--------- tests/data/dummyOSWScoredData.osw | Bin 237568 -> 131072 bytes tests/test_pyprophet_export_parquet.py | 52 +- 3 files changed, 1168 insertions(+), 631 deletions(-) diff --git a/tests/Create_OSW_test.ipynb b/tests/Create_OSW_test.ipynb index fda08b68..5bbca243 100644 --- a/tests/Create_OSW_test.ipynb +++ b/tests/Create_OSW_test.ipynb @@ -8,9 +8,17 @@ "## **Create a fake .OSW file for testing**" ] }, + { + "cell_type": "markdown", + "id": "426d6f86-aaea-4372-b1fd-16234cdccb7f", + "metadata": {}, + "source": [ + "**Note:** Code cell 11 must be edited manually if new entries are added to the library" + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "dc80ebc4-f822-4853-af17-d2ccd4f10e3c", "metadata": {}, "outputs": [], @@ -54,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 2, "id": "41ff70f2-2102-4be8-b17f-c5c51254b196", "metadata": {}, "outputs": [], @@ -64,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 3, "id": "4325d9ba-722e-4120-8e58-803a86f87b1e", "metadata": {}, "outputs": [ @@ -651,7 +659,7 @@ "22 80 1 b3^3 TTK3_b3^3 " ] }, - "execution_count": 66, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -664,7 +672,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 4, "id": "6a057baa-d129-4347-bd86-920770bc0a26", "metadata": {}, "outputs": [], @@ -708,18 +716,40 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 5, + "id": "d8a28a22-9f82-4914-a123-e631509e6ab8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'data/dummyOSWScoredData.osw'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import shutil\n", + 
"shutil.copyfile(\"data/fakeLib.pqp\", \"data/dummyOSWScoredData.osw\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "1afe24b1-cf28-44e0-84e4-f9194428e18f", "metadata": {}, "outputs": [], "source": [ - "conn = sqlite3.connect(\"data/fakeLib.pqp\")\n", + "conn = sqlite3.connect(\"data/dummyOSWScoredData.osw\")\n", "cur = conn.cursor()" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 7, "id": "2b50c3b4-e436-4edb-8a80-cc3bf67e73d3", "metadata": {}, "outputs": [], @@ -744,17 +774,17 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 8, "id": "584dc6c7-9c7a-4ad6-91c2-12a0fffa4c08", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 78, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -766,7 +796,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 9, "id": "98401ee9-5153-477b-928a-1c66cdbb8e5d", "metadata": {}, "outputs": [], @@ -776,7 +806,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 10, "id": "008b6e0f-28b8-4f22-89fe-54cdad6dca02", "metadata": { "tags": [] @@ -864,7 +894,7 @@ "7 1 800.0" ] }, - "execution_count": 80, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -875,17 +905,18 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 11, "id": "8f2b41fa-efb2-4f92-ad20-1737c16b3b8b", "metadata": {}, "outputs": [], "source": [ - "features = pd.DataFrame(np.column_stack([np.arange(0,7), np.array([5,5,3,4,0,1,2])]), columns=['id', 'precursor_id'])" + "## Note: The second numpy array must be edited manually if add new precursors to the library\n", + "features = pd.DataFrame(np.column_stack([np.arange(0,8), np.array([6,6,4,5,0,2,3,1])]), columns=['id', 'precursor_id'])" ] }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 12, "id": "4b6472b3-a603-492c-b6da-bca4ae0c1ae9", "metadata": {}, "outputs": [], @@ 
-895,213 +926,7 @@ }, { "cell_type": "code", - "execution_count": 83, - "id": "23ea2e05-f030-41f4-aa98-1fc78da43303", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idprecursor_idIDPRECURSOR_MZ
0055200.0
1155200.0
2233500.0
3344600.0
4400400.0
5511800.0
6622700.0
\n", - "
" - ], - "text/plain": [ - " id precursor_id ID PRECURSOR_MZ\n", - "0 0 5 5 200.0\n", - "1 1 5 5 200.0\n", - "2 2 3 3 500.0\n", - "3 3 4 4 600.0\n", - "4 4 0 0 400.0\n", - "5 5 1 1 800.0\n", - "6 6 2 2 700.0" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "features" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "a5473d3c-793d-4d9d-b722-4aab486e0fa5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDPRECURSOR_MZ
07100.0
15200.0
26220.0
30400.0
43500.0
54600.0
62700.0
71800.0
\n", - "
" - ], - "text/plain": [ - " ID PRECURSOR_MZ\n", - "0 7 100.0\n", - "1 5 200.0\n", - "2 6 220.0\n", - "3 0 400.0\n", - "4 3 500.0\n", - "5 4 600.0\n", - "6 2 700.0\n", - "7 1 800.0" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "precursor_table" - ] - }, - { - "cell_type": "code", - "execution_count": 85, + "execution_count": 13, "id": "a251b758-df07-4d44-ab84-f8f9dd6911af", "metadata": {}, "outputs": [], @@ -1118,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 14, "id": "feeb09d3-17c4-4dd3-9325-5af12d22842b", "metadata": {}, "outputs": [], @@ -1128,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 15, "id": "c9c07727-5351-4501-8f1f-23df31b0a4e9", "metadata": {}, "outputs": [ @@ -1169,12 +994,12 @@ " \n", " 0\n", " 0\n", - " 5\n", - " 5\n", + " 6\n", + " 6\n", " 1\n", - " 200.01\n", - " 200.01\n", - " 200\n", + " 220.01\n", + " 220.01\n", + " 220\n", " 0.01\n", " 5\n", " 5\n", @@ -1182,12 +1007,12 @@ " \n", " 1\n", " 1\n", - " 5\n", - " 5\n", + " 6\n", + " 6\n", " 1\n", - " 200.02\n", - " 200.02\n", - " 200\n", + " 220.02\n", + " 220.02\n", + " 220\n", " 0.01\n", " 5\n", " 5\n", @@ -1195,12 +1020,12 @@ " \n", " 2\n", " 2\n", - " 3\n", - " 3\n", + " 4\n", + " 4\n", " 1\n", - " 500.03\n", - " 500.03\n", - " 500\n", + " 600.03\n", + " 600.03\n", + " 600\n", " 0.01\n", " 5\n", " 5\n", @@ -1208,12 +1033,12 @@ " \n", " 3\n", " 3\n", - " 4\n", - " 4\n", + " 5\n", + " 5\n", " 1\n", - " 600.04\n", - " 600.04\n", - " 600\n", + " 200.04\n", + " 200.04\n", + " 200\n", " 0.01\n", " 5\n", " 5\n", @@ -1234,12 +1059,12 @@ " \n", " 5\n", " 5\n", + " 2\n", + " 2\n", " 1\n", - " 1\n", - " 1\n", - " 800.06\n", - " 800.06\n", - " 800\n", + " 700.06\n", + " 700.06\n", + " 700\n", " 0.01\n", " 5\n", " 5\n", @@ -1247,29 +1072,43 @@ " \n", " 6\n", " 6\n", - " 2\n", - " 2\n", + " 3\n", + " 3\n", " 1\n", - " 700.07\n", - " 700.07\n", - " 700\n", + " 
500.07\n", + " 500.07\n", + " 500\n", " 0.01\n", " 5\n", " 5\n", " \n", - " \n", - "\n", - "" - ], + " \n", + " 7\n", + " 7\n", + " 1\n", + " 1\n", + " 1\n", + " 800.08\n", + " 800.08\n", + " 800\n", + " 0.01\n", + " 5\n", + " 5\n", + " \n", + " \n", + "\n", + "" + ], "text/plain": [ " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n", - "0 0 5 5 1 200.01 200.01 200 0.01 \n", - "1 1 5 5 1 200.02 200.02 200 0.01 \n", - "2 2 3 3 1 500.03 500.03 500 0.01 \n", - "3 3 4 4 1 600.04 600.04 600 0.01 \n", + "0 0 6 6 1 220.01 220.01 220 0.01 \n", + "1 1 6 6 1 220.02 220.02 220 0.01 \n", + "2 2 4 4 1 600.03 600.03 600 0.01 \n", + "3 3 5 5 1 200.04 200.04 200 0.01 \n", "4 4 0 0 1 400.05 400.05 400 0.01 \n", - "5 5 1 1 1 800.06 800.06 800 0.01 \n", - "6 6 2 2 1 700.07 700.07 700 0.01 \n", + "5 5 2 2 1 700.06 700.06 700 0.01 \n", + "6 6 3 3 1 500.07 500.07 500 0.01 \n", + "7 7 1 1 1 800.08 800.08 800 0.01 \n", "\n", " left_width right_width \n", "0 5 5 \n", @@ -1278,10 +1117,11 @@ "3 5 5 \n", "4 5 5 \n", "5 5 5 \n", - "6 5 5 " + "6 5 5 \n", + "7 5 5 " ] }, - "execution_count": 87, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1292,7 +1132,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 16, "id": "9b2a1cfc-dcca-45f3-9aa4-e7f75382de3c", "metadata": {}, "outputs": [], @@ -1307,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 17, "id": "4806f254-1c29-4aa5-9822-6cb8c6ea730c", "metadata": {}, "outputs": [], @@ -1333,17 +1173,17 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 18, "id": "8d03f3ca-a4b0-48e9-bd7f-3c1c7b8d6874", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 90, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1355,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 19, "id": "aae782e1-c9f9-4c2c-84f7-a7741e385a5e", "metadata": {}, 
"outputs": [ @@ -1404,9 +1244,9 @@ " \n", " 0\n", " 0\n", - " 200010.0\n", - " 5\n", - " 200.01\n", + " 220010.0\n", + " 6\n", + " 220.01\n", " 0.01\n", " 1\n", " 1\n", @@ -1425,9 +1265,9 @@ " \n", " 1\n", " 1\n", - " 200020.0\n", - " 5\n", - " 200.02\n", + " 220020.0\n", + " 6\n", + " 220.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1446,9 +1286,9 @@ " \n", " 2\n", " 2\n", - " 500030.0\n", - " 3\n", - " 500.03\n", + " 600030.0\n", + " 4\n", + " 600.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1467,9 +1307,9 @@ " \n", " 3\n", " 3\n", - " 600040.0\n", - " 4\n", - " 600.04\n", + " 200040.0\n", + " 5\n", + " 200.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1509,9 +1349,9 @@ " \n", " 5\n", " 5\n", - " 800060.0\n", - " 1\n", - " 800.06\n", + " 700060.0\n", + " 2\n", + " 700.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1530,9 +1370,30 @@ " \n", " 6\n", " 6\n", - " 700070.0\n", - " 2\n", - " 700.07\n", + " 500070.0\n", + " 3\n", + " 500.07\n", + " 0.01\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 7\n", + " 7\n", + " 800080.0\n", + " 1\n", + " 800.08\n", " 0.01\n", " 1\n", " 1\n", @@ -1554,13 +1415,14 @@ ], "text/plain": [ " feature_id area_intensity apex_intensity exp_im delta_im \\\n", - "0 0 200010.0 5 200.01 0.01 \n", - "1 1 200020.0 5 200.02 0.01 \n", - "2 2 500030.0 3 500.03 0.01 \n", - "3 3 600040.0 4 600.04 0.01 \n", + "0 0 220010.0 6 220.01 0.01 \n", + "1 1 220020.0 6 220.02 0.01 \n", + "2 2 600030.0 4 600.03 0.01 \n", + "3 3 200040.0 5 200.04 0.01 \n", "4 4 400050.0 0 400.05 0.01 \n", - "5 5 800060.0 1 800.06 0.01 \n", - "6 6 700070.0 2 700.07 0.01 \n", + "5 5 700060.0 2 700.06 0.01 \n", + "6 6 500070.0 3 500.07 0.01 \n", + "7 7 800080.0 1 800.08 0.01 \n", "\n", " var_massdev_score var_mi_score var_mi_contrast_score \\\n", "0 1 1 1 \n", @@ -1570,6 +1432,7 @@ "4 1 1 1 \n", "5 1 1 1 \n", "6 1 1 1 \n", + "7 1 1 1 \n", "\n", " var_mi_combined_score var_isotope_correlation_score \\\n", 
"0 1 1 \n", @@ -1579,6 +1442,7 @@ "4 1 1 \n", "5 1 1 \n", "6 1 1 \n", + "7 1 1 \n", "\n", " var_isotope_overlap_score var_im_ms1_delta_score var_xcorr_coelution \\\n", "0 1 1 1 \n", @@ -1588,6 +1452,7 @@ "4 1 1 1 \n", "5 1 1 1 \n", "6 1 1 1 \n", + "7 1 1 1 \n", "\n", " var_xcorr_coelution_contrast var_xcorr_coelution_combined \\\n", "0 1 1 \n", @@ -1597,6 +1462,7 @@ "4 1 1 \n", "5 1 1 \n", "6 1 1 \n", + "7 1 1 \n", "\n", " var_xcorr_shape var_xcorr_shape_contrast var_xcorr_shape_combined \n", "0 1 1 1 \n", @@ -1605,41 +1471,43 @@ "3 1 1 1 \n", "4 1 1 1 \n", "5 1 1 1 \n", - "6 1 1 1 " + "6 1 1 1 \n", + "7 1 1 1 " ] }, - "execution_count": 91, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "length = 8\n", "feature_ms1 = features[['id']].copy().rename(columns={'id':'feature_id'})\n", "\n", "feature_ms1['area_intensity'] = features['exp_rt'] * 1000 \n", "feature_ms1['apex_intensity'] = features['precursor_id']\n", "feature_ms1['exp_im'] = features['exp_im']\n", "feature_ms1['delta_im'] = features['delta_rt']\n", - "feature_ms1['var_massdev_score'] = [1] *7\n", - "feature_ms1['var_mi_score'] = [1] *7\n", - "feature_ms1['var_mi_contrast_score'] = [1] *7\n", - "feature_ms1['var_mi_combined_score'] = [1] *7\n", - "feature_ms1['var_isotope_correlation_score'] = [1] *7\n", - "feature_ms1['var_isotope_overlap_score'] = [1] *7\n", - "feature_ms1['var_im_ms1_delta_score'] = [1] *7\n", - "feature_ms1['var_xcorr_coelution'] = [1] *7\n", - "feature_ms1['var_xcorr_coelution_contrast'] = [1] *7\n", - "feature_ms1['var_xcorr_coelution_combined'] = [1] *7\n", - "feature_ms1['var_xcorr_shape'] = [1] *7\n", - "feature_ms1['var_xcorr_shape_contrast'] = [1] *7\n", - "feature_ms1['var_xcorr_shape_combined'] = [1] *7\n", + "feature_ms1['var_massdev_score'] = [1] * length\n", + "feature_ms1['var_mi_score'] = [1] * length\n", + "feature_ms1['var_mi_contrast_score'] = [1] * length\n", + "feature_ms1['var_mi_combined_score'] = [1] * length\n", + 
"feature_ms1['var_isotope_correlation_score'] = [1] * length\n", + "feature_ms1['var_isotope_overlap_score'] = [1] * length\n", + "feature_ms1['var_im_ms1_delta_score'] = [1] * length\n", + "feature_ms1['var_xcorr_coelution'] = [1] * length\n", + "feature_ms1['var_xcorr_coelution_contrast'] = [1] * length\n", + "feature_ms1['var_xcorr_coelution_combined'] = [1] * length\n", + "feature_ms1['var_xcorr_shape'] = [1] *length\n", + "feature_ms1['var_xcorr_shape_contrast'] = [1] * length\n", + "feature_ms1['var_xcorr_shape_combined'] = [1] * length\n", "\n", "feature_ms1" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 20, "id": "8480c6a7-23ab-4a22-9da9-998cbc8606ac", "metadata": {}, "outputs": [], @@ -1654,7 +1522,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 21, "id": "98ff8eac-5acd-435a-b292-fe64d590ea51", "metadata": {}, "outputs": [], @@ -1680,17 +1548,17 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 22, "id": "7691c149-7f67-48de-9bff-2856a44d40eb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 94, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1702,7 +1570,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 23, "id": "863fcd87-d051-4bc9-b88c-8535cbc90c4a", "metadata": { "scrolled": true, @@ -1764,7 +1632,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 24, "id": "30f29e9b-b345-4a02-ba63-74dabf69555b", "metadata": {}, "outputs": [ @@ -1816,10 +1684,10 @@ " \n", " 0\n", " 0\n", - " 200010.0\n", - " 200010.0\n", - " 5\n", - " 200.01\n", + " 220010.0\n", + " 220010.0\n", + " 6\n", + " 220.01\n", " 0.01\n", " 1\n", " 1\n", @@ -1840,10 +1708,10 @@ " \n", " 1\n", " 1\n", - " 200020.0\n", - " 200020.0\n", - " 5\n", - " 200.02\n", + " 220020.0\n", + " 220020.0\n", + " 6\n", + " 220.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1864,10 +1732,10 @@ " \n", " 2\n", " 
2\n", - " 500030.0\n", - " 500030.0\n", - " 3\n", - " 500.03\n", + " 600030.0\n", + " 600030.0\n", + " 4\n", + " 600.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1888,10 +1756,10 @@ " \n", " 3\n", " 3\n", - " 600040.0\n", - " 600040.0\n", - " 4\n", - " 600.04\n", + " 200040.0\n", + " 200040.0\n", + " 5\n", + " 200.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1936,10 +1804,10 @@ " \n", " 5\n", " 5\n", - " 800060.0\n", - " 800060.0\n", - " 1\n", - " 800.06\n", + " 700060.0\n", + " 700060.0\n", + " 2\n", + " 700.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1960,10 +1828,34 @@ " \n", " 6\n", " 6\n", - " 700070.0\n", - " 700070.0\n", - " 2\n", - " 700.07\n", + " 500070.0\n", + " 500070.0\n", + " 3\n", + " 500.07\n", + " 0.01\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " ...\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 7\n", + " 7\n", + " 800080.0\n", + " 800080.0\n", + " 1\n", + " 800.08\n", " 0.01\n", " 1\n", " 1\n", @@ -1983,18 +1875,19 @@ " \n", " \n", "\n", - "

7 rows × 41 columns

\n", + "

8 rows × 41 columns

\n", "" ], "text/plain": [ " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY EXP_IM \\\n", - "0 0 200010.0 200010.0 5 200.01 \n", - "1 1 200020.0 200020.0 5 200.02 \n", - "2 2 500030.0 500030.0 3 500.03 \n", - "3 3 600040.0 600040.0 4 600.04 \n", + "0 0 220010.0 220010.0 6 220.01 \n", + "1 1 220020.0 220020.0 6 220.02 \n", + "2 2 600030.0 600030.0 4 600.03 \n", + "3 3 200040.0 200040.0 5 200.04 \n", "4 4 400050.0 400050.0 0 400.05 \n", - "5 5 800060.0 800060.0 1 800.06 \n", - "6 6 700070.0 700070.0 2 700.07 \n", + "5 5 700060.0 700060.0 2 700.06 \n", + "6 6 500070.0 500070.0 3 500.07 \n", + "7 7 800080.0 800080.0 1 800.08 \n", "\n", " DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n", "0 0.01 1 1 1 \n", @@ -2004,6 +1897,7 @@ "4 0.01 1 1 1 \n", "5 0.01 1 1 1 \n", "6 0.01 1 1 1 \n", + "7 0.01 1 1 1 \n", "\n", " VAR_INTENSITY_SCORE ... VAR_ELUTION_MODEL_FIT_SCORE VAR_IM_XCORR_SHAPE \\\n", "0 1 ... 1 1 \n", @@ -2013,6 +1907,7 @@ "4 1 ... 1 1 \n", "5 1 ... 1 1 \n", "6 1 ... 1 1 \n", + "7 1 ... 
1 1 \n", "\n", " VAR_IM_XCORR_COELUTION VAR_IM_DELTA_SCORE VAR_SONAR_LAG VAR_SONAR_SHAPE \\\n", "0 1 1 1 1 \n", @@ -2022,6 +1917,7 @@ "4 1 1 1 1 \n", "5 1 1 1 1 \n", "6 1 1 1 1 \n", + "7 1 1 1 1 \n", "\n", " VAR_SONAR_LOG_SN VAR_SONAR_LOG_DIFF VAR_SONAR_LOG_TREND VAR_SONAR_RSQ \n", "0 1 1 1 1 \n", @@ -2031,63 +1927,65 @@ "4 1 1 1 1 \n", "5 1 1 1 1 \n", "6 1 1 1 1 \n", + "7 1 1 1 1 \n", "\n", - "[7 rows x 41 columns]" + "[8 rows x 41 columns]" ] }, - "execution_count": 96, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "length = 8\n", "feature_ms2 = feature_ms1[['feature_id']].copy()\n", "feature_ms2['AREA_INTENSITY'] = feature_ms1['area_intensity']\n", "feature_ms2['TOTAL_AREA_INTENSITY'] = feature_ms2['AREA_INTENSITY']\n", "feature_ms2['APEX_INTENSITY'] = feature_ms1['apex_intensity']\n", "feature_ms2['EXP_IM'] = feature_ms1['exp_im']\n", "feature_ms2['DELTA_IM'] = feature_ms1['delta_im']\n", - "feature_ms2['TOTAL_MI'] = [1] *7 \n", - "feature_ms2['VAR_BSERIES_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_DOTPROD_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_INTENSITY_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_CORR'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_DOTPROD'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_MANHATTAN'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_RMSD'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_ROOTMEANSQUARE'] = [1] *7 \n", - "feature_ms2['VAR_LIBRARY_SANGLE'] = [1] *7 \n", - "feature_ms2['VAR_LOG_SN_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MANHATTAN_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MASSDEV_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MASSDEV_SCORE_WEIGHTED'] = [1] *7 \n", - "feature_ms2['VAR_MI_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MI_WEIGHTED_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_MI_RATIO_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_NORM_RT_SCORE'] = [1] *7 \n", - 
"feature_ms2['VAR_XCORR_COELUTION'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_COELUTION_WEIGHTED'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_SHAPE'] = [1] *7 \n", - "feature_ms2['VAR_XCORR_SHAPE_WEIGHTED'] = [1] *7 \n", - "feature_ms2['VAR_YSERIES_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_ELUTION_MODEL_FIT_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_IM_XCORR_SHAPE'] = [1] *7 \n", - "feature_ms2['VAR_IM_XCORR_COELUTION'] = [1] *7 \n", - "feature_ms2['VAR_IM_DELTA_SCORE'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LAG'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_SHAPE'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LOG_SN'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LOG_DIFF'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_LOG_TREND'] = [1] *7 \n", - "feature_ms2['VAR_SONAR_RSQ'] = [1] *7 \n", + "feature_ms2['TOTAL_MI'] = [1] * length \n", + "feature_ms2['VAR_BSERIES_SCORE'] = [1] * length \n", + "feature_ms2['VAR_DOTPROD_SCORE'] = [1] * length \n", + "feature_ms2['VAR_INTENSITY_SCORE'] = [1] * length \n", + "feature_ms2['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * length \n", + "feature_ms2['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_CORR'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_DOTPROD'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_MANHATTAN'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_RMSD'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_ROOTMEANSQUARE'] = [1] * length \n", + "feature_ms2['VAR_LIBRARY_SANGLE'] = [1] * length \n", + "feature_ms2['VAR_LOG_SN_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MANHATTAN_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MASSDEV_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MASSDEV_SCORE_WEIGHTED'] = [1] * length \n", + "feature_ms2['VAR_MI_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MI_WEIGHTED_SCORE'] = [1] * length \n", + "feature_ms2['VAR_MI_RATIO_SCORE'] = [1] * length \n", + "feature_ms2['VAR_NORM_RT_SCORE'] = [1] * length \n", + "feature_ms2['VAR_XCORR_COELUTION'] = [1] * length \n", + 
"feature_ms2['VAR_XCORR_COELUTION_WEIGHTED'] = [1] * length \n", + "feature_ms2['VAR_XCORR_SHAPE'] = [1] * length \n", + "feature_ms2['VAR_XCORR_SHAPE_WEIGHTED'] = [1] * length \n", + "feature_ms2['VAR_YSERIES_SCORE'] = [1] * length \n", + "feature_ms2['VAR_ELUTION_MODEL_FIT_SCORE'] = [1] * length \n", + "feature_ms2['VAR_IM_XCORR_SHAPE'] = [1] * length \n", + "feature_ms2['VAR_IM_XCORR_COELUTION'] = [1] * length \n", + "feature_ms2['VAR_IM_DELTA_SCORE'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LAG'] = [1] * length \n", + "feature_ms2['VAR_SONAR_SHAPE'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LOG_SN'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LOG_DIFF'] = [1] * length \n", + "feature_ms2['VAR_SONAR_LOG_TREND'] = [1] * length \n", + "feature_ms2['VAR_SONAR_RSQ'] = [1] * length \n", "feature_ms2" ] }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 25, "id": "d93163c0-20a1-4d98-86de-71c6d265d418", "metadata": {}, "outputs": [], @@ -2102,7 +2000,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 26, "id": "db1cbc3f-e463-43a6-895a-979c7aafe393", "metadata": {}, "outputs": [], @@ -2128,17 +2026,17 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 27, "id": "bca8bfba-86e9-497c-ae94-4c0f679b45f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 99, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -2150,7 +2048,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 28, "id": "4678179b-aea7-460f-ad71-d157a1e3ce38", "metadata": {}, "outputs": [], @@ -2160,7 +2058,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 29, "id": "231cf0c6-01ac-4061-b1a9-d68404f793b3", "metadata": {}, "outputs": [], @@ -2170,7 +2068,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 30, "id": "7f730c1c-73a3-4ce1-a2b3-35b064edd558", "metadata": {}, "outputs": [], @@ 
-2181,7 +2079,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 31, "id": "46a4405d-c5c9-48a6-aee5-abb2a3b9b047", "metadata": {}, "outputs": [ @@ -2218,104 +2116,86 @@ " \n", " 0\n", " 0\n", - " 5\n", - " 3\n", - " 5\n", - " YYYYYR2_b1^2\n", - " 201.0\n", + " 6\n", + " 6\n", + " 6\n", + " YYYYYR3_b1^2\n", + " 221.0\n", " \n", " \n", " 1\n", " 0\n", - " 5\n", - " 4\n", - " 5\n", - " YYYYYR2_y2^2\n", - " 202.0\n", + " 6\n", + " 7\n", + " 6\n", + " YYYYYR3_y2^2\n", + " 222.0\n", " \n", " \n", " 2\n", - " 0\n", - " 5\n", - " 5\n", - " 5\n", - " YYYYYR2_b3^2\n", - " 203.0\n", + " 1\n", + " 6\n", + " 6\n", + " 6\n", + " YYYYYR3_b1^2\n", + " 221.0\n", " \n", " \n", " 3\n", " 1\n", - " 5\n", - " 3\n", - " 5\n", - " YYYYYR2_b1^2\n", - " 201.0\n", + " 6\n", + " 7\n", + " 6\n", + " YYYYYR3_y2^2\n", + " 222.0\n", " \n", " \n", " 4\n", - " 1\n", - " 5\n", + " 2\n", " 4\n", - " 5\n", - " YYYYYR2_y2^2\n", - " 202.0\n", + " 15\n", + " 4\n", + " TTTTTTTTTTTTK2_b1^2\n", + " 601.0\n", " \n", " \n", " 5\n", - " 1\n", - " 5\n", - " 5\n", - " 5\n", - " YYYYYR2_b3^2\n", - " 203.0\n", + " 2\n", + " 4\n", + " 16\n", + " 4\n", + " TTTTTTTTTTTTK2_y2^2\n", + " 602.0\n", " \n", " \n", " 6\n", - " 2\n", " 3\n", - " 12\n", + " 5\n", " 3\n", - " TTTTTTTR2_b1^2\n", - " 501.0\n", + " 5\n", + " YYYYYR2_b1^2\n", + " 201.0\n", " \n", " \n", " 7\n", - " 2\n", " 3\n", - " 13\n", - " 3\n", - " TTTTTTTR2_y2^2\n", - " 502.0\n", + " 5\n", + " 4\n", + " 5\n", + " YYYYYR2_y2^2\n", + " 202.0\n", " \n", " \n", " 8\n", - " 2\n", - " 3\n", - " 14\n", " 3\n", - " TTTTTTTR2_b3^2\n", - " 503.0\n", + " 5\n", + " 5\n", + " 5\n", + " YYYYYR2_b3^2\n", + " 203.0\n", " \n", " \n", " 9\n", - " 3\n", - " 4\n", - " 15\n", - " 4\n", - " TTTTTTTTTTTTK2_b1^2\n", - " 601.0\n", - " \n", - " \n", - " 10\n", - " 3\n", - " 4\n", - " 16\n", - " 4\n", - " TTTTTTTTTTTTK2_y2^2\n", - " 602.0\n", - " \n", - " \n", - " 11\n", " 4\n", " 0\n", " 8\n", @@ -2324,7 +2204,7 @@ " 401.0\n", " \n", " \n", - " 12\n", + " 
10\n", " 4\n", " 0\n", " 9\n", @@ -2333,7 +2213,7 @@ " 402.0\n", " \n", " \n", - " 13\n", + " 11\n", " 4\n", " 0\n", " 10\n", @@ -2342,7 +2222,7 @@ " 403.0\n", " \n", " \n", - " 14\n", + " 12\n", " 4\n", " 0\n", " 11\n", @@ -2351,8 +2231,62 @@ " 404.0\n", " \n", " \n", + " 13\n", + " 5\n", + " 2\n", + " 17\n", + " 2\n", + " TTR3_b1^3\n", + " 701.0\n", + " \n", + " \n", + " 14\n", + " 5\n", + " 2\n", + " 18\n", + " 2\n", + " TTR3_y2^3\n", + " 702.0\n", + " \n", + " \n", " 15\n", " 5\n", + " 2\n", + " 19\n", + " 2\n", + " TTR3_b3^3\n", + " 703.0\n", + " \n", + " \n", + " 16\n", + " 6\n", + " 3\n", + " 12\n", + " 3\n", + " TTTTTTTR2_b1^2\n", + " 501.0\n", + " \n", + " \n", + " 17\n", + " 6\n", + " 3\n", + " 13\n", + " 3\n", + " TTTTTTTR2_y2^2\n", + " 502.0\n", + " \n", + " \n", + " 18\n", + " 6\n", + " 3\n", + " 14\n", + " 3\n", + " TTTTTTTR2_b3^2\n", + " 503.0\n", + " \n", + " \n", + " 19\n", + " 7\n", " 1\n", " 20\n", " 1\n", @@ -2360,8 +2294,8 @@ " 801.0\n", " \n", " \n", - " 16\n", - " 5\n", + " 20\n", + " 7\n", " 1\n", " 21\n", " 1\n", @@ -2369,94 +2303,69 @@ " 802.0\n", " \n", " \n", - " 17\n", - " 5\n", + " 21\n", + " 7\n", " 1\n", " 22\n", " 1\n", " TTK3_b3^3\n", " 803.0\n", " \n", - " \n", - " 18\n", - " 6\n", - " 2\n", - " 17\n", - " 2\n", - " TTR3_b1^3\n", - " 701.0\n", - " \n", - " \n", - " 19\n", - " 6\n", - " 2\n", - " 18\n", - " 2\n", - " TTR3_y2^3\n", - " 702.0\n", - " \n", - " \n", - " 20\n", - " 6\n", - " 2\n", - " 19\n", - " 2\n", - " TTR3_b3^3\n", - " 703.0\n", - " \n", " \n", "\n", "" ], "text/plain": [ " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n", - "0 0 5 3 5 \n", - "1 0 5 4 5 \n", - "2 0 5 5 5 \n", - "3 1 5 3 5 \n", - "4 1 5 4 5 \n", - "5 1 5 5 5 \n", - "6 2 3 12 3 \n", - "7 2 3 13 3 \n", - "8 2 3 14 3 \n", - "9 3 4 15 4 \n", - "10 3 4 16 4 \n", - "11 4 0 8 0 \n", - "12 4 0 9 0 \n", - "13 4 0 10 0 \n", - "14 4 0 11 0 \n", - "15 5 1 20 1 \n", - "16 5 1 21 1 \n", - "17 5 1 22 1 \n", - "18 6 2 17 2 \n", - "19 6 2 18 2 \n", - "20 6 2 
19 2 \n", + "0 0 6 6 6 \n", + "1 0 6 7 6 \n", + "2 1 6 6 6 \n", + "3 1 6 7 6 \n", + "4 2 4 15 4 \n", + "5 2 4 16 4 \n", + "6 3 5 3 5 \n", + "7 3 5 4 5 \n", + "8 3 5 5 5 \n", + "9 4 0 8 0 \n", + "10 4 0 9 0 \n", + "11 4 0 10 0 \n", + "12 4 0 11 0 \n", + "13 5 2 17 2 \n", + "14 5 2 18 2 \n", + "15 5 2 19 2 \n", + "16 6 3 12 3 \n", + "17 6 3 13 3 \n", + "18 6 3 14 3 \n", + "19 7 1 20 1 \n", + "20 7 1 21 1 \n", + "21 7 1 22 1 \n", "\n", " TRAML_ID PRODUCT_MZ \n", - "0 YYYYYR2_b1^2 201.0 \n", - "1 YYYYYR2_y2^2 202.0 \n", - "2 YYYYYR2_b3^2 203.0 \n", - "3 YYYYYR2_b1^2 201.0 \n", - "4 YYYYYR2_y2^2 202.0 \n", - "5 YYYYYR2_b3^2 203.0 \n", - "6 TTTTTTTR2_b1^2 501.0 \n", - "7 TTTTTTTR2_y2^2 502.0 \n", - "8 TTTTTTTR2_b3^2 503.0 \n", - "9 TTTTTTTTTTTTK2_b1^2 601.0 \n", - "10 TTTTTTTTTTTTK2_y2^2 602.0 \n", - "11 GGGGGGGGGGR4_b1^2 401.0 \n", - "12 GGGGGGGGGGR4_y2^2 402.0 \n", - "13 GGGGGGGGGGR4_b3^2 403.0 \n", - "14 GGGGGGGGGGR4_y4^2 404.0 \n", - "15 TTK3_b1^3 801.0 \n", - "16 TTK3_y2^3 802.0 \n", - "17 TTK3_b3^3 803.0 \n", - "18 TTR3_b1^3 701.0 \n", - "19 TTR3_y2^3 702.0 \n", - "20 TTR3_b3^3 703.0 " + "0 YYYYYR3_b1^2 221.0 \n", + "1 YYYYYR3_y2^2 222.0 \n", + "2 YYYYYR3_b1^2 221.0 \n", + "3 YYYYYR3_y2^2 222.0 \n", + "4 TTTTTTTTTTTTK2_b1^2 601.0 \n", + "5 TTTTTTTTTTTTK2_y2^2 602.0 \n", + "6 YYYYYR2_b1^2 201.0 \n", + "7 YYYYYR2_y2^2 202.0 \n", + "8 YYYYYR2_b3^2 203.0 \n", + "9 GGGGGGGGGGR4_b1^2 401.0 \n", + "10 GGGGGGGGGGR4_y2^2 402.0 \n", + "11 GGGGGGGGGGR4_b3^2 403.0 \n", + "12 GGGGGGGGGGR4_y4^2 404.0 \n", + "13 TTR3_b1^3 701.0 \n", + "14 TTR3_y2^3 702.0 \n", + "15 TTR3_b3^3 703.0 \n", + "16 TTTTTTTR2_b1^2 501.0 \n", + "17 TTTTTTTR2_y2^2 502.0 \n", + "18 TTTTTTTR2_b3^2 503.0 \n", + "19 TTK3_b1^3 801.0 \n", + "20 TTK3_y2^3 802.0 \n", + "21 TTK3_b3^3 803.0 " ] }, - "execution_count": 103, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2467,42 +2376,639 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 32, "id": 
"cf55c2d6-fa9f-433a-bb8f-931984f48bbe", "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "Length of values (19) does not match length of index (21)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[104], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAREA_INTENSITY\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mPRODUCT_MZ\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m*\u001b[39m (feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfeature_id\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m) \u001b[38;5;66;03m#should be equal to product_mz * (feature_id + 1)\u001b[39;00m\n\u001b[1;32m 2\u001b[0m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTOTAL_AREA_INTENSITY\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mAREA_INTENSITY\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m----> 3\u001b[0m \u001b[43mfeature_transition\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mAPEX_INTENSITY\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m19\u001b[39m\n\u001b[1;32m 4\u001b[0m feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTOTAL_MI\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m19\u001b[39m\n\u001b[1;32m 5\u001b[0m 
feature_transition[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVAR_INTENSITY_SCORE\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m19\u001b[39m\n", - "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/frame.py:4311\u001b[0m, in \u001b[0;36mDataFrame.__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 4308\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_setitem_array([key], value)\n\u001b[1;32m 4309\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 4310\u001b[0m \u001b[38;5;66;03m# set column\u001b[39;00m\n\u001b[0;32m-> 4311\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_set_item\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/frame.py:4524\u001b[0m, in \u001b[0;36mDataFrame._set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 4514\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_set_item\u001b[39m(\u001b[38;5;28mself\u001b[39m, key, value) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 4515\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 4516\u001b[0m \u001b[38;5;124;03m Add series to DataFrame in specified column.\u001b[39;00m\n\u001b[1;32m 4517\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4522\u001b[0m \u001b[38;5;124;03m ensure homogeneity.\u001b[39;00m\n\u001b[1;32m 4523\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 4524\u001b[0m value, refs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sanitize_column\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4526\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 4527\u001b[0m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 4528\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m value\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 4529\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(value\u001b[38;5;241m.\u001b[39mdtype, ExtensionDtype)\n\u001b[1;32m 4530\u001b[0m ):\n\u001b[1;32m 4531\u001b[0m \u001b[38;5;66;03m# broadcast across multiple columns if necessary\u001b[39;00m\n\u001b[1;32m 4532\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mis_unique \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns, MultiIndex):\n", - "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/frame.py:5266\u001b[0m, in \u001b[0;36mDataFrame._sanitize_column\u001b[0;34m(self, value)\u001b[0m\n\u001b[1;32m 5263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _reindex_for_setitem(value, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex)\n\u001b[1;32m 5265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_list_like(value):\n\u001b[0;32m-> 5266\u001b[0m \u001b[43mcom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequire_length_match\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5267\u001b[0m arr 
\u001b[38;5;241m=\u001b[39m sanitize_array(value, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, allow_2d\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 5268\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 5269\u001b[0m \u001b[38;5;28misinstance\u001b[39m(value, Index)\n\u001b[1;32m 5270\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m value\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobject\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 5273\u001b[0m \u001b[38;5;66;03m# TODO: Remove kludge in sanitize_array for string mode when enforcing\u001b[39;00m\n\u001b[1;32m 5274\u001b[0m \u001b[38;5;66;03m# this deprecation\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/pyprophet_dev/lib/python3.11/site-packages/pandas/core/common.py:573\u001b[0m, in \u001b[0;36mrequire_length_match\u001b[0;34m(data, index)\u001b[0m\n\u001b[1;32m 569\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 570\u001b[0m \u001b[38;5;124;03mCheck the length of data matches the length of the index.\u001b[39;00m\n\u001b[1;32m 571\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 572\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(index):\n\u001b[0;32m--> 573\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 574\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLength of values \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 575\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 
576\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not match length of index \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 577\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(index)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 578\u001b[0m )\n", - "\u001b[0;31mValueError\u001b[0m: Length of values (19) does not match length of index (21)" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_idTRANSITION_IDAREA_INTENSITYTOTAL_AREA_INTENSITYAPEX_INTENSITYTOTAL_MIVAR_INTENSITY_SCOREVAR_INTENSITY_RATIO_SCOREVAR_LOG_INTENSITYVAR_XCORR_COELUTIONVAR_XCORR_SHAPEVAR_LOG_SN_SCOREVAR_MASSDEV_SCOREVAR_MI_SCOREVAR_MI_RATIO_SCOREVAR_ISOTOPE_CORRELATION_SCOREVAR_ISOTOPE_OVERLAP_SCORE
006221.0221.01111111111111
107222.0222.01111111111111
216442.0442.01111111111111
317444.0444.01111111111111
42151803.01803.01111111111111
52161806.01806.01111111111111
633804.0804.01111111111111
734808.0808.01111111111111
835812.0812.01111111111111
9482005.02005.01111111111111
10492010.02010.01111111111111
114102015.02015.01111111111111
124112020.02020.01111111111111
135174206.04206.01111111111111
145184212.04212.01111111111111
155194218.04218.01111111111111
166123507.03507.01111111111111
176133514.03514.01111111111111
186143521.03521.01111111111111
197206408.06408.01111111111111
207216416.06416.01111111111111
217226424.06424.01111111111111
\n", + "
" + ], + "text/plain": [ + " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n", + "0 0 6 221.0 221.0 \n", + "1 0 7 222.0 222.0 \n", + "2 1 6 442.0 442.0 \n", + "3 1 7 444.0 444.0 \n", + "4 2 15 1803.0 1803.0 \n", + "5 2 16 1806.0 1806.0 \n", + "6 3 3 804.0 804.0 \n", + "7 3 4 808.0 808.0 \n", + "8 3 5 812.0 812.0 \n", + "9 4 8 2005.0 2005.0 \n", + "10 4 9 2010.0 2010.0 \n", + "11 4 10 2015.0 2015.0 \n", + "12 4 11 2020.0 2020.0 \n", + "13 5 17 4206.0 4206.0 \n", + "14 5 18 4212.0 4212.0 \n", + "15 5 19 4218.0 4218.0 \n", + "16 6 12 3507.0 3507.0 \n", + "17 6 13 3514.0 3514.0 \n", + "18 6 14 3521.0 3521.0 \n", + "19 7 20 6408.0 6408.0 \n", + "20 7 21 6416.0 6416.0 \n", + "21 7 22 6424.0 6424.0 \n", + "\n", + " APEX_INTENSITY TOTAL_MI VAR_INTENSITY_SCORE VAR_INTENSITY_RATIO_SCORE \\\n", + "0 1 1 1 1 \n", + "1 1 1 1 1 \n", + "2 1 1 1 1 \n", + "3 1 1 1 1 \n", + "4 1 1 1 1 \n", + "5 1 1 1 1 \n", + "6 1 1 1 1 \n", + "7 1 1 1 1 \n", + "8 1 1 1 1 \n", + "9 1 1 1 1 \n", + "10 1 1 1 1 \n", + "11 1 1 1 1 \n", + "12 1 1 1 1 \n", + "13 1 1 1 1 \n", + "14 1 1 1 1 \n", + "15 1 1 1 1 \n", + "16 1 1 1 1 \n", + "17 1 1 1 1 \n", + "18 1 1 1 1 \n", + "19 1 1 1 1 \n", + "20 1 1 1 1 \n", + "21 1 1 1 1 \n", + "\n", + " VAR_LOG_INTENSITY VAR_XCORR_COELUTION VAR_XCORR_SHAPE VAR_LOG_SN_SCORE \\\n", + "0 1 1 1 1 \n", + "1 1 1 1 1 \n", + "2 1 1 1 1 \n", + "3 1 1 1 1 \n", + "4 1 1 1 1 \n", + "5 1 1 1 1 \n", + "6 1 1 1 1 \n", + "7 1 1 1 1 \n", + "8 1 1 1 1 \n", + "9 1 1 1 1 \n", + "10 1 1 1 1 \n", + "11 1 1 1 1 \n", + "12 1 1 1 1 \n", + "13 1 1 1 1 \n", + "14 1 1 1 1 \n", + "15 1 1 1 1 \n", + "16 1 1 1 1 \n", + "17 1 1 1 1 \n", + "18 1 1 1 1 \n", + "19 1 1 1 1 \n", + "20 1 1 1 1 \n", + "21 1 1 1 1 \n", + "\n", + " VAR_MASSDEV_SCORE VAR_MI_SCORE VAR_MI_RATIO_SCORE \\\n", + "0 1 1 1 \n", + "1 1 1 1 \n", + "2 1 1 1 \n", + "3 1 1 1 \n", + "4 1 1 1 \n", + "5 1 1 1 \n", + "6 1 1 1 \n", + "7 1 1 1 \n", + "8 1 1 1 \n", + "9 1 1 1 \n", + "10 1 1 1 \n", + "11 1 1 1 \n", + "12 1 1 1 \n", 
+ "13 1 1 1 \n", + "14 1 1 1 \n", + "15 1 1 1 \n", + "16 1 1 1 \n", + "17 1 1 1 \n", + "18 1 1 1 \n", + "19 1 1 1 \n", + "20 1 1 1 \n", + "21 1 1 1 \n", + "\n", + " VAR_ISOTOPE_CORRELATION_SCORE VAR_ISOTOPE_OVERLAP_SCORE \n", + "0 1 1 \n", + "1 1 1 \n", + "2 1 1 \n", + "3 1 1 \n", + "4 1 1 \n", + "5 1 1 \n", + "6 1 1 \n", + "7 1 1 \n", + "8 1 1 \n", + "9 1 1 \n", + "10 1 1 \n", + "11 1 1 \n", + "12 1 1 \n", + "13 1 1 \n", + "14 1 1 \n", + "15 1 1 \n", + "16 1 1 \n", + "17 1 1 \n", + "18 1 1 \n", + "19 1 1 \n", + "20 1 1 \n", + "21 1 1 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ + "length = 22\n", "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (feature_transition['feature_id'] + 1) #should be equal to product_mz * (feature_id + 1)\n", "feature_transition['TOTAL_AREA_INTENSITY'] = feature_transition['AREA_INTENSITY']\n", - "feature_transition['APEX_INTENSITY'] = [1] * 19\n", - "feature_transition['TOTAL_MI'] = [1] * 19\n", - "feature_transition['VAR_INTENSITY_SCORE'] = [1] * 19\n", - "feature_transition['VAR_INTENSITY_RATIO_SCORE'] = [1] * 19\n", - "feature_transition['VAR_LOG_INTENSITY'] = [1] * 19\n", - "feature_transition['VAR_XCORR_COELUTION'] = [1] * 19\n", - "feature_transition['VAR_XCORR_SHAPE'] = [1] * 19\n", - "feature_transition['VAR_LOG_SN_SCORE'] = [1] * 19\n", - "feature_transition['VAR_MASSDEV_SCORE'] = [1] * 19\n", - "feature_transition['VAR_MI_SCORE'] = [1] * 19\n", - "feature_transition['VAR_MI_RATIO_SCORE'] = [1] * 19\n", - "feature_transition['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * 19\n", - "feature_transition['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * 19\n", + "feature_transition['APEX_INTENSITY'] = [1] * length\n", + "feature_transition['TOTAL_MI'] = [1] * length\n", + "feature_transition['VAR_INTENSITY_SCORE'] = [1] * length\n", + "feature_transition['VAR_INTENSITY_RATIO_SCORE'] = [1] * length\n", + "feature_transition['VAR_LOG_INTENSITY'] = [1] * length\n", + 
"feature_transition['VAR_XCORR_COELUTION'] = [1] * length\n", + "feature_transition['VAR_XCORR_SHAPE'] = [1] * length\n", + "feature_transition['VAR_LOG_SN_SCORE'] = [1] * length\n", + "feature_transition['VAR_MASSDEV_SCORE'] = [1] * length\n", + "feature_transition['VAR_MI_SCORE'] = [1] * length\n", + "feature_transition['VAR_MI_RATIO_SCORE'] = [1] * length\n", + "feature_transition['VAR_ISOTOPE_CORRELATION_SCORE'] = [1] * length\n", + "feature_transition['VAR_ISOTOPE_OVERLAP_SCORE'] = [1] * length\n", "\n", "feature_transition = feature_transition.drop(columns=['precursor_id', 'PRECURSOR_ID', 'TRAML_ID', 'PRODUCT_MZ'])\n", "\n", @@ -2511,7 +3017,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "id": "588ee10a-7b00-4f61-b305-2a392b5bbd1b", "metadata": {}, "outputs": [], @@ -2528,7 +3034,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "78f13b81-7db1-46ee-947b-723e8b0b340b", "metadata": {}, "outputs": [], @@ -2554,7 +3060,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "id": "8c143ee6-928c-4404-85cc-5fc7b9b1ce85", "metadata": {}, "outputs": [], @@ -2582,17 +3088,17 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 36, "id": "9a24a430-d994-4012-9ac3-37c792e30026", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 105, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2604,7 +3110,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 37, "id": "563c3e64-e528-457c-ba42-f00fabcff0f0", "metadata": { "tags": [] @@ -2630,7 +3136,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 38, "id": "c8539aa2-f946-4be8-a891-0b55d6957322", "metadata": {}, "outputs": [ @@ -2667,7 +3173,7 @@ " \n", " 0\n", " 0\n", - " 1200\n", + " 1540\n", " 1\n", " 1\n", " 1\n", @@ -2676,7 +3182,7 @@ " \n", " 1\n", " 1\n", - " 2400\n", + " 3080\n", 
" 1\n", " 1\n", " 1\n", @@ -2685,7 +3191,7 @@ " \n", " 2\n", " 2\n", - " 6000\n", + " 9000\n", " 1\n", " 1\n", " 1\n", @@ -2694,7 +3200,7 @@ " \n", " 3\n", " 3\n", - " 12000\n", + " 4800\n", " 1\n", " 1\n", " 1\n", @@ -2712,7 +3218,7 @@ " \n", " 5\n", " 5\n", - " 9600\n", + " 12600\n", " 1\n", " 1\n", " 1\n", @@ -2721,7 +3227,16 @@ " \n", " 6\n", " 6\n", - " 14700\n", + " 14000\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 7\n", + " 7\n", + " 12800\n", " 1\n", " 1\n", " 1\n", @@ -2733,33 +3248,35 @@ ], "text/plain": [ " feature_id SCORE RANK PVALUE QVALUE PEP\n", - "0 0 1200 1 1 1 1\n", - "1 1 2400 1 1 1 1\n", - "2 2 6000 1 1 1 1\n", - "3 3 12000 1 1 1 1\n", + "0 0 1540 1 1 1 1\n", + "1 1 3080 1 1 1 1\n", + "2 2 9000 1 1 1 1\n", + "3 3 4800 1 1 1 1\n", "4 4 2000 1 1 1 1\n", - "5 5 9600 1 1 1 1\n", - "6 6 14700 1 1 1 1" + "5 5 12600 1 1 1 1\n", + "6 6 14000 1 1 1 1\n", + "7 7 12800 1 1 1 1" ] }, - "execution_count": 107, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "length = 8 \n", "score_ms2 = feature_ms1[['feature_id']].copy()\n", - "score_ms2['SCORE'] = (features['id'] + 1) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n", - "score_ms2['RANK'] = [1] *7\n", - "score_ms2['PVALUE'] = [1] * 7\n", - "score_ms2['QVALUE'] = [1] *7 \n", - "score_ms2['PEP'] = [1] *7\n", + "score_ms2['SCORE'] = (score_ms2['feature_id'] + 1) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n", + "score_ms2['RANK'] = [1] * length\n", + "score_ms2['PVALUE'] = [1] * length\n", + "score_ms2['QVALUE'] = [1] * length \n", + "score_ms2['PEP'] = [1] * length\n", "score_ms2" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 39, "id": "4cbda7cf-0535-4292-bfed-739a5f1bd2b8", "metadata": {}, "outputs": [], @@ -2774,7 +3291,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 40, 
"id": "adb88443-6d34-4173-8b37-9f52dba9f5e7", "metadata": {}, "outputs": [], @@ -2800,17 +3317,17 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 41, "id": "e0094b3a-5a80-48e4-8041-a537ce409480", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 110, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2822,7 +3339,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 42, "id": "acf865f3-3353-4baa-b83e-91be2abed776", "metadata": { "tags": [] @@ -2849,7 +3366,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 43, "id": "f95142b9-612b-43a8-bb42-356b71839ea6", "metadata": {}, "outputs": [], @@ -2859,7 +3376,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 44, "id": "43828588-c1ff-4943-a7b7-24a968562c4e", "metadata": {}, "outputs": [ @@ -2955,7 +3472,7 @@ "6 0 GGGGGGGGGGR GGGGGGGGGGR 0" ] }, - "execution_count": 113, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -2966,7 +3483,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 45, "id": "f416e48b-6bb6-4cb7-8d81-597cfd52320c", "metadata": { "tags": [] @@ -2979,7 +3496,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 46, "id": "0e1eb1b9-730e-45d4-9618-fd532c1ccc25", "metadata": {}, "outputs": [], @@ -2995,7 +3512,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 47, "id": "e9692e06-ddf2-4f74-bb80-f2a92728767b", "metadata": {}, "outputs": [], @@ -3010,7 +3527,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 48, "id": "f720c22b-e6fa-4ac0-8402-bdcd2e74840b", "metadata": {}, "outputs": [], @@ -3036,17 +3553,17 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 49, "id": "94c860e0-880a-4091-afb8-af368ed72b26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - 
"execution_count": 118, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -3058,7 +3575,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 50, "id": "1c053178-b8a5-44ad-876b-49b5fd8afa23", "metadata": { "tags": [] @@ -3085,7 +3602,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 51, "id": "d70ba894-55bf-4a36-b306-45b7c5e9d1bd", "metadata": {}, "outputs": [], @@ -3095,7 +3612,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 52, "id": "3e1bbbeb-7cc9-4b9f-b898-5148abff911d", "metadata": {}, "outputs": [ @@ -3162,7 +3679,7 @@ "3 0 Decoy_ProtT 1" ] }, - "execution_count": 121, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -3173,7 +3690,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 53, "id": "7b3410b1-5d6a-4e85-838c-5ccb3b15f1c5", "metadata": {}, "outputs": [ @@ -3260,7 +3777,7 @@ "3 0 0 1 1 1 global 1" ] }, - "execution_count": 122, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -3278,7 +3795,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 54, "id": "51408b81-b650-4787-9050-59d63c0098c0", "metadata": {}, "outputs": [], @@ -3293,7 +3810,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 55, "id": "ab6a68b5-f5db-46e7-95e8-6e98a69eb062", "metadata": {}, "outputs": [], diff --git a/tests/data/dummyOSWScoredData.osw b/tests/data/dummyOSWScoredData.osw index 84e43d5c856e7d65c3100771311fd97510683068..a3fa86dd203d8283f428b012fcdce03de3e6d498 100644 GIT binary patch delta 3476 zcmZ`+eQXrR72o;VnO)!RTi@rN3AXugd?9X&F{U3;icKscrBy_cr350?Dos-gk)f?C zWLNU}qEPy6h=b05kV;KksrjRng;<7An|`o8e_J;MY26SqDRi*UGMzzz5S z?!Zm7@fUcIGex*ygfPaLMJQ*)q7JSi;hcf6f5W@*cX$Ubz`5dzgTI7_BqPe?CiX(; zd7KKILl_kd1*M##85}ZYX)_#V40s>@0heF~SDc1EK<`#rWv{k;WzSH*#kiGHdibbK zIY}uRytauRGd&{;KcVYo;B+rO|g*_Wi8({R5c8g|DFRXK}*o@Q6U? 
ziOB?dPXr5fpP~C|-g^n#4VZ=Bz;XC7&F zEN1ohe+qvQE=Vk9i8tblY0)lT?u9%N3h=V?0Ip#I^C}sD$$doQJFM zPaPn7K8rs83O4ElNgbbQ>QDoxX^=6D0AI5pY_7vrn(P0D$_z!1|}S1L%IDFoHL2tiOt^Y}5oHm;z}Kn8ycKZ9}jDr|*^pskQ|;1h9W zTPcSUgAg6{Ko@IeTka*KO)77L@CUl+G~0Zy#k9$TWw7EC19Po}O^nTbyxT0ZG00U5I^#5r-&;;>@_5S$Mk`x%q2Yuc^T>unGdM zxEWA`m0787OuCLO;7V$)Qn2%|B&!>~2{{;nAsB?6h1@#G@)Q0I<~H^gx3E_N%M0l(*P>aj z#mZI^qAA!*m>a}U8SEv=aW4_+6v;{Eu~cMtUbd14%t$dZ9@$;Ubw{!*Qx0jN1QNtj z7PXR?kFJ3oU7zh@tJsb}vdqflB1|H{46j4XN;a$|WtkO;3dRcT#LIR%5wimAXqj0V zuQj7$Sl?Zx>m8(tu@nsFCG za1@@0?a&F*z2u%KDjVfJ!^RA@GgV0rE+8{} z!6p+uUn1z#$d#%zPS+sW0+RBc=T1Jn7h|eQ`s4z#o!ZtjRY%R9-DBn0E7G`FgCw@9 z1!CaMui=D1)&ljp8Eloia2synx$rkU7tZ5xxMJ@#T>xI34u$DSIR&%->Bd@_bRxDL5XBZCA~k-?GdTs7czJYhDt z@3}vBcf0LQsj-!vab&MrgP%z&e`u7nHEUW!nB62bh6I=v&1e{@o6H5!eVWWLK6}L) zlK=iF$q=6Z#VAS2O?+Wii{>tjzD=wKq-hIdu9vQ{s!lc|V_~HAk~OLZvx_|yLYJI0 zqGo?yj**PK-)y6QfKnmXY?k`JLHY=XlF@xA*I7DqfP7RnHPFKb{N9~$Uy5v&Pm5MQ z$!~GSK1~l7a{FlMg;_dCO6*0FAk;8QO@~Q`EWPp)d4ifBN|>qMW%(Z*AbzQDKk-O@ z@&M^xualK@P8)dHc>`XBBk&yTgvVh$G=bq>!-bB!lkTYdwELL5)=fHhoxeJ#3%M7a zte=ePyT_tBmC3Z9_WeYNCXx2io*xg<#M3IR{8+ulbe&VqDL)#bah&XwGvzmis3K0$ zDf(W$M#Q`kZ^VZXP1GCmhI}_f)94L)gT7O*$?^tyRbKTY^(t=5%kr|{5Tdc<1$n`@ zLo`ahDc|(1dJTTSJ6MOWmT0m_Jl)XeE?t1J{{eGWhoJxf literal 237568 zcmeI*37i~Nod@uGb@u(AD>fm=bS8w%Bq4Dofq=j!ote(ipUjn>o*WYC4P<~ZkdVs{ zs4F{=^@FP-h$}81ycT7baD5c{$gYZj>;@yp`e6+wB79;{l!y={unPNO-+NWBy84XW?gSM{#?*3tc{U)2mQ?HSoToVs}Z#>-c4PR-ToG{&@rsg$N^XYv0Z;Q!S$ zo4+wtguk-D`&w_$(q?~Rg~gZZ^oB??o8EHY)XISQ5P$##AOHafKmY;|fB*y_009WR zivlN4;DwDJ7v~|Pw>16_69gat0SG_<0uX=z1Rwwb2tWV=$De?~bY_VDzdSuZ__vF@95GKfCj4w0SG_<0uX=z1Rwwb2tWV=5O_}nCfHVJ>eNuT zKjr*)-MPt@#imUQSu2avlH%KyOQNO~Zfp$Y^Ev*n!LZo)@gY+t$hqPBbc;=z#2@GH zPCS{Maw%r9>C;1o7nd(fN+M$XzfI3;^gRFV{{SBa@Cv=~o&caw2tWV=5P$##AOHaf zKmY;|fB*#ED*-Xu-xk0BTl@)tVJ031$V@|o{3(E9Jb?K9-=U{9`V+lI`{+0PI>66) z6cYp>009U<00Izz00bZa0SG_<0!O!i;qbxza_qlh$g%&r&WHVnLh|AN{O1P}Ez$qi>W=OeLhB#^0SG_<0uX=z1Rwwb2tWV=5I8acG5#O@|06R*ZU{gC0uX=z z1Rwwb2tWV=5P-nZE`Z(000; zK1Z7@ofIWT>bmT;&+sPBJ2F 
zlDE|TV*$KU?OCJgos;z967Meq6gsrW^rm+vu?TC6GucQx_(uf_9c%Q+J7WC5L%THo z^#5J-0{<(32l!tAcC-@r8Mjkep4Lq#3>Uod7&amqE3mF#i zmxOL5_)A>3;`}A1n=ylhqGptb5i`QWuo>oI$O`erj%7Lg#kS=0|M|oI>l^ zhVG)jrSoYX#S_0xJe;^Wu{N)KiMaW=PDQ(ZnVE$Jxmays-6>Y}_3U3+pyotN4z zj{=GFJT)(|Ltm+jk~=;;o#L#mSaQNko8!smUeLkb` z@sha(x_82hor15q_Di>{k502o%JOcpcfqxvs1=z#%_>Re-EZ%1Y;A|Ca>rD&q)NkW znY)#>AG=jvJi~~ycK%zxx2NjCJM`U(FH!ZL)~3h9fwZMb#rj^X>i+R`77xj!?&_8% z73)0g);}!n|8JsCYxFj~N&9IZ{hFTVPXc(Fp5V^|e3-sR_tU*}2i?Z+0l1lN(+*<*}Jphd9hIl%FZWy7keExr8vqq264YZslCw`N- zn@2H000Izz00bZa0SG_<0uX=z1ddo>z7w%+<~X{;*xFa;-_uc;xPjJgxpU^D8@3w@ zuX*Tmt6C5INWJPS7Hs|Kp6+{^K4w|-?PA7NTOTp(TefXe8TEx%FFf$b=T^0;SN3`_ z;~5su7;qELM2#-8=8Pw|@IXmCstZc9b{k zqkGh=_G}=dQOr1d(Q0;W@6eFSXf7Pl+?IN;+GE9xXXtK5#%(6=+_!A`b$hQ@8SRBD znOmdxx+aj174xa|>XqlyZhIzj?gsT5$mc%)|8|W({qIHE&42ZOi0-4?k9ZTK00=+; z0uX=z1Rwwb2tWV=5P$##4qG5**#4CgM?}=s4_idk6%9*7)MX0(m_VDUOAm&KsLKg@ z)Ud;@tO1Rwwb2tWV=5P$##AOHafK;T^xnBz3s z1|KyL4o+(LK)BWRM-HeT#pl>nGN_KT^YvB!hyj&hw#75>Q3AmV!m4v^pZsk#LO`XT zV^*m^^}mhtuk%L;_!*2U8T>wa`~rVufS*CHl0o(3*(bl_j|lKHuqqi;U!Q$)n?Dpl zWsv><`0E%`Riwbn{tM&4;?9ht}bpEUM|LMEhIO|l6en^w)`ozug7ZaDp^RdzRwCHal zuSD;U41^malR}p{PuO|qOV*Hes{Rzuf(Zf;fWQeNaN0OKnV!J3k#%c^uiUiZvXRZh zh1FX&ub1({;!HX}l*<&_y*KY0{dkj;oG^i1+rN4Bg_jL`v%QybojIO9lgg(T^<+}s z%+_pID%+P&4dk-D>Du|v@Rs}Y26CCsq1<49u3|}M#XupK zPw`56ibaT-*B*-nXZK_l=L^fTUHS9Mip*uZ&#Stm?V5ChlbkYz zZS9q-pUCM??;7gN7kWQf*1FE~(z))8 zG?j+=l>?cSnAzIi=BlRqIA&TjD^Om4t}ENe;l#F(`MNUsOlLmZ*X_C0yq zudG#Cc{RT zR5Z`uo2xXid|7?XCYGlhvypz{@@8J+B%EY(GrRUfvYGyo=D(qm2KciNci-*K^$!gc zdeVzBJ%K&YFYe&`^jN!;TIQb##D1n`cIC4Cpw4G|Gh^?B?PMo2&9!#IVoTfow{@cv zg!UwVC^M|>bw;tv=)mf+k~_H%E}hBjdn(z=6GpTIre?A+Hi;^*Vv9@($|kAJmh?cSTnXr1FE z&pw-t&JV6qfqz!zGub}>0PxO|WwS%oR=GTHtE#h1bCxQsY<}C-mP8&M^0vQhiI>P@bZ2K~aIp9Zs`mS8g!Y((9t}T{$d);yrs*V4Kb?)vEFtX9^!cA*iXb@8 zDqR&%IGCcQoJ>^CwoZ4FZt3_m~SX1OU^H7$2-Z!Mm9Q5wu$$E`M-%J?cg)i*{o`>c`^TnRe9EJn|89DJgc$RDOPl~+j%f4 zw%9$vZcU%$B-`8Bb*IUOsMRg@ek$ioC~0Qp@>Vsg?9*mevCxW6eVTXYE4q&A<2KLx 
zu|0(`lcWbSD-M#sHyh{-EKBnX3#zADkXyXw(wyi-S54E?-(4)FEdL6=o8|a-`b^J| z?1Q`4D?~3v)a@=EJdd}#ET1TIu&-)$z3IWhuFSIP%d)j**KU4xus`2Fkl~HYR!Va)ZAUh+~7yLPXsR* zxmSkN>Ao7vbG?IIwHEdF=X*2!IvBq`R6(F>A58ak^GkxIN5YugUWy;&$Tl0`n8LHQtEz~}wyi1Q&Gakwr!9QQrYIN-rM_@^<%qz#( zhrRPufp;G>J%z>DeARWx_KtDt@)!DRQMRpl0{2*}cAyOQ_wf^BPrAD-lRLlKsnW|K zPncCRiG7QAmKWDtn9pVU%6AL5)ZF0GVy-q#V;7w`Uzq+PYq~Z6KKUVX-TJZ*5%sGB z|H{^$tnnez{(t)r8S`61?N12t0Y2v0r{;;JQ~pLUSpC@I=OcbWYf-jOT-&MkN%Ag! zR`4T{bB=K|9^T1Htyl4o34F`(&j$YLRZUjy2oIjq#;ikTu2O!R82{fC|AC5E|DMD{iLWL;k@!&JoWz9qpX0mY55#Zg2{Azc0uX=z1Rwwb2tWV=5P-m8 z2sAe}*i6?;#yfeubm+5>~?lH$7&}CIX`oL)*P)&poZ)_ z#+AmE&5_y(Y-6*rxj9^!K#kP5^ws+6=1}bfrhb8bL9009U< z00Izz00bZa0SG_<0`HDMz2n&O77%skNBV2^wl_=N$}zRI-tuOtTQ8jdtT(+`>Q0EJ zuhkpgEOl!`^nrTao270gu&%6U-Yhi&K>xlN|8LR1X!HkqmUhqs{4FL3KmY;|fB*y_ z009U<00Izz00jO*0tqWLUJHfz69g`~Z2g6+FRL?oT;A31#x1MPD4k=Pb$aO>U9V%M za|}cD{|$Ol^#7mZFPI#WkKy%3(;)x>2tWV=5P$##AOHafKmYu(i1{bP;)x9NFx|NlPv zHIHC|00bZa0SG_<0uX=z1Rwwb2teRi5)c;vEblUaxFR420O)*NK$w~0_kUab{?F)j zjsL?00SG_<0uX=z1Rwwb2tWV=5P-mOBoH<1Fs|n$SM$BNu2u5`D|Bsms zkA^}30uX=z1Rwwb2tWV=5P-l5B_O)*YhRszPe)^`Z`>cN%THbB{VP=STx?zOEV*I~B``zdN z?LQt?80!E52tWV=5P$##AOHafKmY;|fPf~R0w|yV$Ab_vME_qt|6lz6zn}2@f9xU< zfB*y_009U<00Izz00bZafn!fVKL6hqp(+0gfMLp!0HweGH_@lv@&8``pLWsH{7Hb@ z>0$aF-B0(@9dsLgnQo>Vc|uGOfB*y_009U<00Izz00bZa0SFw4fasYwHi@vIQH1pk zBCM+yA=Qa6K_ZMNL>P;UFd7qKBr3viM1-NR2%V4!ZAXL_`u|6=S&$b35P$##AOHaf zKmY;|fB*y_@U9AozWV=r|3A)GVFG(hqio{yiPreWxD^|Uz8U>iba~`Yk)`23hL?o) zhh{k+w*P2<+>Tq(&fkIkM);!o|a@ zH*eWET-dyE^}0T#GAi< zo|DW>VWXDc`as!%w^j#6ZHp+mSV*J;i*fR^fH>Tqiliz0US)tg;PDN>SRy zlIB;m;9zAOl1$r%4kvl;bT%6G+qUVV^?dKTV&nSF!z1eo8@H@0jI0^#?B_exK(0TZ z$@ZO5(UfXgMD-=vQmwh6KE5&9{-&L80AA&U8da*Q%c?#7Ak`MLRjJx*x~P7``)jQ6 z@D-a!)(n@fab_T&?aIunRdq?0K(%hR;u_DXvBqBhE4){Kjotin4^nM0Ta~H@+%m)( zUp||!adYK1zI;8WhA(civmS1m0;?c_Xv zdqtJivxrIq*-ERORlC~uzY#hF5 z%f?OXHx@2meZ>_c>n`C3mYNeh!SG`%(>atI?9UZ?(*pz9zV0~{Z9C**MRSX1j>@7_ 
zt;O2J`DxJ%Cpp5`;G#oYgFrjRUWL;RZxu?)J@`5V){w8niZ(krX9`ZT;g|9ewJ&<89QL6}2C{C?Qd!IKCl^NBOGhcJizV zY*Vq7-YHzQa%7Fyni-X?s7NYnbeK46x4}Cz53sgS}R(ilBidAcks>^Qenj@cCtNatdhDPlU~y4HA{AriFNTy z^*$84rrOE2Ms~ix^+ZxPIZttFMTy15qBNdFwr^ES+1mTNSNWbft)j%rqyckxA2@A_ zll;&EHoC-bW$|pLviqB^SfhVxQ~QH=G@Me|2LI$(yExgZT1A!|5o=TYb8kZMb5Cuq zVzd3-VQNKVswFPjbfss}xy?>;HOCn|6dZp&#X+gg{AD~X6)h?@yMiZB;K6|W`uO2; z>0~>ZZe~|kYjdDTzl~GIYEvLXNgLdqXWk?yIej|2X8PvU7hX18^3|{;lJcs^rx*2P zQY|IRT2ieoo&A0J%!+(VN`$jgE$#u-!aF$mOm`;7=cv<7wktSS7SxjBy`dhFW?8yt zDC0+$mPYxOa2K_W&YM`J+3tDGYnmHruG(yO*&#ODJ6paR%@+6nH_;8=^Z)4g{Q3XC zqF>T8^i%o?{X6}T{*~^dyXl|l>vSu9ktf6i0SG_<0uX=z1Rwwb2tWV=5P-lD3&>&n zh79=w`dCwiNF*AzYL%;-7qxy{6B;Cd(Zzr zMpq8ah5!U0009U<00Izz00bZa0SG{#R6q;>;6MF`nf(3V5WoK$^qTkke`!+0AOHaf zKmY;|fB*y_009U<00Iy=Rs|}5|8Jtt7RUd;NPFlP^fP*jeoQ~2AJBK{TXZLVlfFv- zNH_6>m>>WF2tWV=5P$##AOHafKmY;|I5GioqkdhJ2&qwoi3SnI>qQu=6JeA@7)gjQ z92a3ICPF7FLOUWtD=b1YBtpXxp>E5NkN*#g@&92l{y!wf|A*xG|0eph+yCEB`{>v7 zJnf>V=?U6S57YPPe!7?LpxfxnbTi$^6Jmk@1Rwwb2tWV=5P$##AOHafK;TFOL?69T z_Wm1W@4sI5{_AA#pJeYpA$$LE+53;l-hWi~{v)#YAC|rUknH_CviEPx-oGV#|E49c z2gt_&>b(EYZvgQ7|A}21{gL> zDPwAx5jR7{WqORq!fKhg|Gz2mKN`JGFVb%QtN%lEAKlK!`hSM5rj0aAD`*M*6}9tg z07;7QgqR=z0SG_<0uX=z1Rwwb2tWV=$C^OQu$j6f&=ia`iiqm}Hv}W~!APBmsIEQ* zBZ*)n9*o39L|p)g1|ty>QN8_eFcJzz91&3+eLEO2gArXsR4?BOMvSOY?En8S0#CSu diff --git a/tests/test_pyprophet_export_parquet.py b/tests/test_pyprophet_export_parquet.py index 4e2b37df..453733ca 100644 --- a/tests/test_pyprophet_export_parquet.py +++ b/tests/test_pyprophet_export_parquet.py @@ -49,36 +49,60 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin ### This file was configured in a way where the following tests should work parquet = pd.read_parquet("dummyOSWScoredData.parquet") ## automatically with parquet ending of input file name + ### CHECK LENGTHS ### if transitionLevel: if onlyFeatures: # length of FEATURE_TRANSITION table - expectedLength = len(pd.read_sql("select * from feature_transition", conn)) + if 
not noDecoys: + expectedLength = len(pd.read_sql("select * from feature_transition", conn)) + else: + expectedLength = len(pd.read_sql("select * from feature_transition inner join transition on transition.id = feature_transition.transition_id where DECOY == 0", conn)) else: - featureTransition = pd.read_sql("select * from feature_transition", conn) - precursorTransition = pd.read_sql("select * from transition_precursor_mapping", conn) + if not noDecoys: + featureTransition = pd.read_sql("select * from feature_transition", conn) + precursorTransition = pd.read_sql("select * from transition_precursor_mapping", conn) + else: + featureTransition = pd.read_sql("select * from feature_transition inner join transition on transition.id = feature_transition.transition_id where DECOY == 0", conn) + precursorTransition = pd.read_sql("select * from transition_precursor_mapping inner join transition on transition.id = transition_precursor_mapping.transition_id where DECOY=0", conn) + featureTable = pd.read_sql("select * from feature", conn) numTransNoFeature = len(precursorTransition[~precursorTransition['PRECURSOR_ID'].isin(featureTable['PRECURSOR_ID'])]) expectedLength = numTransNoFeature + len(featureTransition) - assert(expectedLength == len(parquet)) else: if onlyFeatures: # expected length, length of feature table - expectedLength = len(pd.read_sql("select * from feature", conn)) + if noDecoys: + expectedLength = len(pd.read_sql("select * from feature inner join precursor on feature.precursor_id = precursor.id where decoy = 0", conn)) + else: + expectedLength = len(pd.read_sql("select * from feature inner join precursor on precursor.id = feature.precursor_id", conn)) else: # Expected length is number of features + number of precursors with no feature - featureTable = pd.read_sql("select * from feature", conn) - precTable = pd.read_sql("select * from precursor", conn) + if noDecoys: + featureTable = pd.read_sql("select * from feature inner join precursor on 
feature.precursor_id = precursor.id where decoy = 0", conn) + else: + featureTable = pd.read_sql("select * from feature", conn) + + if noDecoys: + precTable = pd.read_sql("select * from precursor where decoy = 0", conn) + else: + precTable = pd.read_sql("select * from precursor", conn) numPrecsNoFeature = len(precTable[~precTable['ID'].isin(featureTable['PRECURSOR_ID'])]) expectedLength = numPrecsNoFeature + len(featureTable) - assert(expectedLength == len(parquet)) + assert(expectedLength == len(parquet)) - ########### FEATURE LEVEL TESTS ######## + ########### FEATURE LEVEL VALUE TESTS ######## # Tests that columns are equal across different sqlite3 tables to ensure joins occured correctly + # since cannot compare NAN drop rows which contain an NAN + na_columns = ['PRECURSOR.LIBRARY_INTENSITY'] # this is a list of columns which expect to be NAN + parquet = parquet.drop(columns=na_columns).dropna() + + assert(len(parquet) > 0) # assert that did not just drop everything (means that missed an na column) + if transitionLevel: ## check features and transitions joined properly for those all cases (including those with no features ## Way library was created precursor and transition m/z both are in the same 100s (e.g. 
if precursor m/z is 700 transition mz can be 701) @@ -86,29 +110,25 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin ### Note: Current tests assume no na parquet = parquet.dropna() - proxy_feature_id = parquet['FEATURE_ID'].astype(str).apply(lambda x: x[0]).astype(int) # since id is complicated, dummy values created using a proxy id which is the first digit of the actual id pd.testing.assert_series_equal(parquet['FEATURE_MS1.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS2.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS1.EXP_IM'], parquet['FEATURE_MS2.EXP_IM'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS2.DELTA_IM'], parquet['FEATURE_MS1.DELTA_IM'], **pd_testing_kwargs) - pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * (proxy_feature_id), **pd_testing_kwargs) - print(parquet.columns) + pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * (parquet['FEATURE_ID'].astype(int) + 1), **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PEPTIDE.SCORE_GLOBAL'], parquet['PEPTIDE_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PROTEIN.SCORE_GLOBAL'], parquet['PROTEIN_ID'], **pd_testing_kwargs) - print(parquet) # check is/no decoys if noDecoys: assert(parquet[parquet['DECOY'] == 1].shape[0] == 0) - else: - assert(parquet[parquet['DECOY'] == 1].shape[0] != 0) + ############### TRANSTION LEVEL TESTS ################ if transitionLevel: - pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * (proxy_feature_id), **pd_testing_kwargs) + pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], 
parquet['TRANSITION.PRODUCT_MZ'] * (parquet['FEATURE_ID'].astype(int) + 1), **pd_testing_kwargs) def test_export_parquet_single_run(tmpdir): _run_export_parquet_single_run(tmpdir, transitionLevel=False) From 18aeefea57a75e3ab54d2a1d4fc9ceed5cdcf573 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Sat, 8 Feb 2025 13:37:28 -0500 Subject: [PATCH 8/8] test: feature_id is long integer this is indicative of true OSW files which are 19 digits long. Test calculations are done with a pseudo feature id which is the first digit --- tests/Create_OSW_test.ipynb | 783 ++++++++++++++++--------- tests/data/dummyOSWScoredData.osw | Bin 131072 -> 131072 bytes tests/test_pyprophet_export_parquet.py | 5 +- 3 files changed, 501 insertions(+), 287 deletions(-) diff --git a/tests/Create_OSW_test.ipynb b/tests/Create_OSW_test.ipynb index 5bbca243..0ba87a6f 100644 --- a/tests/Create_OSW_test.ipynb +++ b/tests/Create_OSW_test.ipynb @@ -781,7 +781,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -911,7 +911,7 @@ "outputs": [], "source": [ "## Note: The second numpy array must be edited manually if add new precursors to the library\n", - "features = pd.DataFrame(np.column_stack([np.arange(0,8), np.array([6,6,4,5,0,2,3,1])]), columns=['id', 'precursor_id'])" + "features = pd.DataFrame(np.column_stack([np.arange(1,9), np.array([6,6,4,5,0,2,3,1])]), columns=['id', 'precursor_id'])" ] }, { @@ -993,12 +993,12 @@ " \n", " \n", " 0\n", - " 0\n", + " 1\n", " 6\n", " 6\n", " 1\n", - " 220.01\n", - " 220.01\n", + " 220.02\n", + " 220.02\n", " 220\n", " 0.01\n", " 5\n", @@ -1006,12 +1006,12 @@ " \n", " \n", " 1\n", - " 1\n", + " 2\n", " 6\n", " 6\n", " 1\n", - " 220.02\n", - " 220.02\n", + " 220.03\n", + " 220.03\n", " 220\n", " 0.01\n", " 5\n", @@ -1019,12 +1019,12 @@ " \n", " \n", " 2\n", - " 2\n", + " 3\n", " 4\n", " 4\n", " 1\n", - " 600.03\n", - " 600.03\n", + " 600.04\n", + " 600.04\n", " 600\n", " 0.01\n", " 5\n", @@ -1032,12 +1032,12 @@ " \n", " \n", " 3\n", - " 3\n", 
+ " 4\n", " 5\n", " 5\n", " 1\n", - " 200.04\n", - " 200.04\n", + " 200.05\n", + " 200.05\n", " 200\n", " 0.01\n", " 5\n", @@ -1045,12 +1045,12 @@ " \n", " \n", " 4\n", - " 4\n", + " 5\n", " 0\n", " 0\n", " 1\n", - " 400.05\n", - " 400.05\n", + " 400.06\n", + " 400.06\n", " 400\n", " 0.01\n", " 5\n", @@ -1058,12 +1058,12 @@ " \n", " \n", " 5\n", - " 5\n", + " 6\n", " 2\n", " 2\n", " 1\n", - " 700.06\n", - " 700.06\n", + " 700.07\n", + " 700.07\n", " 700\n", " 0.01\n", " 5\n", @@ -1071,12 +1071,12 @@ " \n", " \n", " 6\n", - " 6\n", + " 7\n", " 3\n", " 3\n", " 1\n", - " 500.07\n", - " 500.07\n", + " 500.08\n", + " 500.08\n", " 500\n", " 0.01\n", " 5\n", @@ -1084,12 +1084,12 @@ " \n", " \n", " 7\n", - " 7\n", + " 8\n", " 1\n", " 1\n", " 1\n", - " 800.08\n", - " 800.08\n", + " 800.09\n", + " 800.09\n", " 800\n", " 0.01\n", " 5\n", @@ -1101,14 +1101,14 @@ ], "text/plain": [ " id precursor_id ID run_id exp_rt exp_im norm_rt delta_rt \\\n", - "0 0 6 6 1 220.01 220.01 220 0.01 \n", - "1 1 6 6 1 220.02 220.02 220 0.01 \n", - "2 2 4 4 1 600.03 600.03 600 0.01 \n", - "3 3 5 5 1 200.04 200.04 200 0.01 \n", - "4 4 0 0 1 400.05 400.05 400 0.01 \n", - "5 5 2 2 1 700.06 700.06 700 0.01 \n", - "6 6 3 3 1 500.07 500.07 500 0.01 \n", - "7 7 1 1 1 800.08 800.08 800 0.01 \n", + "0 1 6 6 1 220.02 220.02 220 0.01 \n", + "1 2 6 6 1 220.03 220.03 220 0.01 \n", + "2 3 4 4 1 600.04 600.04 600 0.01 \n", + "3 4 5 5 1 200.05 200.05 200 0.01 \n", + "4 5 0 0 1 400.06 400.06 400 0.01 \n", + "5 6 2 2 1 700.07 700.07 700 0.01 \n", + "6 7 3 3 1 500.08 500.08 500 0.01 \n", + "7 8 1 1 1 800.09 800.09 800 0.01 \n", "\n", " left_width right_width \n", "0 5 5 \n", @@ -1133,6 +1133,196 @@ { "cell_type": "code", "execution_count": 16, + "id": "cbb86464-d1d6-4de9-92b0-a48a787a6895", + "metadata": {}, + "outputs": [], + "source": [ + "# make id a long string so more realistic\n", + "features['id'] = (features['id'].astype(str) * 19).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + 
"id": "70d20530-d775-4b3f-896d-76a345d42e5e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprecursor_idIDrun_idexp_rtexp_imnorm_rtdelta_rtleft_widthright_width
01111111111111111111661220.02220.022200.0155
12222222222222222222661220.03220.032200.0155
23333333333333333333441600.04600.046000.0155
34444444444444444444551200.05200.052000.0155
45555555555555555555001400.06400.064000.0155
56666666666666666666221700.07700.077000.0155
67777777777777777777331500.08500.085000.0155
78888888888888888888111800.09800.098000.0155
\n", + "
" + ], + "text/plain": [ + " id precursor_id ID run_id exp_rt exp_im norm_rt \\\n", + "0 1111111111111111111 6 6 1 220.02 220.02 220 \n", + "1 2222222222222222222 6 6 1 220.03 220.03 220 \n", + "2 3333333333333333333 4 4 1 600.04 600.04 600 \n", + "3 4444444444444444444 5 5 1 200.05 200.05 200 \n", + "4 5555555555555555555 0 0 1 400.06 400.06 400 \n", + "5 6666666666666666666 2 2 1 700.07 700.07 700 \n", + "6 7777777777777777777 3 3 1 500.08 500.08 500 \n", + "7 8888888888888888888 1 1 1 800.09 800.09 800 \n", + "\n", + " delta_rt left_width right_width \n", + "0 0.01 5 5 \n", + "1 0.01 5 5 \n", + "2 0.01 5 5 \n", + "3 0.01 5 5 \n", + "4 0.01 5 5 \n", + "5 0.01 5 5 \n", + "6 0.01 5 5 \n", + "7 0.01 5 5 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "id": "9b2a1cfc-dcca-45f3-9aa4-e7f75382de3c", "metadata": {}, "outputs": [], @@ -1147,7 +1337,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "4806f254-1c29-4aa5-9822-6cb8c6ea730c", "metadata": {}, "outputs": [], @@ -1173,17 +1363,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "8d03f3ca-a4b0-48e9-bd7f-3c1c7b8d6874", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1195,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "aae782e1-c9f9-4c2c-84f7-a7741e385a5e", "metadata": {}, "outputs": [ @@ -1243,10 +1433,10 @@ " \n", " \n", " 0\n", - " 0\n", - " 220010.0\n", + " 1111111111111111111\n", + " 220020.0\n", " 6\n", - " 220.01\n", + " 220.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1264,10 +1454,10 @@ " \n", " \n", " 1\n", - " 1\n", - " 220020.0\n", + " 2222222222222222222\n", + " 220030.0\n", " 6\n", - " 220.02\n", + " 220.03\n", " 0.01\n", " 1\n", " 
1\n", @@ -1285,10 +1475,10 @@ " \n", " \n", " 2\n", - " 2\n", - " 600030.0\n", + " 3333333333333333333\n", + " 600040.0\n", " 4\n", - " 600.03\n", + " 600.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1306,10 +1496,10 @@ " \n", " \n", " 3\n", - " 3\n", - " 200040.0\n", + " 4444444444444444444\n", + " 200050.0\n", " 5\n", - " 200.04\n", + " 200.05\n", " 0.01\n", " 1\n", " 1\n", @@ -1327,10 +1517,10 @@ " \n", " \n", " 4\n", - " 4\n", - " 400050.0\n", + " 5555555555555555555\n", + " 400060.0\n", " 0\n", - " 400.05\n", + " 400.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1348,10 +1538,10 @@ " \n", " \n", " 5\n", - " 5\n", - " 700060.0\n", + " 6666666666666666666\n", + " 700070.0\n", " 2\n", - " 700.06\n", + " 700.07\n", " 0.01\n", " 1\n", " 1\n", @@ -1369,10 +1559,10 @@ " \n", " \n", " 6\n", - " 6\n", - " 500070.0\n", + " 7777777777777777777\n", + " 500080.0\n", " 3\n", - " 500.07\n", + " 500.08\n", " 0.01\n", " 1\n", " 1\n", @@ -1390,10 +1580,10 @@ " \n", " \n", " 7\n", - " 7\n", - " 800080.0\n", + " 8888888888888888888\n", + " 800090.0\n", " 1\n", - " 800.08\n", + " 800.09\n", " 0.01\n", " 1\n", " 1\n", @@ -1414,15 +1604,15 @@ "" ], "text/plain": [ - " feature_id area_intensity apex_intensity exp_im delta_im \\\n", - "0 0 220010.0 6 220.01 0.01 \n", - "1 1 220020.0 6 220.02 0.01 \n", - "2 2 600030.0 4 600.03 0.01 \n", - "3 3 200040.0 5 200.04 0.01 \n", - "4 4 400050.0 0 400.05 0.01 \n", - "5 5 700060.0 2 700.06 0.01 \n", - "6 6 500070.0 3 500.07 0.01 \n", - "7 7 800080.0 1 800.08 0.01 \n", + " feature_id area_intensity apex_intensity exp_im delta_im \\\n", + "0 1111111111111111111 220020.0 6 220.02 0.01 \n", + "1 2222222222222222222 220030.0 6 220.03 0.01 \n", + "2 3333333333333333333 600040.0 4 600.04 0.01 \n", + "3 4444444444444444444 200050.0 5 200.05 0.01 \n", + "4 5555555555555555555 400060.0 0 400.06 0.01 \n", + "5 6666666666666666666 700070.0 2 700.07 0.01 \n", + "6 7777777777777777777 500080.0 3 500.08 0.01 \n", + "7 8888888888888888888 800090.0 1 800.09 0.01 \n", "\n", " 
var_massdev_score var_mi_score var_mi_contrast_score \\\n", "0 1 1 1 \n", @@ -1475,7 +1665,7 @@ "7 1 1 1 " ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1507,7 +1697,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "8480c6a7-23ab-4a22-9da9-998cbc8606ac", "metadata": {}, "outputs": [], @@ -1522,7 +1712,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "id": "98ff8eac-5acd-435a-b292-fe64d590ea51", "metadata": {}, "outputs": [], @@ -1548,17 +1738,17 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "id": "7691c149-7f67-48de-9bff-2856a44d40eb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 22, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1570,7 +1760,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "id": "863fcd87-d051-4bc9-b88c-8535cbc90c4a", "metadata": { "scrolled": true, @@ -1632,7 +1822,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "id": "30f29e9b-b345-4a02-ba63-74dabf69555b", "metadata": {}, "outputs": [ @@ -1683,11 +1873,11 @@ " \n", " \n", " 0\n", - " 0\n", - " 220010.0\n", - " 220010.0\n", + " 1111111111111111111\n", + " 220020.0\n", + " 220020.0\n", " 6\n", - " 220.01\n", + " 220.02\n", " 0.01\n", " 1\n", " 1\n", @@ -1707,11 +1897,11 @@ " \n", " \n", " 1\n", - " 1\n", - " 220020.0\n", - " 220020.0\n", + " 2222222222222222222\n", + " 220030.0\n", + " 220030.0\n", " 6\n", - " 220.02\n", + " 220.03\n", " 0.01\n", " 1\n", " 1\n", @@ -1731,11 +1921,11 @@ " \n", " \n", " 2\n", - " 2\n", - " 600030.0\n", - " 600030.0\n", + " 3333333333333333333\n", + " 600040.0\n", + " 600040.0\n", " 4\n", - " 600.03\n", + " 600.04\n", " 0.01\n", " 1\n", " 1\n", @@ -1755,11 +1945,11 @@ " \n", " \n", " 3\n", - " 3\n", - " 200040.0\n", - " 200040.0\n", + " 4444444444444444444\n", + 
" 200050.0\n", + " 200050.0\n", " 5\n", - " 200.04\n", + " 200.05\n", " 0.01\n", " 1\n", " 1\n", @@ -1779,11 +1969,11 @@ " \n", " \n", " 4\n", - " 4\n", - " 400050.0\n", - " 400050.0\n", + " 5555555555555555555\n", + " 400060.0\n", + " 400060.0\n", " 0\n", - " 400.05\n", + " 400.06\n", " 0.01\n", " 1\n", " 1\n", @@ -1803,11 +1993,11 @@ " \n", " \n", " 5\n", - " 5\n", - " 700060.0\n", - " 700060.0\n", + " 6666666666666666666\n", + " 700070.0\n", + " 700070.0\n", " 2\n", - " 700.06\n", + " 700.07\n", " 0.01\n", " 1\n", " 1\n", @@ -1827,11 +2017,11 @@ " \n", " \n", " 6\n", - " 6\n", - " 500070.0\n", - " 500070.0\n", + " 7777777777777777777\n", + " 500080.0\n", + " 500080.0\n", " 3\n", - " 500.07\n", + " 500.08\n", " 0.01\n", " 1\n", " 1\n", @@ -1851,11 +2041,11 @@ " \n", " \n", " 7\n", - " 7\n", - " 800080.0\n", - " 800080.0\n", + " 8888888888888888888\n", + " 800090.0\n", + " 800090.0\n", " 1\n", - " 800.08\n", + " 800.09\n", " 0.01\n", " 1\n", " 1\n", @@ -1879,25 +2069,25 @@ "" ], "text/plain": [ - " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY EXP_IM \\\n", - "0 0 220010.0 220010.0 6 220.01 \n", - "1 1 220020.0 220020.0 6 220.02 \n", - "2 2 600030.0 600030.0 4 600.03 \n", - "3 3 200040.0 200040.0 5 200.04 \n", - "4 4 400050.0 400050.0 0 400.05 \n", - "5 5 700060.0 700060.0 2 700.06 \n", - "6 6 500070.0 500070.0 3 500.07 \n", - "7 7 800080.0 800080.0 1 800.08 \n", + " feature_id AREA_INTENSITY TOTAL_AREA_INTENSITY APEX_INTENSITY \\\n", + "0 1111111111111111111 220020.0 220020.0 6 \n", + "1 2222222222222222222 220030.0 220030.0 6 \n", + "2 3333333333333333333 600040.0 600040.0 4 \n", + "3 4444444444444444444 200050.0 200050.0 5 \n", + "4 5555555555555555555 400060.0 400060.0 0 \n", + "5 6666666666666666666 700070.0 700070.0 2 \n", + "6 7777777777777777777 500080.0 500080.0 3 \n", + "7 8888888888888888888 800090.0 800090.0 1 \n", "\n", - " DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n", - "0 0.01 1 1 1 \n", - "1 0.01 1 1 1 \n", - "2 0.01 
1 1 1 \n", - "3 0.01 1 1 1 \n", - "4 0.01 1 1 1 \n", - "5 0.01 1 1 1 \n", - "6 0.01 1 1 1 \n", - "7 0.01 1 1 1 \n", + " EXP_IM DELTA_IM TOTAL_MI VAR_BSERIES_SCORE VAR_DOTPROD_SCORE \\\n", + "0 220.02 0.01 1 1 1 \n", + "1 220.03 0.01 1 1 1 \n", + "2 600.04 0.01 1 1 1 \n", + "3 200.05 0.01 1 1 1 \n", + "4 400.06 0.01 1 1 1 \n", + "5 700.07 0.01 1 1 1 \n", + "6 500.08 0.01 1 1 1 \n", + "7 800.09 0.01 1 1 1 \n", "\n", " VAR_INTENSITY_SCORE ... VAR_ELUTION_MODEL_FIT_SCORE VAR_IM_XCORR_SHAPE \\\n", "0 1 ... 1 1 \n", @@ -1932,7 +2122,7 @@ "[8 rows x 41 columns]" ] }, - "execution_count": 24, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1985,7 +2175,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "id": "d93163c0-20a1-4d98-86de-71c6d265d418", "metadata": {}, "outputs": [], @@ -2000,7 +2190,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "id": "db1cbc3f-e463-43a6-895a-979c7aafe393", "metadata": {}, "outputs": [], @@ -2026,17 +2216,17 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "id": "bca8bfba-86e9-497c-ae94-4c0f679b45f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 27, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -2048,7 +2238,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "id": "4678179b-aea7-460f-ad71-d157a1e3ce38", "metadata": {}, "outputs": [], @@ -2058,7 +2248,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "id": "231cf0c6-01ac-4061-b1a9-d68404f793b3", "metadata": {}, "outputs": [], @@ -2068,7 +2258,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "id": "7f730c1c-73a3-4ce1-a2b3-35b064edd558", "metadata": {}, "outputs": [], @@ -2079,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "id": 
"46a4405d-c5c9-48a6-aee5-abb2a3b9b047", "metadata": {}, "outputs": [ @@ -2115,7 +2305,7 @@ " \n", " \n", " 0\n", - " 0\n", + " 1111111111111111111\n", " 6\n", " 6\n", " 6\n", @@ -2124,7 +2314,7 @@ " \n", " \n", " 1\n", - " 0\n", + " 1111111111111111111\n", " 6\n", " 7\n", " 6\n", @@ -2133,7 +2323,7 @@ " \n", " \n", " 2\n", - " 1\n", + " 2222222222222222222\n", " 6\n", " 6\n", " 6\n", @@ -2142,7 +2332,7 @@ " \n", " \n", " 3\n", - " 1\n", + " 2222222222222222222\n", " 6\n", " 7\n", " 6\n", @@ -2151,7 +2341,7 @@ " \n", " \n", " 4\n", - " 2\n", + " 3333333333333333333\n", " 4\n", " 15\n", " 4\n", @@ -2160,7 +2350,7 @@ " \n", " \n", " 5\n", - " 2\n", + " 3333333333333333333\n", " 4\n", " 16\n", " 4\n", @@ -2169,7 +2359,7 @@ " \n", " \n", " 6\n", - " 3\n", + " 4444444444444444444\n", " 5\n", " 3\n", " 5\n", @@ -2178,7 +2368,7 @@ " \n", " \n", " 7\n", - " 3\n", + " 4444444444444444444\n", " 5\n", " 4\n", " 5\n", @@ -2187,7 +2377,7 @@ " \n", " \n", " 8\n", - " 3\n", + " 4444444444444444444\n", " 5\n", " 5\n", " 5\n", @@ -2196,7 +2386,7 @@ " \n", " \n", " 9\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 8\n", " 0\n", @@ -2205,7 +2395,7 @@ " \n", " \n", " 10\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 9\n", " 0\n", @@ -2214,7 +2404,7 @@ " \n", " \n", " 11\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 10\n", " 0\n", @@ -2223,7 +2413,7 @@ " \n", " \n", " 12\n", - " 4\n", + " 5555555555555555555\n", " 0\n", " 11\n", " 0\n", @@ -2232,7 +2422,7 @@ " \n", " \n", " 13\n", - " 5\n", + " 6666666666666666666\n", " 2\n", " 17\n", " 2\n", @@ -2241,7 +2431,7 @@ " \n", " \n", " 14\n", - " 5\n", + " 6666666666666666666\n", " 2\n", " 18\n", " 2\n", @@ -2250,7 +2440,7 @@ " \n", " \n", " 15\n", - " 5\n", + " 6666666666666666666\n", " 2\n", " 19\n", " 2\n", @@ -2259,7 +2449,7 @@ " \n", " \n", " 16\n", - " 6\n", + " 7777777777777777777\n", " 3\n", " 12\n", " 3\n", @@ -2268,7 +2458,7 @@ " \n", " \n", " 17\n", - " 6\n", + " 7777777777777777777\n", " 3\n", " 13\n", " 3\n", @@ 
-2277,7 +2467,7 @@ " \n", " \n", " 18\n", - " 6\n", + " 7777777777777777777\n", " 3\n", " 14\n", " 3\n", @@ -2286,7 +2476,7 @@ " \n", " \n", " 19\n", - " 7\n", + " 8888888888888888888\n", " 1\n", " 20\n", " 1\n", @@ -2295,7 +2485,7 @@ " \n", " \n", " 20\n", - " 7\n", + " 8888888888888888888\n", " 1\n", " 21\n", " 1\n", @@ -2304,7 +2494,7 @@ " \n", " \n", " 21\n", - " 7\n", + " 8888888888888888888\n", " 1\n", " 22\n", " 1\n", @@ -2316,29 +2506,29 @@ "" ], "text/plain": [ - " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n", - "0 0 6 6 6 \n", - "1 0 6 7 6 \n", - "2 1 6 6 6 \n", - "3 1 6 7 6 \n", - "4 2 4 15 4 \n", - "5 2 4 16 4 \n", - "6 3 5 3 5 \n", - "7 3 5 4 5 \n", - "8 3 5 5 5 \n", - "9 4 0 8 0 \n", - "10 4 0 9 0 \n", - "11 4 0 10 0 \n", - "12 4 0 11 0 \n", - "13 5 2 17 2 \n", - "14 5 2 18 2 \n", - "15 5 2 19 2 \n", - "16 6 3 12 3 \n", - "17 6 3 13 3 \n", - "18 6 3 14 3 \n", - "19 7 1 20 1 \n", - "20 7 1 21 1 \n", - "21 7 1 22 1 \n", + " feature_id precursor_id TRANSITION_ID PRECURSOR_ID \\\n", + "0 1111111111111111111 6 6 6 \n", + "1 1111111111111111111 6 7 6 \n", + "2 2222222222222222222 6 6 6 \n", + "3 2222222222222222222 6 7 6 \n", + "4 3333333333333333333 4 15 4 \n", + "5 3333333333333333333 4 16 4 \n", + "6 4444444444444444444 5 3 5 \n", + "7 4444444444444444444 5 4 5 \n", + "8 4444444444444444444 5 5 5 \n", + "9 5555555555555555555 0 8 0 \n", + "10 5555555555555555555 0 9 0 \n", + "11 5555555555555555555 0 10 0 \n", + "12 5555555555555555555 0 11 0 \n", + "13 6666666666666666666 2 17 2 \n", + "14 6666666666666666666 2 18 2 \n", + "15 6666666666666666666 2 19 2 \n", + "16 7777777777777777777 3 12 3 \n", + "17 7777777777777777777 3 13 3 \n", + "18 7777777777777777777 3 14 3 \n", + "19 8888888888888888888 1 20 1 \n", + "20 8888888888888888888 1 21 1 \n", + "21 8888888888888888888 1 22 1 \n", "\n", " TRAML_ID PRODUCT_MZ \n", "0 YYYYYR3_b1^2 221.0 \n", @@ -2365,7 +2555,7 @@ "21 TTK3_b3^3 803.0 " ] }, - "execution_count": 31, + "execution_count": 33, 
"metadata": {}, "output_type": "execute_result" } @@ -2376,7 +2566,18 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, + "id": "f0b98de2-dead-43c2-85ff-40706699a9ee", + "metadata": {}, + "outputs": [], + "source": [ + "# calculations with new feature id would result in overflow so just take first digit for calculations\n", + "psuedo_feature_id = (feature_transition['feature_id'].astype(str).str.slice(start=0, stop=1)).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "id": "cf55c2d6-fa9f-433a-bb8f-931984f48bbe", "metadata": {}, "outputs": [ @@ -2423,7 +2624,7 @@ " \n", " \n", " 0\n", - " 0\n", + " 1111111111111111111\n", " 6\n", " 221.0\n", " 221.0\n", @@ -2443,7 +2644,7 @@ " \n", " \n", " 1\n", - " 0\n", + " 1111111111111111111\n", " 7\n", " 222.0\n", " 222.0\n", @@ -2463,7 +2664,7 @@ " \n", " \n", " 2\n", - " 1\n", + " 2222222222222222222\n", " 6\n", " 442.0\n", " 442.0\n", @@ -2483,7 +2684,7 @@ " \n", " \n", " 3\n", - " 1\n", + " 2222222222222222222\n", " 7\n", " 444.0\n", " 444.0\n", @@ -2503,7 +2704,7 @@ " \n", " \n", " 4\n", - " 2\n", + " 3333333333333333333\n", " 15\n", " 1803.0\n", " 1803.0\n", @@ -2523,7 +2724,7 @@ " \n", " \n", " 5\n", - " 2\n", + " 3333333333333333333\n", " 16\n", " 1806.0\n", " 1806.0\n", @@ -2543,7 +2744,7 @@ " \n", " \n", " 6\n", - " 3\n", + " 4444444444444444444\n", " 3\n", " 804.0\n", " 804.0\n", @@ -2563,7 +2764,7 @@ " \n", " \n", " 7\n", - " 3\n", + " 4444444444444444444\n", " 4\n", " 808.0\n", " 808.0\n", @@ -2583,7 +2784,7 @@ " \n", " \n", " 8\n", - " 3\n", + " 4444444444444444444\n", " 5\n", " 812.0\n", " 812.0\n", @@ -2603,7 +2804,7 @@ " \n", " \n", " 9\n", - " 4\n", + " 5555555555555555555\n", " 8\n", " 2005.0\n", " 2005.0\n", @@ -2623,7 +2824,7 @@ " \n", " \n", " 10\n", - " 4\n", + " 5555555555555555555\n", " 9\n", " 2010.0\n", " 2010.0\n", @@ -2643,7 +2844,7 @@ " \n", " \n", " 11\n", - " 4\n", + " 5555555555555555555\n", " 10\n", " 2015.0\n", " 2015.0\n", @@ -2663,7 
+2864,7 @@ " \n", " \n", " 12\n", - " 4\n", + " 5555555555555555555\n", " 11\n", " 2020.0\n", " 2020.0\n", @@ -2683,7 +2884,7 @@ " \n", " \n", " 13\n", - " 5\n", + " 6666666666666666666\n", " 17\n", " 4206.0\n", " 4206.0\n", @@ -2703,7 +2904,7 @@ " \n", " \n", " 14\n", - " 5\n", + " 6666666666666666666\n", " 18\n", " 4212.0\n", " 4212.0\n", @@ -2723,7 +2924,7 @@ " \n", " \n", " 15\n", - " 5\n", + " 6666666666666666666\n", " 19\n", " 4218.0\n", " 4218.0\n", @@ -2743,7 +2944,7 @@ " \n", " \n", " 16\n", - " 6\n", + " 7777777777777777777\n", " 12\n", " 3507.0\n", " 3507.0\n", @@ -2763,7 +2964,7 @@ " \n", " \n", " 17\n", - " 6\n", + " 7777777777777777777\n", " 13\n", " 3514.0\n", " 3514.0\n", @@ -2783,7 +2984,7 @@ " \n", " \n", " 18\n", - " 6\n", + " 7777777777777777777\n", " 14\n", " 3521.0\n", " 3521.0\n", @@ -2803,7 +3004,7 @@ " \n", " \n", " 19\n", - " 7\n", + " 8888888888888888888\n", " 20\n", " 6408.0\n", " 6408.0\n", @@ -2823,7 +3024,7 @@ " \n", " \n", " 20\n", - " 7\n", + " 8888888888888888888\n", " 21\n", " 6416.0\n", " 6416.0\n", @@ -2843,7 +3044,7 @@ " \n", " \n", " 21\n", - " 7\n", + " 8888888888888888888\n", " 22\n", " 6424.0\n", " 6424.0\n", @@ -2866,29 +3067,29 @@ "" ], "text/plain": [ - " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n", - "0 0 6 221.0 221.0 \n", - "1 0 7 222.0 222.0 \n", - "2 1 6 442.0 442.0 \n", - "3 1 7 444.0 444.0 \n", - "4 2 15 1803.0 1803.0 \n", - "5 2 16 1806.0 1806.0 \n", - "6 3 3 804.0 804.0 \n", - "7 3 4 808.0 808.0 \n", - "8 3 5 812.0 812.0 \n", - "9 4 8 2005.0 2005.0 \n", - "10 4 9 2010.0 2010.0 \n", - "11 4 10 2015.0 2015.0 \n", - "12 4 11 2020.0 2020.0 \n", - "13 5 17 4206.0 4206.0 \n", - "14 5 18 4212.0 4212.0 \n", - "15 5 19 4218.0 4218.0 \n", - "16 6 12 3507.0 3507.0 \n", - "17 6 13 3514.0 3514.0 \n", - "18 6 14 3521.0 3521.0 \n", - "19 7 20 6408.0 6408.0 \n", - "20 7 21 6416.0 6416.0 \n", - "21 7 22 6424.0 6424.0 \n", + " feature_id TRANSITION_ID AREA_INTENSITY TOTAL_AREA_INTENSITY \\\n", + "0 
1111111111111111111 6 221.0 221.0 \n", + "1 1111111111111111111 7 222.0 222.0 \n", + "2 2222222222222222222 6 442.0 442.0 \n", + "3 2222222222222222222 7 444.0 444.0 \n", + "4 3333333333333333333 15 1803.0 1803.0 \n", + "5 3333333333333333333 16 1806.0 1806.0 \n", + "6 4444444444444444444 3 804.0 804.0 \n", + "7 4444444444444444444 4 808.0 808.0 \n", + "8 4444444444444444444 5 812.0 812.0 \n", + "9 5555555555555555555 8 2005.0 2005.0 \n", + "10 5555555555555555555 9 2010.0 2010.0 \n", + "11 5555555555555555555 10 2015.0 2015.0 \n", + "12 5555555555555555555 11 2020.0 2020.0 \n", + "13 6666666666666666666 17 4206.0 4206.0 \n", + "14 6666666666666666666 18 4212.0 4212.0 \n", + "15 6666666666666666666 19 4218.0 4218.0 \n", + "16 7777777777777777777 12 3507.0 3507.0 \n", + "17 7777777777777777777 13 3514.0 3514.0 \n", + "18 7777777777777777777 14 3521.0 3521.0 \n", + "19 8888888888888888888 20 6408.0 6408.0 \n", + "20 8888888888888888888 21 6416.0 6416.0 \n", + "21 8888888888888888888 22 6424.0 6424.0 \n", "\n", " APEX_INTENSITY TOTAL_MI VAR_INTENSITY_SCORE VAR_INTENSITY_RATIO_SCORE \\\n", "0 1 1 1 1 \n", @@ -2987,14 +3188,15 @@ "21 1 1 " ] }, - "execution_count": 32, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "length = 22\n", - "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (feature_transition['feature_id'] + 1) #should be equal to product_mz * (feature_id + 1)\n", + "\n", + "feature_transition['AREA_INTENSITY'] = feature_transition['PRODUCT_MZ'] * (psuedo_feature_id) #should be equal to product_mz * (feature_id + 1)\n", "feature_transition['TOTAL_AREA_INTENSITY'] = feature_transition['AREA_INTENSITY']\n", "feature_transition['APEX_INTENSITY'] = [1] * length\n", "feature_transition['TOTAL_MI'] = [1] * length\n", @@ -3017,7 +3219,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "id": "588ee10a-7b00-4f61-b305-2a392b5bbd1b", "metadata": {}, "outputs": [], @@ -3034,7 
+3236,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 37, "id": "78f13b81-7db1-46ee-947b-723e8b0b340b", "metadata": {}, "outputs": [], @@ -3060,7 +3262,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 38, "id": "8c143ee6-928c-4404-85cc-5fc7b9b1ce85", "metadata": {}, "outputs": [], @@ -3088,17 +3290,17 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 39, "id": "9a24a430-d994-4012-9ac3-37c792e30026", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 36, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -3110,7 +3312,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 40, "id": "563c3e64-e528-457c-ba42-f00fabcff0f0", "metadata": { "tags": [] @@ -3136,7 +3338,18 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 41, + "id": "d2f151a4-a735-4034-b573-d98faf5c7d76", + "metadata": {}, + "outputs": [], + "source": [ + "# calculations with new feature id would result in overflow so just take first digit for calculations\n", + "psuedo_feature_id = (feature_ms1['feature_id'].astype(str).str.slice(start=0, stop=1)).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "id": "c8539aa2-f946-4be8-a891-0b55d6957322", "metadata": {}, "outputs": [ @@ -3172,7 +3385,7 @@ " \n", " \n", " 0\n", - " 0\n", + " 1111111111111111111\n", " 1540\n", " 1\n", " 1\n", @@ -3181,7 +3394,7 @@ " \n", " \n", " 1\n", - " 1\n", + " 2222222222222222222\n", " 3080\n", " 1\n", " 1\n", @@ -3190,7 +3403,7 @@ " \n", " \n", " 2\n", - " 2\n", + " 3333333333333333333\n", " 9000\n", " 1\n", " 1\n", @@ -3199,7 +3412,7 @@ " \n", " \n", " 3\n", - " 3\n", + " 4444444444444444444\n", " 4800\n", " 1\n", " 1\n", @@ -3208,7 +3421,7 @@ " \n", " \n", " 4\n", - " 4\n", + " 5555555555555555555\n", " 2000\n", " 1\n", " 1\n", @@ -3217,7 +3430,7 @@ " \n", " \n", " 5\n", - " 5\n", + " 6666666666666666666\n", " 
12600\n", " 1\n", " 1\n", @@ -3226,7 +3439,7 @@ " \n", " \n", " 6\n", - " 6\n", + " 7777777777777777777\n", " 14000\n", " 1\n", " 1\n", @@ -3235,7 +3448,7 @@ " \n", " \n", " 7\n", - " 7\n", + " 8888888888888888888\n", " 12800\n", " 1\n", " 1\n", @@ -3247,18 +3460,18 @@ "" ], "text/plain": [ - " feature_id SCORE RANK PVALUE QVALUE PEP\n", - "0 0 1540 1 1 1 1\n", - "1 1 3080 1 1 1 1\n", - "2 2 9000 1 1 1 1\n", - "3 3 4800 1 1 1 1\n", - "4 4 2000 1 1 1 1\n", - "5 5 12600 1 1 1 1\n", - "6 6 14000 1 1 1 1\n", - "7 7 12800 1 1 1 1" + " feature_id SCORE RANK PVALUE QVALUE PEP\n", + "0 1111111111111111111 1540 1 1 1 1\n", + "1 2222222222222222222 3080 1 1 1 1\n", + "2 3333333333333333333 9000 1 1 1 1\n", + "3 4444444444444444444 4800 1 1 1 1\n", + "4 5555555555555555555 2000 1 1 1 1\n", + "5 6666666666666666666 12600 1 1 1 1\n", + "6 7777777777777777777 14000 1 1 1 1\n", + "7 8888888888888888888 12800 1 1 1 1" ] }, - "execution_count": 38, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -3266,7 +3479,7 @@ "source": [ "length = 8 \n", "score_ms2 = feature_ms1[['feature_id']].copy()\n", - "score_ms2['SCORE'] = (score_ms2['feature_id'] + 1) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n", + "score_ms2['SCORE'] = (psuedo_feature_id) * (features['precursor_id'] + 1) * features['exp_rt'].astype(int) # (feature_id+1) * (precursor_id+1)\n", "score_ms2['RANK'] = [1] * length\n", "score_ms2['PVALUE'] = [1] * length\n", "score_ms2['QVALUE'] = [1] * length \n", @@ -3276,7 +3489,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 43, "id": "4cbda7cf-0535-4292-bfed-739a5f1bd2b8", "metadata": {}, "outputs": [], @@ -3291,7 +3504,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 44, "id": "adb88443-6d34-4173-8b37-9f52dba9f5e7", "metadata": {}, "outputs": [], @@ -3317,17 +3530,17 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 
45, "id": "e0094b3a-5a80-48e4-8041-a537ce409480", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 41, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -3339,7 +3552,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 46, "id": "acf865f3-3353-4baa-b83e-91be2abed776", "metadata": { "tags": [] @@ -3366,7 +3579,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 47, "id": "f95142b9-612b-43a8-bb42-356b71839ea6", "metadata": {}, "outputs": [], @@ -3376,7 +3589,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 48, "id": "43828588-c1ff-4943-a7b7-24a968562c4e", "metadata": {}, "outputs": [ @@ -3472,7 +3685,7 @@ "6 0 GGGGGGGGGGR GGGGGGGGGGR 0" ] }, - "execution_count": 44, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -3483,7 +3696,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 49, "id": "f416e48b-6bb6-4cb7-8d81-597cfd52320c", "metadata": { "tags": [] @@ -3496,7 +3709,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 50, "id": "0e1eb1b9-730e-45d4-9618-fd532c1ccc25", "metadata": {}, "outputs": [], @@ -3512,7 +3725,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 51, "id": "e9692e06-ddf2-4f74-bb80-f2a92728767b", "metadata": {}, "outputs": [], @@ -3527,7 +3740,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 52, "id": "f720c22b-e6fa-4ac0-8402-bdcd2e74840b", "metadata": {}, "outputs": [], @@ -3553,17 +3766,17 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 53, "id": "94c860e0-880a-4091-afb8-af368ed72b26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 49, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -3575,7 +3788,7 @@ }, { "cell_type": "code", - "execution_count": 50, + 
"execution_count": 54, "id": "1c053178-b8a5-44ad-876b-49b5fd8afa23", "metadata": { "tags": [] @@ -3602,7 +3815,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 55, "id": "d70ba894-55bf-4a36-b306-45b7c5e9d1bd", "metadata": {}, "outputs": [], @@ -3612,7 +3825,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 56, "id": "3e1bbbeb-7cc9-4b9f-b898-5148abff911d", "metadata": {}, "outputs": [ @@ -3679,7 +3892,7 @@ "3 0 Decoy_ProtT 1" ] }, - "execution_count": 52, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -3690,7 +3903,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 57, "id": "7b3410b1-5d6a-4e85-838c-5ccb3b15f1c5", "metadata": {}, "outputs": [ @@ -3777,7 +3990,7 @@ "3 0 0 1 1 1 global 1" ] }, - "execution_count": 53, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -3795,7 +4008,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 58, "id": "51408b81-b650-4787-9050-59d63c0098c0", "metadata": {}, "outputs": [], @@ -3810,7 +4023,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 59, "id": "ab6a68b5-f5db-46e7-95e8-6e98a69eb062", "metadata": {}, "outputs": [], diff --git a/tests/data/dummyOSWScoredData.osw b/tests/data/dummyOSWScoredData.osw index a3fa86dd203d8283f428b012fcdce03de3e6d498..82311a35b4c721fe23f5a5aca7650f8fb2ae0d70 100644 GIT binary patch delta 2303 zcmZ{lQD_`R9L9HcZfADyZsxyxN!y&1PBc}|SS}b)gIK!yVh=4U(pF7S^r;#xp2P<) z;^wpk6--5tkb*)H@^IR#A=Q)+B&kAt(fVLhh|*YEP$)tlQYi@HZf^g(Yj5v3n4P)b zx3lwo-`!12m=pf{G+}O`rolP*3|@r?AuoTG4OyG5zbL)3nPry6xy}h0A3piabA6LT z?)%!AC$An=IMUudM{1WIX?Bkr#sK$u(&9(2-QOI1etVU=-b4PUwM#gXjf8;3f{`O< zDLdY1=d6K!K6Rl?oNA0(SfW!hAedCL}zpt(Yg3m8xcUN)r1yesgPOP4+rfDWjYp?-- zz#9ArOE3pBFg@qkb@1|(MbVhS1byeh1U+TJ1nn|{33`zUCU{I5Oz;SLjxq}$PXrGk zJJzI&QNJcu@`ZiL-}@|91sxdRKJV8giCfWgpLi{*OeA0_0z6yRYtBTSK}Ff{6@E?r 
zw*?x&Zdt^tAU}{v*#SMG;`OL9k$^@5FWv-=wC?;fs!XKNBSkDgv+3~LQDq_lNfBTm zba)Iab88h9n<SQrv#Di| zl=>2>@RVRGECla}(rC_PQ}Wnr1e=?!%@aAKL{Z7$<_Z7PBB)RV6Sk*0U(pdv*q!Eh zg@oX?gv|-3tMfNJB74jI8mz;&@F~0n!|(tU?Q4oD)s)p?F0+ zD*9*Zf9E~5wZMq3$I1Iw7{b z#DZ8*PG=x5=ER(`+X0Ne%dXiq1*w3y`O{vo$LujBQ;AK$u@Bpam1qaBGC6*QUs3ta zK$frbb(Omr@V}#Y3yZJuHI==YhyjzI=jWBx4qzV*x{K~oW@jKnOSGhT0>Ix0yalB@ z=}yHGMA!_hvdtxz%IFLL&+(kfv;)|2P8P|cqAlP+vUo@^vOpG;-hK;gKi4j57ZvFY z%>Q$WIooVJ7JX22c({bcyzeWAg-h6@dxjh% tT*A!w&rT*>LI(>=4h@&kv%Zp}hfBc`w*`O1dD9``8lGzY-*uw*{s%ZoRlNWJ delta 1825 zcmZ{lPe>F|9LML)o0&H=>wCX(*FSGPY(mB^B9Z;EF5<42phMIxvSWxTNP(9&Y**1k zojL?|@RHp%|Aev`=p{T@L?V)uR%``%DvE+6db94#j?QZs`2FDfnIG@>{SGtvyqOpG z95T|;IR!Rg5=P)Mv;tx8*xdBcILjqX8(9`1DnW>(?!*$-zYYi{X! ziU<(xfd)?JGKqzYJELAjKLVstA$2u9`o1*6tLkEfOzJs-ntMaF^i5?|Tra$e?p4T? zR+)+Q^fe?7mHpsVWhFr33aM)8;idNLVkfVvmnvjZwXJS{rF36@Z})Ol-Udiyn?X=4 zo>&kMlxh&P6-6;&O@Uqb1>ax?KEXD;fwkFc=P8V>b)Xaf&mo4e$iWI;r0UcKFAMK9 zDI!3TLy)r!^03>_#3v02soRs|ekL9%auBG>3AX>odJm1-HY z8I4U2qEy25S7LD|X+{{MvFSl{wX^z)L5mMECuMeq<^El_V5~Vio#2sEEQEK95 z+~l#aVlcW)mw8m8P?{^y0O}IY^YKGvEwxCQ+)Ley5+)u#(dv zq)3XqDlBO>*(TdOB2g@|NjAAHQB1N<*0~i{u#Q`B3pWoaIbl53a0;h5IUu15!{{cw4cxM0r diff --git a/tests/test_pyprophet_export_parquet.py b/tests/test_pyprophet_export_parquet.py index 453733ca..647560b6 100644 --- a/tests/test_pyprophet_export_parquet.py +++ b/tests/test_pyprophet_export_parquet.py @@ -110,13 +110,14 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin ### Note: Current tests assume no na parquet = parquet.dropna() + pseudo_feature_id = (parquet['FEATURE_ID'].astype(str).str.slice(start=0, stop=1)).astype(int) pd.testing.assert_series_equal(parquet['FEATURE_MS1.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS2.APEX_INTENSITY'], parquet['PRECURSOR_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['FEATURE_MS1.EXP_IM'], parquet['FEATURE_MS2.EXP_IM'], **pd_testing_kwargs) 
pd.testing.assert_series_equal(parquet['FEATURE_MS2.DELTA_IM'], parquet['FEATURE_MS1.DELTA_IM'], **pd_testing_kwargs) - pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * (parquet['FEATURE_ID'].astype(int) + 1), **pd_testing_kwargs) + pd.testing.assert_series_equal(parquet['SCORE_MS2.SCORE'], (parquet['PRECURSOR_ID'] + 1) * parquet['FEATURE.EXP_RT'].astype(int) * pseudo_feature_id, **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PEPTIDE.SCORE_GLOBAL'], parquet['PEPTIDE_ID'], **pd_testing_kwargs) pd.testing.assert_series_equal(parquet['SCORE_PROTEIN.SCORE_GLOBAL'], parquet['PROTEIN_ID'], **pd_testing_kwargs) @@ -128,7 +129,7 @@ def _run_export_parquet_single_run(temp_folder, transitionLevel=False, pd_testin ############### TRANSTION LEVEL TESTS ################ if transitionLevel: - pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * (parquet['FEATURE_ID'].astype(int) + 1), **pd_testing_kwargs) + pd.testing.assert_series_equal(parquet['FEATURE_TRANSITION.AREA_INTENSITY'], parquet['TRANSITION.PRODUCT_MZ'] * pseudo_feature_id, **pd_testing_kwargs) def test_export_parquet_single_run(tmpdir): _run_export_parquet_single_run(tmpdir, transitionLevel=False)