Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 99 additions & 10 deletions notebooks/analysis/mapping_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@
"\n",
"var_count = 0\n",
"diff_vars_dict = {}\n",
"concordant_vars_dict = {}\n",
"\n",
"for urn in score_sets:\n",
" files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n",
Expand All @@ -172,6 +173,7 @@
" dat = dat[\"mapped_scores\"]\n",
"\n",
" diff_vars = []\n",
" concordant_vars = []\n",
" strand = strand_dict[urn]\n",
"\n",
" for j,var_mapping in enumerate(dat):\n",
Expand All @@ -186,6 +188,8 @@
"\n",
" if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n",
" diff_vars.append(j)\n",
" else:\n",
" concordant_vars.append(j)\n",
"\n",
" else:\n",
" for pre_mapped_var in var_mapping[\"pre_mapped\"][\"members\"]:\n",
Expand All @@ -198,7 +202,10 @@
"\n",
" if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n",
" diff_vars.append(j)\n",
" else:\n",
" concordant_vars.append(j)\n",
" diff_vars_dict[urn] = diff_vars\n",
" concordant_vars_dict[urn] = concordant_vars\n",
"\n",
"f\"The number of examined variant pairs is: {var_count}\""
]
Expand Down Expand Up @@ -361,6 +368,39 @@
"f\"There are {mm_count} instances of reference mismatch in the subset\""
]
},
{
"cell_type": "markdown",
"id": "de7b8e56",
"metadata": {},
"source": [
"Run the cell below to count the score sets that contain no discordant variants, along with the total number of concordant variants across those score sets."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "08d77cd1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'There are 3031995 concordant variants in this subset of 736 score sets'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A score set is fully concordant when its list of discordant variant indices is empty.\n",
"concordant_ss = [urn for urn, discordant in diff_vars_dict.items() if not discordant]\n",
"\n",
"# Total the concordant variants recorded for each fully concordant score set.\n",
"concordant_ss_variant_count = sum(len(concordant_vars_dict[urn]) for urn in concordant_ss)\n",
"f\"There are {concordant_ss_variant_count} concordant variants in this subset of {len(concordant_ss)} score sets\""
]
},
{
"cell_type": "markdown",
"id": "92bc5e87",
Expand All @@ -372,7 +412,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "3eb301ed",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -444,7 +484,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"id": "fc87cbbe",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -480,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "5d0969e2",
"metadata": {},
"outputs": [
Expand All @@ -490,7 +530,7 @@
"'There are 2994178 MAVE variants that were processed in this analysis'"
]
},
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -523,7 +563,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"id": "a4135f64",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -577,7 +617,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"id": "335af4a1",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -664,7 +704,7 @@
"Interquartile Range for Variants in a Score Set... (920, 1794)"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -697,7 +737,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "7ec89e27",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -742,7 +782,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"id": "971dbd8a",
"metadata": {},
"outputs": [],
Expand All @@ -759,7 +799,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"id": "29a0b28d",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -815,6 +855,55 @@
"plt.savefig(\"mapped_variants_count.png\", dpi=300)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "feb4cfc1",
"metadata": {},
"source": [
"Compute the number of score sets where VRS IDs are expected to be equal (i.e., the MAVE target sequence is identical to the human reference sequence)."
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "67e8d4ab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The number of score sets with equivalent target sequences and human reference sequences is 158 and the number with unequal sequences is 899'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tally score sets by whether the computed (MAVE target) reference sequence ID\n",
"# matches the mapped (human) reference sequence ID in the latest mapping file.\n",
"mave_ref_equal_count = 0\n",
"mave_ref_unequal_count = 0\n",
"for urn in score_sets:\n",
"    # Edge cases where variants were mapped at protein level:\n",
"    # counted as equal without reading the mapping file.\n",
"    if urn.startswith(\"urn:mavedb:00000097\"):\n",
"        mave_ref_equal_count += 1\n",
"        continue\n",
"\n",
"    files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n",
"    if not files:\n",
"        # No mapping output for this score set; it is left out of both counts.\n",
"        continue\n",
"    latest_file = max(files, key=os.path.getmtime)\n",
"\n",
"    # Use a context manager so the handle is closed after each read\n",
"    # instead of leaving one open file per score set.\n",
"    with latest_file.open() as f:\n",
"        dat = json.load(f)\n",
"\n",
"    mave_seq = dat[\"computed_reference_sequence\"][\"sequence_id\"]\n",
"    ref_seq = dat[\"mapped_reference_sequence\"][\"sequence_id\"]\n",
"    if mave_seq == ref_seq:\n",
"        mave_ref_equal_count += 1\n",
"    else:\n",
"        mave_ref_unequal_count += 1\n",
"f\"The number of score sets with equivalent target sequences and human reference sequences is {mave_ref_equal_count} and the number with unequal sequences is {mave_ref_unequal_count}\""
]
}
],
"metadata": {
Expand Down