# Count the score sets that have no discordant variants, and the total number
# of concordant entries recorded for those score sets.
#
# Reads two dicts built by the variant-comparison cell, both keyed by score set URN:
#   diff_vars_dict       -> list of indices flagged as discordant
#   concordant_vars_dict -> list of indices flagged as concordant

# A score set is "concordant" when its list of discordant indices is empty.
concordant_ss = [urn for urn, discordant in diff_vars_dict.items() if not discordant]

# NOTE(review): the comparison loop appears to append an index once per member
# for multi-member mappings, so this total may count member pairs rather than
# unique variants -- confirm that this is the intended quantity.
concordant_ss_variant_count = sum(
    len(concordant_vars_dict[urn]) for urn in concordant_ss
)

f"There are {concordant_ss_variant_count} concordant variants in this subset of {len(concordant_ss)} score sets"
# Compute the number of score sets where VRS IDs are expected to be equal,
# i.e. the MAVE target sequence is the human reference sequence.
#
# For each URN in `score_sets`, the most recently modified mapping file under
# analysis_files/mappings is loaded and the sequence IDs of its computed and
# mapped reference sequences are compared. Score sets with no mapping file are
# counted in neither tally.
mave_ref_equal_count = 0
mave_ref_unequal_count = 0

for urn in score_sets:
    if urn.startswith("urn:mavedb:00000097"):  # Edge cases where variants were mapped at protein level
        mave_ref_equal_count += 1
        continue

    files = list(Path("analysis_files/mappings").glob(f"*{urn}_mapping*"))
    if not files:
        # No mapping output exists for this score set; skip it.
        continue
    latest_file = max(files, key=os.path.getmtime)

    # Use a context manager so each mapping file is closed as soon as it has
    # been parsed, instead of leaving one open handle per score set.
    with latest_file.open() as f:
        dat = json.load(f)

    mave_seq = dat["computed_reference_sequence"]["sequence_id"]
    ref_seq = dat["mapped_reference_sequence"]["sequence_id"]
    if mave_seq == ref_seq:
        mave_ref_equal_count += 1
    else:
        mave_ref_unequal_count += 1

f"The number of score sets with equivalent target sequences and human reference sequences is {mave_ref_equal_count} and the number with unequal sequences is {mave_ref_unequal_count}"