ave-dcd · jarbesfeld · Mar 14, 2025 · Mar 13, 2025
diff --git a/notebooks/analysis/mapping_analysis.ipynb b/notebooks/analysis/mapping_analysis.ipynb
@@ -158,6 +158,7 @@
     "\n",
     "var_count = 0\n",
     "diff_vars_dict = {}\n",
+    "concordant_vars_dict = {}\n",
     "\n",
     "for urn in score_sets:\n",
     "    files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n",
@@ -172,6 +173,7 @@
     "    dat = dat[\"mapped_scores\"]\n",
     "\n",
     "    diff_vars = []\n",
+    "    concordant_vars = []\n",
     "    strand = strand_dict[urn]\n",
     "\n",
     "    for j,var_mapping in enumerate(dat):\n",
@@ -186,6 +188,8 @@
     "\n",
     "                if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n",
     "                    diff_vars.append(j)\n",
+    "                else:\n",
+    "                    concordant_vars.append(j)\n",
     "\n",
     "            else:\n",
     "                for pre_mapped_var in var_mapping[\"pre_mapped\"][\"members\"]:\n",
@@ -198,7 +202,10 @@
     "\n",
     "                        if not is_concordant(seq_pre, seq_post, seq_pre_rv, computed_seq_type, strand,):\n",
     "                            diff_vars.append(j)\n",
+    "                        else:\n",
+    "                            concordant_vars.append(j)\n",
     "    diff_vars_dict[urn] = diff_vars\n",
+    "    concordant_vars_dict[urn] = concordant_vars\n",
     "\n",
     "f\"The number of examined variant pairs is: {var_count}\""
    ]
@@ -361,6 +368,39 @@
     "f\"There are {mm_count} instances of reference mismatch in the subset\""
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "de7b8e56",
+   "metadata": {},
+   "source": [
+    "Run the cell below to count the score sets that contain no discordant variants and the total number of concordant variants across those score sets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "08d77cd1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'There are 3031995 concordant variants in this subset of 736 score sets'"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Score sets whose list of discordant variant indices is empty\n",
+    "concordant_ss = [urn for urn in diff_vars_dict if not diff_vars_dict[urn]]\n",
+    "# NOTE(review): indices are appended once per examined pre-/post-mapped pair in the\n",
+    "# concordance loop, so these lengths may count pairs rather than unique variants - confirm.\n",
+    "concordant_ss_variant_count = sum(len(concordant_vars_dict[urn]) for urn in concordant_ss)\n",
+    "f\"There are {concordant_ss_variant_count} concordant variants in this subset of {len(concordant_ss)} score sets\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "92bc5e87",
@@ -372,7 +412,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "3eb301ed",
    "metadata": {},
    "outputs": [],
@@ -444,7 +484,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "fc87cbbe",
    "metadata": {},
    "outputs": [
@@ -480,7 +520,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "5d0969e2",
    "metadata": {},
    "outputs": [
@@ -490,7 +530,7 @@
        "'There are 2994178 MAVE variants that were processed in this analysis'"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -523,7 +563,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "a4135f64",
    "metadata": {},
    "outputs": [
@@ -577,7 +617,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "335af4a1",
    "metadata": {},
    "outputs": [
@@ -664,7 +704,7 @@
        "Interquartile Range for Variants in a Score Set...       (920, 1794)"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -697,7 +737,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "7ec89e27",
    "metadata": {},
    "outputs": [
@@ -742,7 +782,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "971dbd8a",
    "metadata": {},
    "outputs": [],
@@ -759,7 +799,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "29a0b28d",
    "metadata": {},
    "outputs": [
@@ -815,6 +855,55 @@
     "plt.savefig(\"mapped_variants_count.png\", dpi=300)\n",
     "plt.show()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "feb4cfc1",
+   "metadata": {},
+   "source": [
+    "Count the score sets where VRS IDs are expected to be equal (i.e., the MAVE target sequence is identical to the human reference sequence), and the score sets where the two sequences differ."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "67e8d4ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'The number of score sets with equivalent target sequences and human reference sequences is 158 and the number with unequal sequences is 899'"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Tally score sets by whether the computed (MAVE target) reference sequence and the\n",
+    "# mapped reference sequence carry the same sequence_id in the most recently\n",
+    "# modified mapping file for that score set.\n",
+    "mave_ref_equal_count = 0\n",
+    "mave_ref_unequal_count = 0\n",
+    "for urn in score_sets:\n",
+    "    if urn.startswith(\"urn:mavedb:00000097\"): # Edge cases where variants were mapped at protein level\n",
+    "        mave_ref_equal_count += 1\n",
+    "        continue\n",
+    "\n",
+    "    files = list(Path(\"analysis_files/mappings\").glob(f\"*{urn}_mapping*\"))\n",
+    "    if not files:\n",
+    "        # No mapping file found: this score set is counted in neither tally\n",
+    "        continue\n",
+    "    latest_file = max(files, key=os.path.getmtime)\n",
+    "\n",
+    "    # Context manager closes each mapping file as soon as it has been parsed,\n",
+    "    # instead of leaving one open handle per score set\n",
+    "    with latest_file.open() as f:\n",
+    "        dat = json.load(f)\n",
+    "\n",
+    "    mave_seq = dat[\"computed_reference_sequence\"][\"sequence_id\"]\n",
+    "    ref_seq = dat[\"mapped_reference_sequence\"][\"sequence_id\"]\n",
+    "    if mave_seq == ref_seq:\n",
+    "        mave_ref_equal_count += 1\n",
+    "    else:\n",
+    "        mave_ref_unequal_count += 1\n",
+    "f\"The number of score sets with equivalent target sequences and human reference sequences is {mave_ref_equal_count} and the number with unequal sequences is {mave_ref_unequal_count}\""
+   ]
   }
  ],
  "metadata": {