From e421e82bb4ffba51366184dc8a9c63fe5a032649 Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Tue, 9 Dec 2025 22:27:52 -0800
Subject: [PATCH 1/2] refactor: change transcript parameter type to set for
 simplified transcript reduction

---
 src/dcd_mapping/lookup.py      |  2 +-
 src/dcd_mapping/transcripts.py | 36 +++++++++++-----------------------
 2 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py
index 34de2f0..fbfefcf 100644
--- a/src/dcd_mapping/lookup.py
+++ b/src/dcd_mapping/lookup.py
@@ -596,7 +596,7 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele:
 # ----------------------------------- MANE ----------------------------------- #
 
 
-def get_mane_transcripts(transcripts: list[str]) -> list[ManeDescription]:
+def get_mane_transcripts(transcripts: set[str]) -> list[ManeDescription]:
     """Get corresponding MANE data for transcripts. Results given in order of
     transcript preference.
 
diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py
index 00080d3..896da2e 100644
--- a/src/dcd_mapping/transcripts.py
+++ b/src/dcd_mapping/transcripts.py
@@ -37,12 +37,13 @@
 
 async def _get_compatible_transcripts(
     target_gene: TargetGene, align_result: AlignmentResult
-) -> list[list[str]]:
-    """Acquire matching transcripts
+) -> set[str]:
+    """Acquire transcripts which overlap with all hit subranges
+    of an alignment result.
 
     :param metadata: metadata for scoreset
     :param align_result: output of ``align()`` method
-    :return: List of list of compatible transcripts
+    :return: Set of compatible transcripts
     """
     if align_result.chrom.startswith("chr"):
         aligned_chrom = align_result.chrom[3:]
@@ -55,29 +56,15 @@ async def _get_compatible_transcripts(
             f"Unable to find gene symbol for target gene {target_gene.target_gene_name}"
         )
         raise TxSelectError(msg)
-    transcript_matches = []
+    transcript_matches: set[str] = set()
     for hit_range in align_result.hit_subranges:
         matches_list = await get_transcripts(
             gene_symbol, chromosome, hit_range.start, hit_range.end
         )
-        if matches_list:
-            transcript_matches.append(matches_list)
+        transcript_matches.intersection_update(matches_list)
     return transcript_matches
 
 
-def _reduce_compatible_transcripts(matching_transcripts: list[list[str]]) -> list[str]:
-    """Reduce list of list of transcripts to a list containing only entries present
-    in each sublist
-
-    :param matching_transcripts: list of list of transcript accession IDs
-    :return: list of transcripts shared by all sublists
-    """
-    common_transcripts_set = set(matching_transcripts[0])
-    for sublist in matching_transcripts[1:]:
-        common_transcripts_set.intersection_update(sublist)
-    return list(common_transcripts_set)
-
-
 def _choose_best_mane_transcript(
     mane_transcripts: list[ManeDescription],
 ) -> ManeDescription | None:
@@ -157,11 +144,8 @@ async def _select_protein_reference(
     reference sequence
     """
     matching_transcripts = await _get_compatible_transcripts(target_gene, align_result)
+
     if not matching_transcripts:
-        common_transcripts = None
-    else:
-        common_transcripts = _reduce_compatible_transcripts(matching_transcripts)
-    if not common_transcripts:
         if not target_gene.target_uniprot_ref:
             msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
             raise TxSelectError(msg)
@@ -174,10 +158,12 @@ async def _select_protein_reference(
         nm_accession = None
         tx_mode = None
     else:
-        mane_transcripts = get_mane_transcripts(common_transcripts)
+        mane_transcripts = get_mane_transcripts(matching_transcripts)
         best_tx = _choose_best_mane_transcript(mane_transcripts)
         if not best_tx:
-            best_tx = await _get_longest_compatible_transcript(common_transcripts)
+            best_tx = await _get_longest_compatible_transcript(
+                list(matching_transcripts)
+            )
         if not best_tx:
             msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}"
             raise TxSelectError(msg)

From b6bbc59e199dd675740fc7880de8423a7e819c09 Mon Sep 17 00:00:00 2001
From: Benjamin Capodanno <bencap@outlook.com>
Date: Wed, 10 Dec 2025 11:26:30 -0800
Subject: [PATCH 2/2] feat: don't rely on user supplied target metadata for
 transcript selection

Prior to this change, we relied on the user supplying an appropriate HGNC symbol for their target as their target name. This is no longer required. Instead, transcript selection follows the following algorithm:

1.   Align the target sequence with BLAT.
2.    Fetch transcripts which overlap the aligned region (notably, without an HGNC symbol filter).
3.    Perform transcript selection within each distinct gene. This will either leave us with (a) one transcript in cases where we have no overlapping genes in a region, or, (2) one transcript per gene when multiple genes overlap an aligned region. These will be our candidate transcripts.
4.    If we still have more than one candidate transcript, we compare the similarity of each candidate to the provided target sequence. Select the most similar transcript.
---
 src/dcd_mapping/lookup.py      |  25 ++---
 src/dcd_mapping/transcripts.py | 177 ++++++++++++++++++++++++---------
 tests/test_transcript.py       | 161 +++++++++++++++++++++++++++++-
 3 files changed, 299 insertions(+), 64 deletions(-)

diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py
index fbfefcf..3a5c186 100644
--- a/src/dcd_mapping/lookup.py
+++ b/src/dcd_mapping/lookup.py
@@ -264,35 +264,30 @@ async def get_protein_accession(transcript: str) -> str | None:
 
 
 async def get_transcripts(
-    gene_symbol: str, chromosome_ac: str, start: int, end: int
-) -> list[str]:
-    """Get transcript accessions matching given parameters (excluding non-coding RNA).
+    chromosome_ac: str, start: int, end: int
+) -> list[tuple[str, str]]:
+    """Get transcript accessions matching given parameters (excluding non-coding RNA),
+    returning both the transcript accession and HGNC symbol.
 
-    TODO: may be able to successfully query with only one of gene symbol/chromosome ac.
-    In initial testing, gene symbol doesn't seem to be a meaningful filter, but should
-    get further confirmation.
-
-    :param gene_symbol: HGNC-given gene symbol (usually, but not always, equivalent to
-        symbols available in other nomenclatures.)
     :param chromosome: chromosome accession (e.g. ``"NC_000007.13"``)
     :param start: starting position
     :param end: ending position
-    :return: candidate transcript accessions
+    :return: candidate transcript accessions and HGNC symbols
     """
     try:
         uta = CoolSeqToolBuilder().uta_db
         query = f"""
-        SELECT tx_ac
+        SELECT tx_ac, hgnc
         FROM {uta.schema}.tx_exon_aln_v
-        WHERE hgnc = '{gene_symbol}'
-        AND ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i)
+        WHERE ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i)
         AND alt_ac = '{chromosome_ac}'
         AND tx_ac NOT LIKE 'NR_%';
         """  # noqa: S608
         result = await uta.execute_query(query)
     except Exception as e:
         raise DataLookupError from e
-    return [row["tx_ac"] for row in result]
+
+    return [(row["tx_ac"], row["hgnc"]) for row in result]
 
 
 # ------------------------------ Gene Normalizer ------------------------------ #
@@ -596,7 +591,7 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele:
 # ----------------------------------- MANE ----------------------------------- #
 
 
-def get_mane_transcripts(transcripts: set[str]) -> list[ManeDescription]:
+def get_mane_transcripts(transcripts: list[str]) -> list[ManeDescription]:
     """Get corresponding MANE data for transcripts. Results given in order of
     transcript preference.
 
diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py
index 896da2e..8af38cb 100644
--- a/src/dcd_mapping/transcripts.py
+++ b/src/dcd_mapping/transcripts.py
@@ -2,6 +2,7 @@
 import logging
 import re
 
+from Bio import Align
 from Bio.Data.CodonTable import IUPACData
 from Bio.Seq import Seq
 from Bio.SeqUtils import seq1
@@ -10,7 +11,6 @@
 from dcd_mapping.exceptions import TxSelectError
 from dcd_mapping.lookup import (
     get_chromosome_identifier,
-    get_gene_symbol,
     get_mane_transcripts,
     get_protein_accession,
     get_seqrepo,
@@ -36,35 +36,87 @@
 
 
 async def _get_compatible_transcripts(
-    target_gene: TargetGene, align_result: AlignmentResult
-) -> set[str]:
-    """Acquire transcripts which overlap with all hit subranges
+    align_result: AlignmentResult,
+) -> set[tuple[str, str]]:
+    """Acquire transcripts and their HGNC symbols which overlap with all hit subranges
     of an alignment result.
 
     :param metadata: metadata for scoreset
     :param align_result: output of ``align()`` method
     :return: Set of compatible transcripts
     """
-    if align_result.chrom.startswith("chr"):
-        aligned_chrom = align_result.chrom[3:]
-    else:
-        aligned_chrom = align_result.chrom
+    aligned_chrom = (
+        align_result.chrom[3:]
+        if align_result.chrom.startswith("chr")
+        else align_result.chrom
+    )
     chromosome = get_chromosome_identifier(aligned_chrom)
-    gene_symbol = get_gene_symbol(target_gene)
-    if not gene_symbol:
-        msg = (
-            f"Unable to find gene symbol for target gene {target_gene.target_gene_name}"
-        )
-        raise TxSelectError(msg)
-    transcript_matches: set[str] = set()
+
+    transcript_matches: set[tuple[str, str]] = set()
     for hit_range in align_result.hit_subranges:
-        matches_list = await get_transcripts(
-            gene_symbol, chromosome, hit_range.start, hit_range.end
-        )
+        matches_list = await get_transcripts(chromosome, hit_range.start, hit_range.end)
+        if not transcript_matches:
+            transcript_matches = set(matches_list)
+
         transcript_matches.intersection_update(matches_list)
+
     return transcript_matches
 
 
+def _local_alignment_identity(query: str, ref: str) -> float:
+    """Compute local alignment percent identity between two protein sequences.
+
+    Uses Smith-Waterman local alignment with BLOSUM62; gap open -10, gap extend -0.5.
+    Returns alignment score, or -1000 if either sequence is empty.
+    """
+    if not query or not ref:
+        return -1000
+
+    aligner = Align.PairwiseAligner(
+        mode="local",
+        substitution_matrix=Align.substitution_matrices.load("BLOSUM62"),
+        open_gap_score=-10,
+        extend_gap_score=-0.5,
+    )
+
+    try:
+        alignments = aligner.align(query, ref)
+    except Exception as e:
+        # Do not fallback to approximate similarity; propagate failure
+        msg = f"Local alignment failed: {e}"
+        raise TxSelectError(msg) from e
+
+    if not alignments:
+        return -1000
+
+    return alignments[0].score
+
+
+def _choose_most_similar_transcript(
+    protein_sequence: str, mane_transcripts: list[TranscriptDescription]
+) -> TranscriptDescription | None:
+    """Choose the transcript whose protein reference is most similar to the
+    provided sequence.
+
+    Selects the highest similarity; ties keep first encountered (stable).
+    """
+    if not mane_transcripts:
+        return None
+    if len(mane_transcripts) == 1:
+        return mane_transcripts[0]
+
+    best: TranscriptDescription | None = None
+    best_score = -1.0
+    for tx in mane_transcripts:
+        ref_seq = get_sequence(tx.refseq_prot)
+        score = _local_alignment_identity(protein_sequence, ref_seq)
+        if score > best_score:
+            best_score = score
+            best = tx
+
+    return best
+
+
 def _choose_best_mane_transcript(
     mane_transcripts: list[ManeDescription],
 ) -> ManeDescription | None:
@@ -143,46 +195,77 @@ async def _select_protein_reference(
     :raise TxSelectError: if no matching MANE transcripts and unable to get UniProt ID/
     reference sequence
     """
-    matching_transcripts = await _get_compatible_transcripts(target_gene, align_result)
+    matching_transcripts = await _get_compatible_transcripts(align_result)
 
-    if not matching_transcripts:
-        if not target_gene.target_uniprot_ref:
-            msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
-            raise TxSelectError(msg)
-        protein_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
-        np_accession = target_gene.target_uniprot_ref.id
-        ref_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
-        if not ref_sequence:
-            msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}"
-            raise TxSelectError(msg)
-        nm_accession = None
-        tx_mode = None
-    else:
-        mane_transcripts = get_mane_transcripts(matching_transcripts)
+    # Map HGNC symbols to their compatible transcripts
+    hgnc_to_transcripts: dict[str, list[str]] = {}
+    for tx, hgnc in matching_transcripts:
+        hgnc_to_transcripts.setdefault(hgnc, []).append(tx)
+
+    per_gene_best: list[ManeDescription | TranscriptDescription] = []
+    best_tx: ManeDescription | TranscriptDescription | None = None
+
+    # Choose one best transcript per gene (based on MANE priority, falling back to longest)
+    for _, transcripts in hgnc_to_transcripts.items():
+        if not transcripts:
+            continue
+
+        mane_transcripts = get_mane_transcripts(transcripts)
         best_tx = _choose_best_mane_transcript(mane_transcripts)
+
         if not best_tx:
-            best_tx = await _get_longest_compatible_transcript(
-                list(matching_transcripts)
-            )
-        if not best_tx:
-            msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}"
+            best_tx = await _get_longest_compatible_transcript(transcripts)
+
+        if best_tx:
+            per_gene_best.append(best_tx)
+
+    # If we found any per-gene best candidates, Step 2: choose the most similar among them and
+    # select it.
+    if per_gene_best:
+        if not target_gene.target_sequence:
+            msg = f"Unable to find target sequence for target gene {target_gene.target_gene_name}"
             raise TxSelectError(msg)
+
+        protein_sequence = _get_protein_sequence(target_gene.target_sequence)
+        best_tx = _choose_most_similar_transcript(protein_sequence, per_gene_best)
+
+        # As a fallback, pick the first candidate
+        if not best_tx:
+            best_tx = per_gene_best[0]
+
         ref_sequence = get_sequence(best_tx.refseq_prot)
-        nm_accession = best_tx.refseq_nuc
-        np_accession = best_tx.refseq_prot
-        tx_mode = best_tx.transcript_priority
+        is_full_match = ref_sequence.find(protein_sequence) != -1
+        start = ref_sequence.find(protein_sequence[:10])
+
+        return TxSelectResult(
+            nm=best_tx.refseq_nuc,
+            np=best_tx.refseq_prot,
+            start=start,
+            is_full_match=is_full_match,
+            sequence=get_sequence(best_tx.refseq_prot),
+            transcript_mode=best_tx.transcript_priority,
+        )
 
-    protein_sequence = _get_protein_sequence(target_gene.target_sequence)
-    is_full_match = ref_sequence.find(protein_sequence) != -1
-    start = ref_sequence.find(protein_sequence[:10])
+    # If we didn't find any suitable transcript, attempt to use a provided UniProt reference
+    if not target_gene.target_uniprot_ref:
+        msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
+        raise TxSelectError(msg)
+
+    uniprot_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
+    if not uniprot_sequence:
+        msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}"
+        raise TxSelectError(msg)
+
+    is_full_match = uniprot_sequence.find(protein_sequence) != -1
+    start = uniprot_sequence.find(protein_sequence[:10])
 
     return TxSelectResult(
-        nm=nm_accession,
-        np=np_accession,
+        nm=None,
+        np=target_gene.target_uniprot_ref.id,
         start=start,
         is_full_match=is_full_match,
         sequence=protein_sequence,
-        transcript_mode=tx_mode,
+        transcript_mode=None,
     )
 
 
diff --git a/tests/test_transcript.py b/tests/test_transcript.py
index 5d4612a..9219e39 100644
--- a/tests/test_transcript.py
+++ b/tests/test_transcript.py
@@ -13,10 +13,25 @@
 from unittest.mock import MagicMock
 
 import pytest
+from cool_seq_tool.schemas import TranscriptPriority
 
 from dcd_mapping.mavedb_data import _load_scoreset_records, get_scoreset_records
-from dcd_mapping.schemas import AlignmentResult, ScoresetMetadata, TxSelectResult
-from dcd_mapping.transcripts import select_transcript
+from dcd_mapping.schemas import (
+    AlignmentResult,
+    ScoresetMetadata,
+    SequenceRange,
+    TargetGene,
+    TargetSequenceType,
+    TargetType,
+    TxSelectResult,
+)
+from dcd_mapping.transcripts import (
+    TranscriptDescription,
+    _choose_most_similar_transcript,
+    _percent_similarity,
+    _select_protein_reference,
+    select_transcript,
+)
 
 
 @pytest.fixture()
@@ -163,6 +178,148 @@ async def test_1_b_2(
     check_transcript_results_equality(actual, expected)
 
 
+# --- Similarity helper tests ---
+
+
+def make_mane(nm: str, np: str, priority: TranscriptPriority):
+    # Use generic TranscriptDescription for testing similarity logic
+    return TranscriptDescription(
+        refseq_nuc=nm, refseq_prot=np, transcript_priority=priority
+    )
+
+
+def test_percent_similarity_basic():
+    assert _percent_similarity("AAAA", "AAAA") == 1.0
+    assert _percent_similarity("AAAA", "BAAAA") == 1.0  # substring fast path
+    assert 0.0 <= _percent_similarity("ABCD", "WXYZ") <= 1.0
+
+
+def test_choose_most_similar_transcript_simple(monkeypatch):
+    # Query is most similar to NP_2
+    query = "MKTFFV"
+    seqs = {
+        "NP_1": "MKAAAA",
+        "NP_2": "MKTFFV",
+        "NP_3": "MKTYFV",
+    }
+
+    def fake_get_sequence(ac):
+        return seqs[ac]
+
+    monkeypatch.setattr("dcd_mapping.transcripts.get_sequence", fake_get_sequence)
+
+    mane_list = [
+        make_mane("NM_1", "NP_1", TranscriptPriority.MANE_PLUS_CLINICAL),
+        make_mane("NM_2", "NP_2", TranscriptPriority.MANE_SELECT),
+        make_mane("NM_3", "NP_3", TranscriptPriority.MANE_PLUS_CLINICAL),
+    ]
+
+    best = _choose_most_similar_transcript(query, mane_list)
+    assert best.refseq_prot == "NP_2"
+
+
+def test_choose_most_similar_transcript_tie_keeps_first(monkeypatch):
+    # NP_1 and NP_2 have identical sequences vs query; NP_1 should win (stable tie)
+    query = "ABCDE"
+    seqs = {
+        "NP_1": "ABCDE",
+        "NP_2": "ABCDE",
+        "NP_3": "ABCXX",
+    }
+
+    def fake_get_sequence(ac):
+        return seqs[ac]
+
+    monkeypatch.setattr("dcd_mapping.transcripts.get_sequence", fake_get_sequence)
+
+    mane_list = [
+        make_mane("NM_1", "NP_1", TranscriptPriority.MANE_PLUS_CLINICAL),
+        make_mane("NM_2", "NP_2", TranscriptPriority.MANE_SELECT),
+        make_mane("NM_3", "NP_3", TranscriptPriority.MANE_PLUS_CLINICAL),
+    ]
+
+    best = _choose_most_similar_transcript(query, mane_list)
+    assert best.refseq_prot == "NP_1"
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_end_to_end_per_gene_then_similarity(monkeypatch):
+    """E2E: per-gene best via MANE priority, then global similarity among winners."""
+    # Mock compatible transcripts grouped by HGNC symbol
+    compatible = {
+        ("NM_G1_A", "G1"),
+        ("NM_G1_B", "G1"),
+        ("NM_G2_A", "G2"),
+        ("NM_G2_B", "G2"),
+    }
+
+    async def fake_get_compatible(_align_result):
+        return compatible
+
+    monkeypatch.setattr(
+        "dcd_mapping.transcripts._get_compatible_transcripts", fake_get_compatible
+    )
+
+    # MANE per gene: G1 has a MANE_SELECT; G2 no select -> plus clinical wins
+    def fake_get_mane_transcripts(tx_list):
+        s = set(tx_list)
+        if s == {"NM_G1_A", "NM_G1_B"}:
+            return [
+                make_mane("NM_G1_A", "NP_G1_A", TranscriptPriority.MANE_PLUS_CLINICAL),
+                make_mane("NM_G1_B", "NP_G1_B", TranscriptPriority.MANE_SELECT),
+            ]
+        if s == {"NM_G2_A", "NM_G2_B"}:
+            return [
+                make_mane("NM_G2_A", "NP_G2_A", TranscriptPriority.MANE_PLUS_CLINICAL),
+                make_mane(
+                    "NM_G2_B",
+                    "NP_G2_B",
+                    TranscriptPriority.LONGEST_COMPATIBLE_REMAINING,
+                ),
+            ]
+        return []
+
+    monkeypatch.setattr(
+        "dcd_mapping.transcripts.get_mane_transcripts", fake_get_mane_transcripts
+    )
+
+    # Sequence database: make query closest to NP_G2_A (G2 winner)
+    seqs = {
+        "NP_G1_A": "MKAAAA",
+        "NP_G1_B": "MKBBBB",
+        "NP_G2_A": "MKTFFV",
+        "NP_G2_B": "MKTYFV",
+    }
+
+    def fake_get_sequence(ac):
+        return seqs[ac]
+
+    monkeypatch.setattr("dcd_mapping.transcripts.get_sequence", fake_get_sequence)
+
+    # Build target gene and minimal align result
+    query = "MKTFFV"
+    target_gene = TargetGene(
+        target_gene_name="Dummy",
+        target_gene_category=TargetType.PROTEIN_CODING,
+        target_sequence=query,
+        target_sequence_type=TargetSequenceType.PROTEIN,
+    )
+
+    align_result = AlignmentResult(
+        chrom="chr1",
+        strand=1,
+        coverage=None,
+        ident_pct=None,
+        query_range=SequenceRange(start=1, end=6),
+        query_subranges=[SequenceRange(start=1, end=6)],
+        hit_range=SequenceRange(start=1, end=6),
+        hit_subranges=[SequenceRange(start=1, end=6)],
+    )
+
+    tx = await _select_protein_reference(target_gene, align_result)
+    assert tx.np == "NP_G2_A"
+
+
 @pytest.mark.asyncio(scope="module")
 async def test_tx_src(
     scoreset_metadata_fixture: dict[str, ScoresetMetadata],