Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions alembic/versions/dcf8572d3a17_add_mapped_hgnc_name_to_targets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""add mapped hgnc name to targets

Revision ID: dcf8572d3a17
Revises: b22b450d409c
Create Date: 2025-12-12 17:32:40.147429

"""

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the migration it
# builds on; they mirror the "Revision ID" / "Revises" lines in the module
# docstring above and must stay in sync with them.
revision = "dcf8572d3a17"
down_revision = "b22b450d409c"
# No branch labels or cross-branch dependencies are declared for this migration.
branch_labels = None
depends_on = None


def upgrade():
    """Add the nullable ``mapped_hgnc_name`` string column to ``target_genes``."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Nullable so that existing rows need no backfill when the column appears.
    mapped_hgnc_name = sa.Column("mapped_hgnc_name", sa.String(), nullable=True)
    op.add_column("target_genes", mapped_hgnc_name)
    # ### end Alembic commands ###


def downgrade():
    """Drop the ``mapped_hgnc_name`` column from ``target_genes``."""
    # ### commands auto generated by Alembic - please adjust! ###
    table_name, column_name = "target_genes", "mapped_hgnc_name"
    op.drop_column(table_name, column_name)
    # ### end Alembic commands ###
10 changes: 9 additions & 1 deletion src/mavedb/lib/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,19 @@
class VRSMap:
url: str

class GeneInfo(TypedDict):
hgnc_symbol: str
selection_method: str

class TargetAnnotation(TypedDict):
gene_info: "VRSMap.GeneInfo"
layers: dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]

class ScoreSetMappingResults(TypedDict):
metadata: Optional[dict[str, str]]
dcd_mapping_version: str
mapped_date_utc: date
reference_sequences: Optional[dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]]
reference_sequences: Optional[dict[str, "VRSMap.TargetAnnotation"]]
mapped_scores: Optional[list[dict]]
error_message: Optional[str]

Expand Down
1 change: 1 addition & 0 deletions src/mavedb/models/target_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class TargetGene(Base):

pre_mapped_metadata: Mapped[JSONB] = Column("pre_mapped_metadata", JSONB, nullable=True)
post_mapped_metadata: Mapped[JSONB] = Column("post_mapped_metadata", JSONB, nullable=True)
mapped_hgnc_name = Column(String, nullable=True)
uniprot_id_from_mapped_metadata = Column(String, nullable=True)

creation_date = Column(Date, nullable=False, default=date.today)
Expand Down
21 changes: 13 additions & 8 deletions src/mavedb/scripts/populate_mapped_variants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging
import click
from datetime import date
from typing import Sequence, Optional
from typing import Optional, Sequence, Union

import click
from sqlalchemy import cast, select
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Session
Expand All @@ -12,10 +12,9 @@
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
from mavedb.lib.mapping import ANNOTATION_LAYERS
from mavedb.models.enums.mapping_state import MappingState
from mavedb.models.score_set import ScoreSet
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.score_set import ScoreSet
from mavedb.models.variant import Variant

from mavedb.scripts.environment import script_environment, with_database_session

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -111,18 +110,24 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all
)
# allow for multiple annotation layers
pre_mapped_metadata = {}
post_mapped_metadata = {}
post_mapped_metadata: dict[str, Union[Optional[str], dict[str, dict[str, str | list[str]]]]] = {}
excluded_pre_mapped_keys = {"sequence"}
for annotation_layer in reference_metadata[target_gene_identifier]:
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(

gene_info = reference_metadata[target_gene_identifier].get("gene_info")
if gene_info:
target_gene.mapped_hgnc_name = gene_info.get("hgnc_symbol")
post_mapped_metadata["hgnc_name_selection_method"] = gene_info.get("selection_method")

for annotation_layer in reference_metadata[target_gene_identifier]["layers"]:
layer_premapped = reference_metadata[target_gene_identifier]["layers"][annotation_layer].get(
"computed_reference_sequence"
)
if layer_premapped:
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
k: layer_premapped[k]
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
}
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
layer_postmapped = reference_metadata[target_gene_identifier]["layers"][annotation_layer].get(
"mapped_reference_sequence"
)
if layer_postmapped:
Expand Down
1 change: 1 addition & 0 deletions src/mavedb/view_models/target_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class SavedTargetGene(TargetGeneBase):
target_sequence: Optional[SavedTargetSequence] = None
target_accession: Optional[SavedTargetAccession] = None
external_identifiers: Sequence[external_gene_identifier_offset.SavedExternalGeneIdentifierOffset]
mapped_hgnc_name: Optional[str] = None
uniprot_id_from_mapped_metadata: Optional[str] = None

_record_type_factory = record_type_validator()(set_record_type)
Expand Down
24 changes: 15 additions & 9 deletions src/mavedb/worker/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,21 +453,27 @@ async def map_variants_for_score_set(
f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}."
)
# allow for multiple annotation layers
pre_mapped_metadata = {}
post_mapped_metadata = {}
pre_mapped_metadata: dict[str, Any] = {}
post_mapped_metadata: dict[str, Any] = {}
excluded_pre_mapped_keys = {"sequence"}
for annotation_layer in reference_metadata[target_gene_identifier]:
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
"computed_reference_sequence"
)

gene_info = reference_metadata[target_gene_identifier].get("gene_info")
if gene_info:
target_gene.mapped_hgnc_name = gene_info.get("hgnc_symbol")
post_mapped_metadata["hgnc_name_selection_method"] = gene_info.get("selection_method")

for annotation_layer in reference_metadata[target_gene_identifier]["layers"]:
layer_premapped = reference_metadata[target_gene_identifier]["layers"][
annotation_layer
].get("computed_reference_sequence")
if layer_premapped:
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
k: layer_premapped[k]
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
}
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
"mapped_reference_sequence"
)
layer_postmapped = reference_metadata[target_gene_identifier]["layers"][
annotation_layer
].get("mapped_reference_sequence")
if layer_postmapped:
post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped
target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB)
Expand Down
36 changes: 17 additions & 19 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging # noqa: F401
import sys
from datetime import datetime
from unittest import mock
import sys

import email_validator
import pytest
Expand All @@ -11,35 +11,33 @@
from sqlalchemy.pool import NullPool

from mavedb.db.base import Base
from mavedb.models import * # noqa: F403
from mavedb.models.experiment import Experiment
from mavedb.models.experiment_set import ExperimentSet
from mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation
from mavedb.models.user import User, UserRole, Role
from mavedb.models.license import License
from mavedb.models.taxonomy import Taxonomy
from mavedb.models.publication_identifier import PublicationIdentifier
from mavedb.models.experiment import Experiment
from mavedb.models.variant import Variant
from mavedb.models.mapped_variant import MappedVariant
from mavedb.models.publication_identifier import PublicationIdentifier
from mavedb.models.score_set import ScoreSet

from mavedb.models import * # noqa: F403

from mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation
from mavedb.models.taxonomy import Taxonomy
from mavedb.models.user import Role, User, UserRole
from mavedb.models.variant import Variant
from tests.helpers.constants import (
ADMIN_USER,
EXTRA_USER,
TEST_LICENSE,
TEST_BRNICH_SCORE_CALIBRATION,
TEST_INACTIVE_LICENSE,
TEST_LICENSE,
TEST_PATHOGENICITY_SCORE_CALIBRATION,
TEST_PUBMED_IDENTIFIER,
TEST_SAVED_TAXONOMY,
TEST_USER,
VALID_VARIANT_URN,
VALID_SCORE_SET_URN,
VALID_EXPERIMENT_URN,
VALID_EXPERIMENT_SET_URN,
TEST_PUBMED_IDENTIFIER,
TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X,
TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X,
TEST_BRNICH_SCORE_CALIBRATION,
TEST_PATHOGENICITY_SCORE_CALIBRATION,
VALID_EXPERIMENT_SET_URN,
VALID_EXPERIMENT_URN,
VALID_SCORE_SET_URN,
VALID_VARIANT_URN,
)

sys.path.append(".")
Expand All @@ -56,7 +54,7 @@
assert pytest_postgresql.factories

# Allow the @test domain name through our email validator.
email_validator.SPECIAL_USE_DOMAIN_NAMES.remove("test")
email_validator.TEST_ENVIRONMENT = True


@pytest.fixture()
Expand Down
102 changes: 34 additions & 68 deletions tests/helpers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,27 +1189,38 @@
}
}


# Gene-level annotation placed under the "gene_info" key of each target in the
# mapping scaffolds below. It carries the HGNC symbol and the name of the
# method used to select it.
TEST_GENE_INFO = {
    "hgnc_symbol": VALID_GENE,
    "selection_method": "tx_selection",
}


# Genomic ("g") annotation layer: has both a computed (pre-mapped) and a mapped
# (post-mapped) reference sequence.
# NOTE: this single dict object is reused by every scaffold target below, so
# tests should treat it as read-only.
TEST_GENOMIC_LAYER = {
    "computed_reference_sequence": {
        "sequence_type": "dna",
        "sequence_id": "ga4gh:SQ.ref_test",
        "sequence": "ACGTTT",
    },
    "mapped_reference_sequence": {
        "sequence_type": "dna",
        "sequence_id": "ga4gh:SQ.map_test",
        "sequence_accessions": [VALID_CHR_ACCESSION],
    },
}

# Coding ("c") annotation layer: only a mapped reference sequence, with no
# computed (pre-mapped) entry. Also shared by reference across scaffold targets.
TEST_CODING_LAYER = {
    "mapped_reference_sequence": {
        "sequence_accessions": [VALID_NT_ACCESSION],
    },
}

TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD = {
"metadata": {},
"reference_sequences": {
"TEST1": {
"g": {
"computed_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.ref_test",
"sequence": "ACGTTT",
},
"mapped_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.map_test",
"sequence_accessions": [VALID_CHR_ACCESSION],
},
},
"c": {
"mapped_reference_sequence": {
"sequence_accessions": [VALID_NT_ACCESSION],
},
},
"gene_info": TEST_GENE_INFO,
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
}
},
"mapped_scores": [],
Expand All @@ -1222,23 +1233,8 @@
"metadata": {},
"reference_sequences": {
"TEST2": {
"g": {
"computed_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.ref_test",
"sequence": "ACGTTT",
},
"mapped_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.map_test",
"sequence_accessions": [VALID_CHR_ACCESSION],
},
},
"c": {
"mapped_reference_sequence": {
"sequence_accessions": [VALID_NT_ACCESSION],
},
},
"gene_info": TEST_GENE_INFO,
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
}
},
"mapped_scores": [],
Expand All @@ -1251,42 +1247,12 @@
"metadata": {},
"reference_sequences": {
"TEST3": {
"g": {
"computed_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.ref_test3",
"sequence": "ACGTTT",
},
"mapped_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.map_test",
"sequence_accessions": [VALID_CHR_ACCESSION],
},
},
"c": {
"mapped_reference_sequence": {
"sequence_accessions": [VALID_NT_ACCESSION],
},
},
"gene_info": TEST_GENE_INFO,
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
},
"TEST4": {
"g": {
"computed_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.ref_test4",
"sequence": "TAATGCC",
},
"mapped_reference_sequence": {
"sequence_type": "dna",
"sequence_id": "ga4gh:SQ.map_test",
"sequence_accessions": [VALID_CHR_ACCESSION],
},
},
"c": {
"mapped_reference_sequence": {
"sequence_accessions": [VALID_NT_ACCESSION],
},
},
"gene_info": TEST_GENE_INFO,
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
},
},
"mapped_scores": [],
Expand Down