diff --git a/alembic/versions/dcf8572d3a17_add_mapped_hgnc_name_to_targets.py b/alembic/versions/dcf8572d3a17_add_mapped_hgnc_name_to_targets.py new file mode 100644 index 00000000..90a6fa18 --- /dev/null +++ b/alembic/versions/dcf8572d3a17_add_mapped_hgnc_name_to_targets.py @@ -0,0 +1,29 @@ +"""add mapped hgnc name to targets + +Revision ID: dcf8572d3a17 +Revises: b22b450d409c +Create Date: 2025-12-12 17:32:40.147429 + +""" + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "dcf8572d3a17" +down_revision = "b22b450d409c" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("target_genes", sa.Column("mapped_hgnc_name", sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("target_genes", "mapped_hgnc_name") + # ### end Alembic commands ### diff --git a/src/mavedb/lib/mapping.py b/src/mavedb/lib/mapping.py index f0303e16..d3915f53 100644 --- a/src/mavedb/lib/mapping.py +++ b/src/mavedb/lib/mapping.py @@ -13,11 +13,19 @@ class VRSMap: url: str + class GeneInfo(TypedDict): + hgnc_symbol: str + selection_method: str + + class TargetAnnotation(TypedDict): + gene_info: "VRSMap.GeneInfo" + layers: dict[str, dict[str, dict[str, Union[str, list[str]]]]] # layer -> computed/mapped reference -> field + class ScoreSetMappingResults(TypedDict): metadata: Optional[dict[str, str]] dcd_mapping_version: str mapped_date_utc: date - reference_sequences: Optional[dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]] + reference_sequences: Optional[dict[str, "VRSMap.TargetAnnotation"]] mapped_scores: Optional[list[dict]] error_message: Optional[str] diff --git a/src/mavedb/models/target_gene.py b/src/mavedb/models/target_gene.py index 0f149248..f46d64bc 100644 --- a/src/mavedb/models/target_gene.py +++ b/src/mavedb/models/target_gene.py @@ -47,6 +47,7 @@ 
class TargetGene(Base): pre_mapped_metadata: Mapped[JSONB] = Column("pre_mapped_metadata", JSONB, nullable=True) post_mapped_metadata: Mapped[JSONB] = Column("post_mapped_metadata", JSONB, nullable=True) + mapped_hgnc_name = Column(String, nullable=True) uniprot_id_from_mapped_metadata = Column(String, nullable=True) creation_date = Column(Date, nullable=False, default=date.today) diff --git a/src/mavedb/scripts/populate_mapped_variants.py b/src/mavedb/scripts/populate_mapped_variants.py index ca4b251a..de9eedbd 100644 --- a/src/mavedb/scripts/populate_mapped_variants.py +++ b/src/mavedb/scripts/populate_mapped_variants.py @@ -1,8 +1,8 @@ import logging -import click from datetime import date -from typing import Sequence, Optional +from typing import Optional, Sequence, Union +import click from sqlalchemy import cast, select from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import Session @@ -12,10 +12,9 @@ from mavedb.lib.logging.context import format_raised_exception_info_as_dict from mavedb.lib.mapping import ANNOTATION_LAYERS from mavedb.models.enums.mapping_state import MappingState -from mavedb.models.score_set import ScoreSet from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.score_set import ScoreSet from mavedb.models.variant import Variant - from mavedb.scripts.environment import script_environment, with_database_session logger = logging.getLogger(__name__) @@ -111,10 +110,16 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all ) # allow for multiple annotation layers pre_mapped_metadata = {} - post_mapped_metadata = {} + post_mapped_metadata: dict[str, Union[Optional[str], dict[str, str | list[str]]]] = {} excluded_pre_mapped_keys = {"sequence"} - for annotation_layer in reference_metadata[target_gene_identifier]: - layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get( + + gene_info = 
reference_metadata[target_gene_identifier].get("gene_info") + if gene_info: + target_gene.mapped_hgnc_name = gene_info.get("hgnc_symbol") + post_mapped_metadata["hgnc_name_selection_method"] = gene_info.get("selection_method") + + for annotation_layer in reference_metadata[target_gene_identifier]["layers"]: + layer_premapped = reference_metadata[target_gene_identifier]["layers"][annotation_layer].get( "computed_reference_sequence" ) if layer_premapped: @@ -122,7 +127,7 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all k: layer_premapped[k] for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys } - layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( + layer_postmapped = reference_metadata[target_gene_identifier]["layers"][annotation_layer].get( "mapped_reference_sequence" ) if layer_postmapped: diff --git a/src/mavedb/view_models/target_gene.py b/src/mavedb/view_models/target_gene.py index 48396a98..ccbebc5c 100644 --- a/src/mavedb/view_models/target_gene.py +++ b/src/mavedb/view_models/target_gene.py @@ -60,6 +60,7 @@ class SavedTargetGene(TargetGeneBase): target_sequence: Optional[SavedTargetSequence] = None target_accession: Optional[SavedTargetAccession] = None external_identifiers: Sequence[external_gene_identifier_offset.SavedExternalGeneIdentifierOffset] + mapped_hgnc_name: Optional[str] = None uniprot_id_from_mapped_metadata: Optional[str] = None _record_type_factory = record_type_validator()(set_record_type) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 6bd673be..3a690d97 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -453,21 +453,27 @@ async def map_variants_for_score_set( f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}." 
) # allow for multiple annotation layers - pre_mapped_metadata = {} - post_mapped_metadata = {} + pre_mapped_metadata: dict[str, Any] = {} + post_mapped_metadata: dict[str, Any] = {} excluded_pre_mapped_keys = {"sequence"} - for annotation_layer in reference_metadata[target_gene_identifier]: - layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get( - "computed_reference_sequence" - ) + + gene_info = reference_metadata[target_gene_identifier].get("gene_info") + if gene_info: + target_gene.mapped_hgnc_name = gene_info.get("hgnc_symbol") + post_mapped_metadata["hgnc_name_selection_method"] = gene_info.get("selection_method") + + for annotation_layer in reference_metadata[target_gene_identifier]["layers"]: + layer_premapped = reference_metadata[target_gene_identifier]["layers"][ + annotation_layer + ].get("computed_reference_sequence") if layer_premapped: pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = { k: layer_premapped[k] for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys } - layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get( - "mapped_reference_sequence" - ) + layer_postmapped = reference_metadata[target_gene_identifier]["layers"][ + annotation_layer + ].get("mapped_reference_sequence") if layer_postmapped: post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB) diff --git a/tests/conftest.py b/tests/conftest.py index c79c033e..b11f728c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,7 @@ import logging # noqa: F401 +import sys from datetime import datetime from unittest import mock -import sys import email_validator import pytest @@ -11,35 +11,33 @@ from sqlalchemy.pool import NullPool from mavedb.db.base import Base +from mavedb.models import * # noqa: F403 +from mavedb.models.experiment import Experiment from mavedb.models.experiment_set import ExperimentSet -from 
mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation -from mavedb.models.user import User, UserRole, Role from mavedb.models.license import License -from mavedb.models.taxonomy import Taxonomy -from mavedb.models.publication_identifier import PublicationIdentifier -from mavedb.models.experiment import Experiment -from mavedb.models.variant import Variant from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.publication_identifier import PublicationIdentifier from mavedb.models.score_set import ScoreSet - -from mavedb.models import * # noqa: F403 - +from mavedb.models.score_set_publication_identifier import ScoreSetPublicationIdentifierAssociation +from mavedb.models.taxonomy import Taxonomy +from mavedb.models.user import Role, User, UserRole +from mavedb.models.variant import Variant from tests.helpers.constants import ( ADMIN_USER, EXTRA_USER, - TEST_LICENSE, + TEST_BRNICH_SCORE_CALIBRATION, TEST_INACTIVE_LICENSE, + TEST_LICENSE, + TEST_PATHOGENICITY_SCORE_CALIBRATION, + TEST_PUBMED_IDENTIFIER, TEST_SAVED_TAXONOMY, TEST_USER, - VALID_VARIANT_URN, - VALID_SCORE_SET_URN, - VALID_EXPERIMENT_URN, - VALID_EXPERIMENT_SET_URN, - TEST_PUBMED_IDENTIFIER, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, TEST_VALID_PRE_MAPPED_VRS_ALLELE_VRS2_X, - TEST_BRNICH_SCORE_CALIBRATION, - TEST_PATHOGENICITY_SCORE_CALIBRATION, + VALID_EXPERIMENT_SET_URN, + VALID_EXPERIMENT_URN, + VALID_SCORE_SET_URN, + VALID_VARIANT_URN, ) sys.path.append(".") @@ -56,7 +54,7 @@ assert pytest_postgresql.factories # Allow the @test domain name through our email validator. 
-email_validator.SPECIAL_USE_DOMAIN_NAMES.remove("test") +email_validator.TEST_ENVIRONMENT = True @pytest.fixture() diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 1a219f17..32918235 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -1189,27 +1189,38 @@ } } + +TEST_GENE_INFO = { + "hgnc_symbol": VALID_GENE, + "selection_method": "tx_selection", +} + + +TEST_GENOMIC_LAYER = { + "computed_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.ref_test", + "sequence": "ACGTTT", + }, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": [VALID_CHR_ACCESSION], + }, +} + +TEST_CODING_LAYER = { + "mapped_reference_sequence": { + "sequence_accessions": [VALID_NT_ACCESSION], + }, +} + TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD = { "metadata": {}, "reference_sequences": { "TEST1": { - "g": { - "computed_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.ref_test", - "sequence": "ACGTTT", - }, - "mapped_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.map_test", - "sequence_accessions": [VALID_CHR_ACCESSION], - }, - }, - "c": { - "mapped_reference_sequence": { - "sequence_accessions": [VALID_NT_ACCESSION], - }, - }, + "gene_info": TEST_GENE_INFO, + "layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER}, } }, "mapped_scores": [], @@ -1222,23 +1233,8 @@ "metadata": {}, "reference_sequences": { "TEST2": { - "g": { - "computed_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.ref_test", - "sequence": "ACGTTT", - }, - "mapped_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.map_test", - "sequence_accessions": [VALID_CHR_ACCESSION], - }, - }, - "c": { - "mapped_reference_sequence": { - "sequence_accessions": [VALID_NT_ACCESSION], - }, - }, + "gene_info": TEST_GENE_INFO, + "layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER}, } 
}, "mapped_scores": [], @@ -1251,42 +1247,12 @@ "metadata": {}, "reference_sequences": { "TEST3": { - "g": { - "computed_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.ref_test3", - "sequence": "ACGTTT", - }, - "mapped_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.map_test", - "sequence_accessions": [VALID_CHR_ACCESSION], - }, - }, - "c": { - "mapped_reference_sequence": { - "sequence_accessions": [VALID_NT_ACCESSION], - }, - }, + "gene_info": TEST_GENE_INFO, + "layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER}, }, "TEST4": { - "g": { - "computed_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.ref_test4", - "sequence": "TAATGCC", - }, - "mapped_reference_sequence": { - "sequence_type": "dna", - "sequence_id": "ga4gh:SQ.map_test", - "sequence_accessions": [VALID_CHR_ACCESSION], - }, - }, - "c": { - "mapped_reference_sequence": { - "sequence_accessions": [VALID_NT_ACCESSION], - }, - }, + "gene_info": TEST_GENE_INFO, + "layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER}, }, }, "mapped_scores": [],