Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,6 @@ output/*

# Local sqlite mirrors generated from MySQL dumps
config/databases/*.db

# Archive test data — large DB dumps, exports, and CSVs; not for version control
archive/test_data/
1,128 changes: 0 additions & 1,128 deletions api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv

This file was deleted.

1,128 changes: 0 additions & 1,128 deletions api/Archive/efp_tables_structure_sample_data_dump_01_28_25.sql

This file was deleted.

79 changes: 0 additions & 79 deletions api/Archive/embryo_efp_feb_6_2025_dump.sql

This file was deleted.

956 changes: 0 additions & 956 deletions api/Archive/sample_data_export_feb_4.csv

This file was deleted.

1,729 changes: 0 additions & 1,729 deletions api/Archive/sample_data_results.csv

This file was deleted.

23,169 changes: 0 additions & 23,169 deletions api/Archive/schema.sql

This file was deleted.

114 changes: 0 additions & 114 deletions api/Archive/structural_diffs_sample_data_dump_01_28_25.csv

This file was deleted.

114 changes: 0 additions & 114 deletions api/Archive/structural_diffs_sample_data_dump_01_28_25.sql

This file was deleted.

37 changes: 37 additions & 0 deletions api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,52 @@
import re as _re
import sqlite3
import statistics as _statistics

from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_restx import Api
from flask_cors import CORS
from flask_caching import Cache
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from sqlalchemy import event
from sqlalchemy.engine import Engine
import os
from pathlib import Path
import tempfile


@event.listens_for(Engine, "connect")
def _register_sqlite_functions(dbapi_conn, connection_record):
    """Register MySQL-compatible functions for SQLite (used in CI and local tests).

    The listener is attached to the ``Engine`` class, so it is invoked for every
    new DBAPI connection; anything that is not a ``sqlite3`` connection is left
    untouched.

    Functions registered on the connection:

    - ``std(x)``: aggregate returning the population standard deviation.
    - ``regexp_replace(string, pattern, replacement)``: regex substitution
      implemented with Python's ``re.sub`` (Python regex and replacement syntax).

    :param dbapi_conn: Raw DBAPI connection that was just opened.
    :param connection_record: Pool record for the connection (unused).
    """
    if not isinstance(dbapi_conn, sqlite3.Connection):
        return

    class _PopStdDev:
        """Population standard deviation aggregate (equivalent to MySQL STD())."""

        def __init__(self):
            self._vals = []

        def step(self, value):
            # NULL inputs are skipped, as SQL aggregates ignore NULLs.
            if value is not None:
                self._vals.append(float(value))

        def finalize(self):
            # MySQL STD() yields NULL only when there are no non-NULL inputs;
            # a single value has a population standard deviation of 0, which
            # statistics.pstdev already returns for a one-element sequence.
            if not self._vals:
                return None
            return _statistics.pstdev(self._vals)

    dbapi_conn.create_aggregate("std", 1, _PopStdDev)

    def _regexp_replace(string, pattern, replacement):
        # Any NULL argument yields NULL; this also keeps re.sub from raising
        # TypeError on a None pattern or replacement.
        if string is None or pattern is None or replacement is None:
            return None
        return _re.sub(pattern, replacement, string)

    dbapi_conn.create_function("regexp_replace", 3, _regexp_replace)


def create_app():
"""Initialize the app factory based on the official Flask documentation"""
bar_app = Flask(__name__)
Expand Down
9 changes: 6 additions & 3 deletions api/models/efp_dynamic.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""
Dynamic SQLAlchemy model generation for simple eFP databases.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

This module provides runtime generation of SQLAlchemy ORM models from schema
definitions, enabling dynamic database access without hardcoded model classes.
Dynamic SQLAlchemy model generation for all eFP databases.

At import time, one ORM model class is generated per database entry in
SIMPLE_EFP_DATABASE_SCHEMAS and stored in SIMPLE_EFP_SAMPLE_MODELS.
This replaces ~1,984 lines of hand-written boilerplate with a single registry.
"""

from __future__ import annotations
Expand Down
25 changes: 14 additions & 11 deletions api/models/efp_schemas.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""
Compact schema definitions for eFP databases exposing a sample_data table.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

Each database only needs 3 columns: data_probeset_id, data_signal, data_bot_id.
All databases share the same column structure (VARCHAR(255) for string columns).
Schema definitions for all eFP databases that expose a sample_data table.

Every database shares the same three-column structure:
data_probeset_id (VARCHAR 255), data_signal (FLOAT), data_bot_id (VARCHAR 255).

To add a new database, append one tuple to _SPECS — no other changes needed.
"""

from __future__ import annotations
Expand All @@ -25,11 +29,12 @@


def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
"""Build a schema for one eFP database.
"""Build a schema entry for one eFP database.

:param species: Species name for metadata.
:param charset: MySQL character set ('latin1' or 'utf8mb4').
:return: Full database schema specification.
:param species: Species name stored in metadata (e.g., 'arabidopsis').
:param charset: MySQL character set — 'latin1' for most, 'utf8mb4' for non-Latin labels.
:returns: Full database schema dict ready for model generation.
:rtype: DatabaseSpec
"""
return {
**_SCHEMA_TEMPLATE,
Expand Down Expand Up @@ -234,9 +239,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
("willow", "willow"),
]

# databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
# For Arabidopsis databases in this set, the API will auto-convert AGI → probeset
# via the at_agi_lookup service before querying expression data.
# Databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
_PROBESET_DBS = {
# Arabidopsis microarray databases (Affymetrix ATH1 chip, need AGI→probeset lookup)
"affydb",
Expand Down Expand Up @@ -272,7 +275,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
"triticale_mas",
}

# databases that use utf8mb4 charset (all others default to latin1)
# Databases that use utf8mb4 charset (all others default to latin1)
_UTF8MB4 = {
"actinidia_bud_development", "actinidia_flower_fruit_development",
"actinidia_postharvest", "actinidia_vegetative_growth", "apple",
Expand Down
22 changes: 20 additions & 2 deletions api/resources/gene_expression.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
"""
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

REST endpoint for gene expression queries across all eFP databases.

Routes: GET /gene_expression/expression/<database>/<gene_id>

All gene IDs are validated by species before reaching the query layer.
Probeset conversion is applied automatically for microarray databases.
"""
from flask_restx import Namespace, Resource
from markupsafe import escape

Expand Down Expand Up @@ -36,6 +46,8 @@
)
class GeneExpression(Resource):
def get(self, database, gene_id):
"""Retrieve expression values for a gene from a given eFP database.
"""
database = str(escape(database))
gene_id = str(escape(gene_id))

Expand All @@ -58,7 +70,7 @@ def get(self, database, gene_id):
# 4. Normalise (e.g. strip maize transcript suffix _T##)
gene_id = normalize_gene_id(gene_id, species)

# 5. Microarray / non-direct databases need gene ID probeset conversion
# 5. Microarray / non-direct databases need gene ID -> probeset conversion
if database in PROBESET_DATABASES:
probeset, err = convert_gene_to_probeset(gene_id, species, database)
if err:
Expand All @@ -71,7 +83,13 @@ def get(self, database, gene_id):

if result["success"]:
return BARUtils.success_exit(result)
return BARUtils.error_exit(result["error"]), result.get("error_code", 500)

error_code = result.get("error_code", 500)
if error_code == 404:
return BARUtils.error_exit("No data found for the given gene"), 404
if error_code == 503:
return BARUtils.error_exit("Database not available"), 503
return BARUtils.error_exit("An error occurred"), 500


gene_expression.add_resource(GeneExpression, "/expression/<string:database>/<string:gene_id>")
13 changes: 9 additions & 4 deletions api/resources/gene_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,16 @@ def post(self):

if len(rows) > 0:
for row in rows:
if row.agi in data_items.keys():
data_items[row.agi].append(row.agi)
normalized_agi = BARUtils.normalize_arabidopsis_gene(row.agi)
alias_value = row.alias
if BARUtils.is_arabidopsis_gene_valid(alias_value):
alias_value = BARUtils.normalize_arabidopsis_gene(alias_value)

if normalized_agi in data_items.keys():
data_items[normalized_agi].append(normalized_agi)
else:
data_items[row.agi] = []
data_items[row.agi].append(row.alias)
data_items[normalized_agi] = []
data_items[normalized_agi].append(alias_value)

for gene in data_items.keys():
data.append({"gene": gene, "aliases": data_items[gene]})
Expand Down
23 changes: 13 additions & 10 deletions api/resources/microarray_gene_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from api.models.efp_dynamic import SIMPLE_EFP_SAMPLE_MODELS
from api.utils.bar_utils import BARUtils
from api.utils.world_efp_utils import WorldeFPUtils
from sqlalchemy import func
import json

# pull the dynamic model so this resource stays in sync with the schema catalog
# Pull the dynamic model so this resource stays in sync with the schema catalog
EcotypesSampleData = SIMPLE_EFP_SAMPLE_MODELS["arabidopsis_ecotypes"]

microarray_gene_expression = Namespace(
Expand All @@ -33,7 +34,7 @@ def get(self, species="", gene_id=""):
return BARUtils.error_exit("Invalid species")
subquery = (
db.select(AtAgiLookup.probeset)
.where(AtAgiLookup.agi == gene_id)
.where(func.lower(AtAgiLookup.agi) == gene_id.lower())
.order_by(AtAgiLookup.date.desc())
.limit(1)
.subquery()
Expand Down Expand Up @@ -63,8 +64,7 @@ def get(self, species="", gene_id=""):
return BARUtils.error_exit("There are no data found for the given gene")


# endpoint made by reena
# return view and database mappings for a given species
# Endpoint made by Reena
@microarray_gene_expression.route("/<string:species>/databases")
class GetDatabases(Resource):
@microarray_gene_expression.param("species", _in="path", default="arabidopsis")
Expand Down Expand Up @@ -220,8 +220,7 @@ def get(self, species=""):
return BARUtils.success_exit({"species": species, "databases": species_databases[species]})


# endpoint made by reena
# return control and sample mappings for a given species
# Endpoint made by Reena
@microarray_gene_expression.route("/<string:species>/<string:view>/samples")
class GetSamples1(Resource):
"""This endpoint returns control and sample group mappings for a given species and view (or all views)"""
Expand All @@ -234,7 +233,7 @@ def get(self, species="", view=""):
view = escape(view)

try:
with open("data/efp_info/efp_species_view_info.json") as f:
with open("data/efp_info/efp_species_view_info_typed.json") as f:
all_species_data = json.load(f)
except Exception as e:
return BARUtils.error_exit(f"Data file missing or invalid: {e}")
Expand All @@ -252,6 +251,10 @@ def get(self, species="", view=""):
if view not in species_data["views"]:
return BARUtils.error_exit("Invalid view for this species")

return BARUtils.success_exit(
{"species": species, "view": view, "groups": species_data["views"][view]["groups"]}
)
view_data = species_data["views"][view]
return BARUtils.success_exit({
"species": species,
"view": view,
"data_type": view_data.get("data_type", "Unknown"),
"groups": view_data["groups"]
})
23 changes: 16 additions & 7 deletions api/services/efp_bootstrap.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""
Utilities to bootstrap the simple eFP databases directly from the shared schema
definitions. Shared by the CLI script and the Flask endpoint so we only maintain
one implementation.
Reena Obmina | BCB330 Project 2025-2026 | University of Toronto

Bootstrap utilities for creating eFP MySQL databases from the shared schema registry.

Used by:
- scripts/bootstrap_simple_efp_dbs.py (CLI)
- config/init.sh (Docker / CI)
"""

from __future__ import annotations
Expand Down Expand Up @@ -86,10 +90,16 @@ def _build_table(metadata: MetaData, spec, db_name: str) -> Table:

def _make_index_name(db_name: str, index_cols: Iterable[str], max_len: int = 64) -> str:
"""
Create a MySQL-safe index name capped at 64 characters.
Create a MySQL-safe index name, capped at 64 characters.

Falls back to a hash suffix if the name would be too long,
keeping names deterministic and collision-free.

If the generated name is too long, fall back to a truncated db_name with a stable hash
to keep names deterministic and avoid collisions.
:param db_name: Database name used as the index name prefix.
:param index_cols: Column names included in the index.
:param max_len: Maximum allowed index name length (MySQL limit is 64).
:returns: A valid MySQL index name.
:rtype: str
"""
base = f"ix_{db_name}_{'_'.join(index_cols)}"
if len(base) <= max_len:
Expand Down Expand Up @@ -159,7 +169,6 @@ def ensure_database(server_url: URL, db_name: str, charset: str) -> None:

server_engine = create_engine(server_url)
with server_engine.begin() as conn:
# Safe to use f-string here since we've validated the inputs above
conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` DEFAULT CHARACTER SET {charset}"))


Expand Down
Loading
Loading