BioAnalyticResource · rmobmina · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 13, 2026
diff --git a/.gitignore b/.gitignore
@@ -144,3 +144,6 @@ output/*
 
 # Local sqlite mirrors generated from MySQL dumps
 config/databases/*.db
+
+# Archive test data — large DB dumps, exports, and CSVs; not for version control
+archive/test_data/
diff --git a/api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv b/api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv
diff --git a/api/Archive/efp_tables_structure_sample_data_dump_01_28_25.sql b/api/Archive/efp_tables_structure_sample_data_dump_01_28_25.sql
diff --git a/api/Archive/embryo_efp_feb_6_2025_dump.sql b/api/Archive/embryo_efp_feb_6_2025_dump.sql
diff --git a/api/Archive/sample_data_export_feb_4.csv b/api/Archive/sample_data_export_feb_4.csv
diff --git a/api/Archive/sample_data_results.csv b/api/Archive/sample_data_results.csv
diff --git a/api/Archive/schema.sql b/api/Archive/schema.sql
diff --git a/api/Archive/structural_diffs_sample_data_dump_01_28_25.csv b/api/Archive/structural_diffs_sample_data_dump_01_28_25.csv
diff --git a/api/Archive/structural_diffs_sample_data_dump_01_28_25.sql b/api/Archive/structural_diffs_sample_data_dump_01_28_25.sql
diff --git a/api/__init__.py b/api/__init__.py
@@ -1,15 +1,52 @@
+import re as _re
+import sqlite3
+import statistics as _statistics
+
 from flask import Flask
 from flask_sqlalchemy import SQLAlchemy
 from flask_restx import Api
 from flask_cors import CORS
 from flask_caching import Cache
 from flask_limiter import Limiter
 from flask_limiter.util import get_remote_address
+from sqlalchemy import event
+from sqlalchemy.engine import Engine
 import os
 from pathlib import Path
 import tempfile
 
 
+@event.listens_for(Engine, "connect")
+def _register_sqlite_functions(dbapi_conn, connection_record):
+    """Register MySQL-style std() and regexp_replace() on SQLite connections (CI and local tests)."""
+    if not isinstance(dbapi_conn, sqlite3.Connection):
+        return  # connections from any other DBAPI driver are left untouched
+
+    class _PopStdDev:
+        """Population standard deviation aggregate (equivalent to MySQL STD()); NULL inputs are skipped."""
+
+        def __init__(self):
+            self._vals = []
+
+        def step(self, value):
+            if value is not None:
+                self._vals.append(float(value))
+
+        def finalize(self):
+            # NULL only for zero non-NULL inputs; one value gives 0.0, as MySQL STD() does.
+            return _statistics.pstdev(self._vals) if self._vals else None
+
+    dbapi_conn.create_aggregate("std", 1, _PopStdDev)
+
+    def _regexp_replace(string, pattern, replacement):
+        # Any NULL argument yields NULL; `pattern` is interpreted by Python's `re` module.
+        if string is None or pattern is None or replacement is None:
+            return None
+        return _re.sub(pattern, replacement, string)
+
+    dbapi_conn.create_function("regexp_replace", 3, _regexp_replace)
+
+
 def create_app():
     """Initialize the app factory based on the official Flask documentation"""
     bar_app = Flask(__name__)

diff --git a/api/models/efp_dynamic.py b/api/models/efp_dynamic.py
@@ -1,8 +1,11 @@
 """
-Dynamic SQLAlchemy model generation for simple eFP databases.
+Reena Obmina | BCB330 Project 2025-2026 | University of Toronto
 
-This module provides runtime generation of SQLAlchemy ORM models from schema
-definitions, enabling dynamic database access without hardcoded model classes.
+Dynamic SQLAlchemy model generation for all eFP databases.
+
+At import time, one ORM model class is generated per database entry in
+SIMPLE_EFP_DATABASE_SCHEMAS and stored in SIMPLE_EFP_SAMPLE_MODELS.
+This replaces ~1,984 lines of hand-written boilerplate with a single registry.
 """
 
 from __future__ import annotations

diff --git a/api/models/efp_schemas.py b/api/models/efp_schemas.py
@@ -1,8 +1,12 @@
 """
-Compact schema definitions for eFP databases exposing a sample_data table.
+Reena Obmina | BCB330 Project 2025-2026 | University of Toronto
 
-Each database only needs 3 columns: data_probeset_id, data_signal, data_bot_id.
-All databases share the same column structure (VARCHAR(255) for string columns).
+Schema definitions for all eFP databases that expose a sample_data table.
+
+Every database shares the same three-column structure:
+  data_probeset_id (VARCHAR 255), data_signal (FLOAT), data_bot_id (VARCHAR 255).
+
+To add a new database, append one tuple to _SPECS — no other changes needed.
 """
 
 from __future__ import annotations
@@ -25,11 +29,12 @@
 
 
 def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
-    """Build a schema for one eFP database.
+    """Build a schema entry for one eFP database.
 
-    :param species: Species name for metadata.
-    :param charset: MySQL character set ('latin1' or 'utf8mb4').
-    :return: Full database schema specification.
+    :param species: Species name stored in metadata (e.g., 'arabidopsis').
+    :param charset: MySQL character set — 'latin1' for most, 'utf8mb4' for non-Latin labels.
+    :returns: Full database schema dict ready for model generation.
+    :rtype: DatabaseSpec
     """
     return {
         **_SCHEMA_TEMPLATE,
@@ -234,9 +239,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
     ("willow", "willow"),
 ]
 
-# databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
-# For Arabidopsis databases in this set, the API will auto-convert AGI → probeset
-# via the at_agi_lookup service before querying expression data.
+# Databases that store Affymetrix/microarray probeset IDs instead of gene identifiers.
 _PROBESET_DBS = {
     # Arabidopsis microarray databases (Affymetrix ATH1 chip, need AGI→probeset lookup)
     "affydb",
@@ -272,7 +275,7 @@ def _schema(species: str, charset: str = "latin1") -> DatabaseSpec:
     "triticale_mas",
 }
 
-# databases that use utf8mb4 charset (all others default to latin1)
+# Databases that use utf8mb4 charset (all others default to latin1)
 _UTF8MB4 = {
     "actinidia_bud_development", "actinidia_flower_fruit_development",
     "actinidia_postharvest", "actinidia_vegetative_growth", "apple",

diff --git a/api/resources/gene_expression.py b/api/resources/gene_expression.py
@@ -1,3 +1,13 @@
+"""
+Reena Obmina | BCB330 Project 2025-2026 | University of Toronto
+
+REST endpoint for gene expression queries across all eFP databases.
+
+Routes: GET /gene_expression/expression/<database>/<gene_id>
+
+All gene IDs are validated by species before reaching the query layer.
+Probeset conversion is applied automatically for microarray databases.
+"""
 from flask_restx import Namespace, Resource
 from markupsafe import escape
 
@@ -36,6 +46,8 @@
 )
 class GeneExpression(Resource):
     def get(self, database, gene_id):
+        """Retrieve expression values for a gene from a given eFP database.
+        """
         database = str(escape(database))
         gene_id = str(escape(gene_id))
 
@@ -58,7 +70,7 @@ def get(self, database, gene_id):
             # 4. Normalise (e.g. strip maize transcript suffix _T##)
             gene_id = normalize_gene_id(gene_id, species)
 
-            # 5. Microarray / non-direct databases need gene ID → probeset conversion
+            # 5. Microarray / non-direct databases need gene ID -> probeset conversion
             if database in PROBESET_DATABASES:
                 probeset, err = convert_gene_to_probeset(gene_id, species, database)
                 if err:
@@ -71,7 +83,13 @@ def get(self, database, gene_id):
 
         if result["success"]:
             return BARUtils.success_exit(result)
-        return BARUtils.error_exit(result["error"]), result.get("error_code", 500)
+
+        error_code = result.get("error_code", 500)
+        if error_code == 404:
+            return BARUtils.error_exit("No data found for the given gene"), 404
+        if error_code == 503:
+            return BARUtils.error_exit("Database not available"), 503
+        return BARUtils.error_exit("An error occurred"), 500
 
 
 gene_expression.add_resource(GeneExpression, "/expression/<string:database>/<string:gene_id>")
diff --git a/api/resources/gene_information.py b/api/resources/gene_information.py
@@ -100,11 +100,16 @@ def post(self):
 
         if len(rows) > 0:
             for row in rows:
-                if row.agi in data_items.keys():
-                    data_items[row.agi].append(row.agi)
+                normalized_agi = BARUtils.normalize_arabidopsis_gene(row.agi)
+                alias_value = row.alias
+                if BARUtils.is_arabidopsis_gene_valid(alias_value):
+                    alias_value = BARUtils.normalize_arabidopsis_gene(alias_value)
+
+                if normalized_agi in data_items.keys():
+                    data_items[normalized_agi].append(normalized_agi)
                 else:
-                    data_items[row.agi] = []
-                    data_items[row.agi].append(row.alias)
+                    data_items[normalized_agi] = []
+                    data_items[normalized_agi].append(alias_value)
 
             for gene in data_items.keys():
                 data.append({"gene": gene, "aliases": data_items[gene]})

diff --git a/api/resources/microarray_gene_expression.py b/api/resources/microarray_gene_expression.py
@@ -5,9 +5,10 @@
 from api.models.efp_dynamic import SIMPLE_EFP_SAMPLE_MODELS
 from api.utils.bar_utils import BARUtils
 from api.utils.world_efp_utils import WorldeFPUtils
+from sqlalchemy import func
 import json
 
-# pull the dynamic model so this resource stays in sync with the schema catalog
+# Pull the dynamic model so this resource stays in sync with the schema catalog
 EcotypesSampleData = SIMPLE_EFP_SAMPLE_MODELS["arabidopsis_ecotypes"]
 
 microarray_gene_expression = Namespace(
@@ -33,7 +34,7 @@ def get(self, species="", gene_id=""):
             return BARUtils.error_exit("Invalid species")
         subquery = (
             db.select(AtAgiLookup.probeset)
-            .where(AtAgiLookup.agi == gene_id)
+            .where(func.lower(AtAgiLookup.agi) == gene_id.lower())
             .order_by(AtAgiLookup.date.desc())
             .limit(1)
             .subquery()
@@ -63,8 +64,7 @@ def get(self, species="", gene_id=""):
             return BARUtils.error_exit("There are no data found for the given gene")
 
 
-# endpoint made by reena
-# return view and database mappings for a given species
+# Endpoint made by Reena: returns view and database mappings for a given species
 @microarray_gene_expression.route("/<string:species>/databases")
 class GetDatabases(Resource):
     @microarray_gene_expression.param("species", _in="path", default="arabidopsis")
@@ -220,8 +220,7 @@ def get(self, species=""):
         return BARUtils.success_exit({"species": species, "databases": species_databases[species]})
 
 
-# endpoint made by reena
-# return control and sample mappings for a given species
+# Endpoint made by Reena: returns control and sample mappings for a given species
 @microarray_gene_expression.route("/<string:species>/<string:view>/samples")
 class GetSamples1(Resource):
     """This endpoint returns control and sample group mappings for a given species and view (or all views)"""
@@ -234,7 +233,7 @@ def get(self, species="", view=""):
         view = escape(view)
 
         try:
-            with open("data/efp_info/efp_species_view_info.json") as f:
+            with open("data/efp_info/efp_species_view_info_typed.json") as f:
                 all_species_data = json.load(f)
         except Exception as e:
             return BARUtils.error_exit(f"Data file missing or invalid: {e}")
@@ -252,6 +251,10 @@ def get(self, species="", view=""):
         if view not in species_data["views"]:
             return BARUtils.error_exit("Invalid view for this species")
 
-        return BARUtils.success_exit(
-            {"species": species, "view": view, "groups": species_data["views"][view]["groups"]}
-        )
+        view_data = species_data["views"][view]
+        return BARUtils.success_exit({
+            "species": species,
+            "view": view,
+            "data_type": view_data.get("data_type", "Unknown"),
+            "groups": view_data["groups"]
+        })
diff --git a/api/services/efp_bootstrap.py b/api/services/efp_bootstrap.py
@@ -1,7 +1,11 @@
 """
-Utilities to bootstrap the simple eFP databases directly from the shared schema
-definitions. Shared by the CLI script and the Flask endpoint so we only maintain
-one implementation.
+Reena Obmina | BCB330 Project 2025-2026 | University of Toronto
+
+Bootstrap utilities for creating eFP MySQL databases from the shared schema registry.
+
+Used by:
+  - scripts/bootstrap_simple_efp_dbs.py  (CLI)
+  - config/init.sh                        (Docker / CI)
 """
 
 from __future__ import annotations
@@ -86,10 +90,16 @@ def _build_table(metadata: MetaData, spec, db_name: str) -> Table:
 
 def _make_index_name(db_name: str, index_cols: Iterable[str], max_len: int = 64) -> str:
     """
-    Create a MySQL-safe index name capped at 64 characters.
+    Create a MySQL-safe index name, capped at 64 characters.
+
+    Falls back to a hash suffix if the name would be too long,
+    keeping names deterministic and collision-free.
 
-    If the generated name is too long, fall back to a truncated db_name with a stable hash
-    to keep names deterministic and avoid collisions.
+    :param db_name: Database name used as the index name prefix.
+    :param index_cols: Column names included in the index.
+    :param max_len: Maximum allowed index name length (MySQL limit is 64).
+    :returns: A valid MySQL index name.
+    :rtype: str
     """
     base = f"ix_{db_name}_{'_'.join(index_cols)}"
     if len(base) <= max_len:
@@ -159,7 +169,6 @@ def ensure_database(server_url: URL, db_name: str, charset: str) -> None:
 
     server_engine = create_engine(server_url)
     with server_engine.begin() as conn:
-        # Safe to use f-string here since we've validated the inputs above
         conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` DEFAULT CHARACTER SET {charset}"))