From ebd5a391663630b7114bdda8cf8236b3b116654e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Thu, 9 Oct 2025 14:30:13 -0500 Subject: [PATCH 01/52] feat: added log statement and hint to user that an empty dataframe indicates an incorrect taxon id Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 4f797658..38bd314a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -168,6 +168,11 @@ async def _build_matrix_results( """ matrix.dropna(subset="ensembl_gene_id", inplace=True) conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) + + # If any one column was returned entirely as "-", the conversion produced no usable IDs + if any(conversion[col].eq("-").all() for col in conversion.columns): + logger.critical(f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - is '{taxon}' the correct taxon ID for this data?") + conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") conversion = conversion.explode("ensembl_gene_id") conversion.reset_index(inplace=True, drop=True) From 7cab8f3a1793a696aec633702e23ac5e2099d770 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Thu, 23 Oct 2025 15:48:19 -0500 Subject: [PATCH 02/52] fix: do not drop na values; keep as much data as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index b83678d1..ada40a5e 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -367,7 +367,6 @@ async def _write_counts_matrix( final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) - final_matrix.dropna(inplace=True) final_matrix.to_csv(output_counts_matrix_filepath, index=False) logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") return final_matrix @@ -682,7 +681,7 @@ async def read_counts(file: Path) -> list[str]: ) # Remove NA values from entrez_gene_id dataframe column - return conversion["entrez_gene_id"].dropna().tolist() + return conversion["entrez_gene_id"].tolist() logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection") genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) @@ -709,7 +708,6 @@ async def read_counts(file: Path) -> list[str]: gene_info = gene_info[((~gene_info["entrez_gene_id"].isna()) & (~gene_info["ensembl_gene_id"].isna()) & (~gene_info["gene_symbol"].isna()))] gene_info.sort_values(by="ensembl_gene_id", inplace=True) - gene_info.dropna(inplace=True) output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) From 7282a342c99394f3db38127c19a27718ac523afa Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Thu, 23 Oct 2025 15:52:38 -0500 Subject: [PATCH 03/52] fix: allow empty genomic values Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index ada40a5e..3271025d 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -459,6 +459,7 @@ async def _create_config_df( # noqa: C901 fragment_label = f"{context_name}_{label}_fragment_size.txt"
frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label] + print(f"HERE: {prep}") if not frag_paths and prep.lower() != RNAType.TRNA.value.lower(): logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).") mean_frag = 100.0 @@ -705,8 +706,6 @@ async def read_counts(file: Path) -> list[str]: gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", pd.NA) gene_info.at[i, "ensembl_gene_id"] = ensembl_ids gene_info.at[i, "size"] = end_pos - start_pos - - gene_info = gene_info[((~gene_info["entrez_gene_id"].isna()) & (~gene_info["ensembl_gene_id"].isna()) & (~gene_info["gene_symbol"].isna()))] gene_info.sort_values(by="ensembl_gene_id", inplace=True) output_filepath.parent.mkdir(parents=True, exist_ok=True) @@ -855,10 +854,10 @@ async def rnaseq_preprocess( async def _main(): context_name = "notreatment" taxon = 9606 - como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/notreatment") - output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/notreatment/gene_info.csv") + como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/naiveB") + output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info_fixed.csv") output_trna_metadata_filepath = Path("/Users/joshl/Projects/COMO/main/data/config_sheets/trna_config.xlsx") - output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/notreatment/total-rna/totalrna_notreatment.csv") + output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv") await rnaseq_preprocess( context_name=context_name, From c903cc8697159dcdc12cd1b89c76c68d06992c55 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:21:47 -0600 Subject: [PATCH 04/52] chore: move from `np.float(ing,64,32)` to python's `float` Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 33 +++++++++---------- .../pipelines/build_condition_heatmaps.py | 2 +- main/como/plot/heatmap.py | 2 +- main/como/rnaseq_gen.py | 14 ++++---- main/como/rnaseq_preprocess.py | 2 +- main/como/sanity_checks/cobra_sanity_check.py | 2 +- main/como/sanity_checks/fast_leak_test.py | 2 +- main/como/stats/_two_sample.py | 12 +++---- main/como/stats/ks_test.py | 26 +++++++-------- main/como/stats/mann_whitney_test.py | 12 +++---- tests/unit/test_fisher_stats.py | 4 +-- tests/unit/test_ks_stats.py | 6 ++-- tests/unit/test_mwu_stats.py | 4 +-- 13 files changed, 59 insertions(+), 62 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 52a035ce..8985603a 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -168,13 +168,13 @@ def _feasibility_test(model_cobra: cobra.Model, step: str): def _build_with_gimme( reference_model: cobra.Model, - lower_bounds: Sequence[float | np.floating], - upper_bounds: Sequence[float | np.floating], + lower_bounds: Sequence[float] | npt.NDArray[float], + upper_bounds: Sequence[float] | npt.NDArray[float], idx_objective: int, - expr_vector: npt.NDArray[np.floating], + expr_vector: npt.NDArray[float], ): model_reconstruction = reference_model.copy() - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) + s_matrix: list[float] = list(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)) # `Becker and Palsson (2008). 
Context-specific metabolic networks are # consistent with experiments. PLoS Comput. Biol. 4, e1000082.` properties = GIMMEProperties( @@ -184,7 +184,7 @@ def _build_with_gimme( preprocess=True, flux_threshold=0.9, ) - algorithm = GIMME(s_matrix, lower_bounds, upper_bounds, properties) + algorithm = GIMME(s_matrix, list(lower_bounds), list(upper_bounds), properties) gene_activity = algorithm.run() reaction_ids = [r.id for r in model_reconstruction.reactions] to_remove_ids = [reaction_ids[r] for r in np.where(gene_activity == 0)[0]] @@ -219,26 +219,27 @@ def _build_with_fastcore(cobra_model, s_matrix, lower_bounds, upper_bounds, exp_ def _build_with_imat( reference_model: cobra.Model, - lower_bounds: Sequence[float], - upper_bounds: Sequence[float], + lower_bounds: Sequence[float] | npt.NDArray[float], + upper_bounds: Sequence[float] | npt.NDArray[float], expr_vector: npt.NDArray, expr_thresh: tuple[float, float], - force_gene_indices: Sequence[int], + force_reaction_indices: Sequence[int], solver: str, ) -> cobra.Model: properties: IMATProperties = IMATProperties( exp_vector=expr_vector, exp_thresholds=expr_thresh, - core=force_gene_indices, + # core=np.array(force_gene_indices).tolist(), + core=list(force_reaction_indices), # `list()` is required because imat uses `if core:`; this returns an error on numpy arrays epsilon=0.01, solver=solver.upper(), ) # Creating a copy of the model ensures we don't make any in-place modifications by accident # Using cobra to create the stoichiometry matrix means we have less work to do - force_gene_indices = np.array(force_gene_indices, dtype=np.uint16) + force_reaction_indices = np.array(force_reaction_indices, dtype=np.uint16) model_reconstruction: cobra.Model = reference_model.copy() - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) + s_matrix: npt.NDArray[float] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) algorithm: IMAT = IMAT(S=s_matrix, lb=np.array(lower_bounds), ub=np.array(upper_bounds), properties=properties) rxns_from_imat: npt.NDArray[np.uint16] = algorithm.run().astype(np.uint16) @@ -246,11 +247,7 @@ def _build_with_imat( all_rxn_ids: npt.NDArray[str] = np.array([r.id for r in model_reconstruction.reactions], dtype=object) all_rxn_indices: npt.NDArray[np.uint16] = np.array(range(len(model_reconstruction.reactions)), dtype=np.uint16) - # Collect reactions to keep by creating a unique set of reactions from the iMAT algorithm and force-include reactions - # dtype is set to uint16 because indices will not be below 0 or be greater than 65,535 (max size of uint16), - # because only ~10,000 reactions exist in Recon3D - # Unsafe casting is OK because of these facts. 
- rxn_indices_to_keep: npt.NDArray[np.uint16] = np.unique(np.concatenate([rxns_from_imat, force_gene_indices], dtype=np.uint16)) + rxn_indices_to_keep: npt.NDArray[int] = np.unique(np.concatenate([rxns_from_imat, force_reaction_indices], dtype=int)) # Reaction indices to exclude from the model are thus reactions that are not marked to be included in the model # Assume unique is false because every value that is in `rxn_indices_to_keep` is included in `all_rxn_indices` @@ -277,7 +274,7 @@ def _build_with_tinit( no_reverse_loops=True, ) model_reconstruction = reference_model.copy() - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction).astype(np.float64) + s_matrix: npt.NDArray[float] = np.asarray(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction), dtype=float) algorithm = tINIT(s_matrix, lower_bounds, upper_bounds, properties) algorithm.preprocessing() algorithm.build_problem() @@ -445,7 +442,7 @@ async def _build_model( # noqa: C901 high_thresh=high_thresh, low_thresh=low_thresh, ) - expression_vector: npt.NDArray[np.int32] = np.array(list(reaction_expression.values()), dtype=np.int32) + expression_vector: npt.NDArray[float] = np.array(list(reaction_expression.values()), dtype=float) for rxn in force_reactions: if rxn not in reaction_ids: diff --git a/main/como/pipelines/build_condition_heatmaps.py b/main/como/pipelines/build_condition_heatmaps.py index 16b970b8..03420489 100644 --- a/main/como/pipelines/build_condition_heatmaps.py +++ b/main/como/pipelines/build_condition_heatmaps.py @@ -140,7 +140,7 @@ def group_reactions_by_pathway(models: cobra.Model | list[cobra.Model], flux_df: # for condition in flux_df.index: # for pathway, reactions in pathways_by_reaction.items(): # pathway_flux.loc[condition, pathway] = flux_df.loc[condition, list(reactions)].sum() - pathway_fluxes: dict[str, pd.Series[npt.NDArray[np.floating]]] = {} + pathway_fluxes: dict[str, pd.Series[npt.NDArray[float]]] = {} for pathway, reactions in pathways_by_reaction.items(): reactions_in_df = list(reactions.intersection(flux_df.columns)) if reactions_in_df: diff --git a/main/como/plot/heatmap.py b/main/como/plot/heatmap.py index 0d6e4e55..6709a0a3 100644 --- a/main/como/plot/heatmap.py +++ b/main/como/plot/heatmap.py @@ -30,7 +30,7 @@ def condition_vs_pathway( The resulting `matpotlib.pyplt.Figure` object """ plot_df: pd.DataFrame = data.copy() if copy_df else data - plot_df = plot_df.astype(np.float32) + plot_df = plot_df.astype(float) fig = plt.figure(figsize=(100, 40), dpi=175) if exclude_zero_flux_pathways: diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 38bd314a..99121f8f 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -139,7 +139,7 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr Returns: A NumPy array of the filtered data. """ - if not isinstance(data, pd.DataFrame | npt.NDArray): + if not isinstance(data, pd.DataFrame | np.ndarray): _log_and_raise_error( f"Unsupported data type. 
Must be a Pandas DataFrame or a NumPy array, got '{type(data)}'", error=TypeError, @@ -272,11 +272,11 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: int = 1e6) -> NamedMetrics: for sample in range(metrics[study].num_samples): layout = metrics[study].layout[sample] - count_matrix: npt.NDArray[np.float32] = metrics[study].count_matrix.iloc[:, sample].values + count_matrix: npt.NDArray[float] = metrics[study].count_matrix.iloc[:, sample].values gene_lengths = ( - metrics[study].fragment_lengths[sample].astype(np.float32) + metrics[study].fragment_lengths[sample].astype(float) if layout == LayoutMethod.paired_end - else metrics[study].gene_sizes.astype(np.float32) + else metrics[study].gene_sizes.astype(float) ) gene_lengths_kb = gene_lengths / 1000.0 @@ -595,11 +595,11 @@ def cpm_filter( top_samples = round(n_top * len(counts.columns)) # noqa: F841 test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) for i in range(len(counts_per_million.columns)): - median_sum = np.float64(np.median(np.sum(counts[:, i]))) + median_sum = float(np.median(np.sum(counts[:, i]))) if cut_off == "default": # noqa: SIM108 - cutoff = np.float64(10e6) / median_sum + cutoff = float(10e6 / median_sum) else: - cutoff = np.float64(1e6 * cut_off) / median_sum + cutoff = float(1e6 * cut_off) / median_sum test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) return metrics diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 3271025d..142be019 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -262,7 +262,7 @@ async def _process_first_multirun_sample(strand_file: Path, all_counts_files: li # Set na values to 0 sample_count = sample_count.fillna(value="0") - sample_count["counts"] = sample_count["counts"].astype(np.float64) + sample_count["counts"] = sample_count["counts"].astype(float) count_sums = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean() count_sums["counts"] = np.ceil(count_sums["counts"].astype(np.uint32)) diff --git a/main/como/sanity_checks/cobra_sanity_check.py b/main/como/sanity_checks/cobra_sanity_check.py index fdd9fc28..af648dcd 100644 --- a/main/como/sanity_checks/cobra_sanity_check.py +++ b/main/como/sanity_checks/cobra_sanity_check.py @@ -134,7 +134,7 @@ def preprocess(model_path: Path, solver: Literal["glpk", "cplex", "gurobi"] = "g modelexchanges_ids: list[str] = [rxn.id for rxn in model_closed.reactions if re.search(pattern=r"^(EX|DM|sink)_", string=rxn.id)] # Step 2: Matrix-based method to find selected_exchanges reactions (single metabolite, 1 entry) - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model_closed) + s_matrix: npt.NDArray[float] = cobra.util.array.create_stoichiometric_matrix(model_closed) # Step 2: Convert it to a sparse CSC matrix to optimize memory usage and performance s_sparse = csc_matrix(s_matrix) diff --git a/main/como/sanity_checks/fast_leak_test.py b/main/como/sanity_checks/fast_leak_test.py index 0ce92c1a..55ec6b1b 100644 --- a/main/como/sanity_checks/fast_leak_test.py +++ b/main/como/sanity_checks/fast_leak_test.py @@ -27,7 +27,7 @@ def fast_leak_test( model: cobra.Model, test_reactions: Iterable[str], demand_test, - tolerance: np.float16 = 1e-6, + tolerance: float = 1e-6, ): """Run a Fast Leak Test. 
diff --git a/main/como/stats/_two_sample.py b/main/como/stats/_two_sample.py index d856ebeb..e97dffbf 100644 --- a/main/como/stats/_two_sample.py +++ b/main/como/stats/_two_sample.py @@ -9,8 +9,8 @@ T_BASE_SAMPLE = TypeVar("T_BASE_SAMPLE", bound="BaseTwoSample") T_ALTERNATIVE = Literal["greater", "less", "two-sided"] -KS_RESULT = tuple[np.floating, np.floating, np.floating, np.int8] -MW_RESULT = tuple[np.floating, np.floating] +KS_RESULT = tuple[float, float, float, int] +MW_RESULT = tuple[float, float] TEST_RESULT = TypeVar("TEST_RESULT", KS_RESULT, MW_RESULT) __all__ = ["BaseTwoSample"] @@ -21,7 +21,7 @@ class BaseTwoSample(ABC, Generic[TEST_RESULT]): @staticmethod @abstractmethod - def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> TEST_RESULT: ... + def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> TEST_RESULT: ... @property @abstractmethod @@ -40,10 +40,10 @@ def _run( df2: pd.DataFrame, cores: int = 1, worker_kwargs: dict | None = None, - ) -> tuple[list[str], Mapping[str, npt.NDArray[np.float64 | np.uint8]]]: + ) -> tuple[list[str], Mapping[str, npt.NDArray[float | np.uint8]]]: all_reactions = list(set(df1.columns) & set(df2.columns)) - array_a = df1[all_reactions].to_numpy(dtype=np.float64, copy=False) - array_b = df2[all_reactions].to_numpy(dtype=np.float64, copy=False) + array_a = df1[all_reactions].to_numpy(dtype=float, copy=False) + array_b = df2[all_reactions].to_numpy(dtype=float, copy=False) n = len(all_reactions) results = {field: np.empty(n, dtype=np.dtype(dtype)) for field, dtype in cls._fields.items()} diff --git a/main/como/stats/ks_test.py b/main/como/stats/ks_test.py index 9f3c1778..68fb1c2f 100644 --- a/main/como/stats/ks_test.py +++ b/main/como/stats/ks_test.py @@ -14,20 +14,20 @@ @dataclass(frozen=True, kw_only=True, slots=True) class KSTest(BaseTwoSample[KS_RESULT]): _fields: ClassVar[dict[str, type]] = { - "statistic": np.float64, - "pvalue": np.float64, - "statistic_location": np.float64, - "statistic_sign": np.uint8, + "statistic": float, + "pvalue": float, + "statistic_location": float, + "statistic_sign": int, } reaction_ids: list[str] - statistic: npt.NDArray[np.float64] - pvalue: npt.NDArray[np.float64] - statistic_location: npt.NDArray[np.float64] - statistic_sign: npt.NDArray[np.int8] + statistic: npt.NDArray[float] + pvalue: npt.NDArray[float] + statistic_location: npt.NDArray[float] + statistic_sign: npt.NDArray[int] @staticmethod - def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> KS_RESULT: + def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> KS_RESULT: """Calculate the KS statistic. 
Args: @@ -78,10 +78,10 @@ def run( ) return cls( reaction_ids=all_reactions, - statistic=results["statistic"].astype(np.float64), - pvalue=results["pvalue"].astype(np.float64), - statistic_location=results["statistic_location"].astype(np.float64), - statistic_sign=results["statistic_sign"].astype(np.int8), + statistic=results["statistic"].astype(float), + pvalue=results["pvalue"].astype(float), + statistic_location=results["statistic_location"].astype(float), + statistic_sign=results["statistic_sign"].astype(int), ) @property diff --git a/main/como/stats/mann_whitney_test.py b/main/como/stats/mann_whitney_test.py index 98c8131d..8e63f93c 100644 --- a/main/como/stats/mann_whitney_test.py +++ b/main/como/stats/mann_whitney_test.py @@ -13,14 +13,14 @@ @dataclass(frozen=True, kw_only=True, slots=True) class MannWhitneyUTest(BaseTwoSample[MW_RESULT]): - _fields: ClassVar[dict[str, type]] = {"statistic": np.float64, "pvalue": np.float64} + _fields: ClassVar[dict[str, type]] = {"statistic": float, "pvalue": float} reaction_ids: list[str] - statistic: npt.NDArray[np.float64] - pvalue: npt.NDArray[np.float64] + statistic: npt.NDArray[float] + pvalue: npt.NDArray[float] @staticmethod - def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> MW_RESULT: + def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> MW_RESULT: """Calculate the MWU statistic. Args: @@ -32,7 +32,7 @@ def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) A tuple of (statistic, pvalue) """ res = mannwhitneyu(x=a, y=b, **kwargs) - return np.float64(res.statistic), np.float64(res.pvalue) + return float(res.statistic), float(res.pvalue) @classmethod def run( @@ -67,7 +67,7 @@ def run( cores=cores, worker_kwargs={"alternative": alternative, "use_continuity": use_continuity, "axis": axis, "method": method}, ) - return cls(reaction_ids=all_reactions, statistic=results["statistic"].astype(np.float64), pvalue=results["pvalue"].astype(np.float64)) + return cls(reaction_ids=all_reactions, statistic=results["statistic"].astype(float), pvalue=results["pvalue"].astype(float)) @property def df(self) -> pd.DataFrame: diff --git a/tests/unit/test_fisher_stats.py b/tests/unit/test_fisher_stats.py index d7dad75c..dc212590 100644 --- a/tests/unit/test_fisher_stats.py +++ b/tests/unit/test_fisher_stats.py @@ -10,8 +10,8 @@ def test_fisher_stats(): assert real == FisherExactTest( pathway="Glycolysis/gluconeogenesis", - statistic=np.float64(4.321708185053381), - pvalue=np.float64(1.2883495211648955e-05), + statistic=4.321708185053381, + pvalue=1.2883495211648955e-05, a=32, b=10, c=4496, diff --git a/tests/unit/test_ks_stats.py b/tests/unit/test_ks_stats.py index 08403d46..32856809 100644 --- a/tests/unit/test_ks_stats.py +++ b/tests/unit/test_ks_stats.py @@ -7,7 +7,7 @@ @pytest.fixture def expected_result() -> pd.DataFrame: df = pd.read_csv("tests/inputs/expected_ks_results.csv", index_col=0) - df["statistic_sign"] = df["statistic_sign"].astype(np.int8) + df["statistic_sign"] = df["statistic_sign"].astype(int) return df @@ -21,12 +21,12 @@ def test_ks_stats(expected_result: pd.DataFrame, cores: int): df1 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(np.float32), + data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(float), ) df2 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(np.float32), 
+ data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(float), ) real_result = KSTest.run(df1, df2, cores=cores) diff --git a/tests/unit/test_mwu_stats.py b/tests/unit/test_mwu_stats.py index 9b1823e4..1889de2a 100644 --- a/tests/unit/test_mwu_stats.py +++ b/tests/unit/test_mwu_stats.py @@ -20,12 +20,12 @@ def test_mannwhitney_stats(expected_result: pd.DataFrame, cores: int): df1 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(np.float32), + data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(float), ) df2 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(np.float32), + data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(float), ) real_result = MannWhitneyUTest.run(df1, df2, cores=cores) From 5b39a1cddb5c74d4f9e371962d7048f98930e3be Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:22:46 -0600 Subject: [PATCH 05/52] chore: ruff formatting Signed-off-by: Josh Loecker --- tests/unit/test_fisher_stats.py | 1 + tests/unit/test_ks_stats.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/unit/test_fisher_stats.py b/tests/unit/test_fisher_stats.py index dc212590..0f11d185 100644 --- a/tests/unit/test_fisher_stats.py +++ b/tests/unit/test_fisher_stats.py @@ -1,5 +1,6 @@ import cobra import numpy as np + from como.stats.fisher_exact_test import FisherExactTest diff --git a/tests/unit/test_ks_stats.py b/tests/unit/test_ks_stats.py index 32856809..fa6e742b 100644 --- a/tests/unit/test_ks_stats.py +++ b/tests/unit/test_ks_stats.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest + from como.stats.ks_test import KSTest From 34451a9ece56836fe4d139f16282ef62ce276a28 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:24:11 -0600 Subject: [PATCH 06/52] chore(dev): ignore, but provide warning for, unused imports Signed-off-by: Josh Loecker --- ruff.toml | 73 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/ruff.toml b/ruff.toml index 4929a8e3..a556c25f 100644 --- a/ruff.toml +++ b/ruff.toml @@ -7,38 +7,41 @@ docstring-code-format = true [lint] # Linting rules: https://docs.astral.sh/ruff/rules/ +unfixable = [ + "F401", # warn about, but do not remove, unused imports +] + select = [ - "A", # do not use python builtins for variables or parameters; https://pypi.org/project/flake8-builtins/ - "ASYNC", # identify asynchronous-related problems; https://pypi.org/project/flake8-async/ - "B", # find likely bugs and design problems; https://pypi.org/project/flake8-bugbear/ - "C4", # create better list, set, & dict comprehensions; https://pypi.org/project/flake8-comprehensions/ - "C90", # check for complexity; https://pypi.org/project/mccabe/ - "D", # docstring style checker; https://pypi.org/project/pydocstyle/ - "DOC", # docstring linter; https://pypi.org/project/pydoclint/ - "E", # style guide checking; https://pypi.org/project/pycodestyle/ - "F", # check for errors; https://pypi.org/project/pyflakes/ - "RUF", # ruff-specific linting rules; https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf - "FA", # use from __future__ import annotations if needed; https://pypi.org/project/flake8-future-annotations/ - "FURB", # refurbish and modernize Python codebases; https://pypi.org/project/refurb/ - "I", # sorting imports rules; https://pypi.org/project/isort/ - 
"N", # check naming conventions; https://pypi.org/project/pep8-naming/ - "PERF", # check performance anti-patterns; https://pypi.org/project/perflint/ - "PT", # check common style issues or inconsistencies in pytest; https://pypi.org/project/flake8-pytest-style/ - "PTH", # use pathlib where possible; https://pypi.org/project/flake8-use-pathlib/ - "S", # security testing; https://pypi.org/project/flake8-bandit/ - "SIM", # check for code that can be simplified; https://pypi.org/project/flake8_simplify/ - "T20", # do not use prints in production; https://pypi.org/project/flake8-print/ - "TRY", # prevent Exception handling anti-patterns; https://pypi.org/project/tryceratops/ + "A", # do not use python builtins for variables or parameters; https://pypi.org/project/flake8-builtins/ + "ASYNC", # identify asynchronous-related problems; https://pypi.org/project/flake8-async/ + "B", # find likely bugs and design problems; https://pypi.org/project/flake8-bugbear/ + "C4", # create better list, set, & dict comprehensions; https://pypi.org/project/flake8-comprehensions/ + "C90", # check for complexity; https://pypi.org/project/mccabe/ + "D", # docstring style checker; https://pypi.org/project/pydocstyle/ + "DOC", # docstring linter; https://pypi.org/project/pydoclint/ + "E", # style guide checking; https://pypi.org/project/pycodestyle/ + "F", # check for errors; https://pypi.org/project/pyflakes/ + "RUF", # ruff-specific linting rules; https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf + "FA", # use from __future__ import annotations if needed; https://pypi.org/project/flake8-future-annotations/ + "FURB", # refurbish and modernize Python codebases; https://pypi.org/project/refurb/ + "I", # sorting imports rules; https://pypi.org/project/isort/ + "N", # check naming conventions; https://pypi.org/project/pep8-naming/ + "PERF", # check performance anti-patterns; https://pypi.org/project/perflint/ + "PT", # check common style issues or inconsistencies in pytest; https://pypi.org/project/flake8-pytest-style/ + "PTH", # use pathlib where possible; https://pypi.org/project/flake8-use-pathlib/ + "S", # security testing; https://pypi.org/project/flake8-bandit/ + "SIM", # check for code that can be simplified; https://pypi.org/project/flake8_simplify/ + "T20", # do not use prints in production; https://pypi.org/project/flake8-print/ + "TRY", # prevent Exception handling anti-patterns; https://pypi.org/project/tryceratops/ "UP" # upgrade syntax for newer versions; https://pypi.org/project/pyupgrade ] ignore = [ - "D100", # allow undocumented public module definitions - "D101", # allow undocumented public class - "D104", # allow undocumented public package - "D203", # do not require one blank line before class docstring - "D213", # first docstring line should be on the second line - "TRY003", # allow exception messages outside the `Exception` class - "F401", # allow unused imports + "D100", # allow undocumented public module definitions + "D101", # allow undocumented public class + "D104", # allow undocumented public package + "D203", # do not require one blank line before class docstring + "D213", # first docstring line should be on the second line + "TRY003", # allow exception messages outside the `Exception` class ] [lint.pydocstyle] @@ -46,13 +49,13 @@ convention = "google" [lint.per-file-ignores] "tests/*" = [ - "D101", # allow undocumented public class - "D102", # allow undocumented class method - "D103", # allow undocumented public method definitions - "D104", # allow undocumented public package - 
"F811", # allow redefinition of variables, required for pytest fixtures - "S101", # allow use of `assert` in test files + "D101", # allow undocumented public class + "D102", # allow undocumented class method + "D103", # allow undocumented public method definitions + "D104", # allow undocumented public package + "F811", # allow redefinition of variables, required for pytest fixtures + "S101", # allow use of `assert` in test files ] "main/COMO.ipynb" = [ - "E501", # allow long lines + "E501", # allow long lines ] From c0de5285b86b78ee62aedc6a5df64ecc0af27e93 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:24:44 -0600 Subject: [PATCH 07/52] fix: clipped values should use floats, not integers Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 38 ++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 7549d1f6..73c1d3e1 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -61,28 +61,26 @@ def _combine_z_distribution_for_batch( logger.trace(f"A single sample exists for batch '{batch.batch_num}'. Returning as-is because no additional combining can be done") return matrix - values = matrix.iloc[:, 1:].values - weighted_matrix = np.sum(values, axis=1) / np.sqrt(values.shape[1]) - weighted_matrix = np.clip(weighted_matrix, weighted_z_floor, weighted_z_ceiling).astype(np.int8) + values = matrix.values + weighted_matrix = np.clip( + a=np.sum(values, axis=1) / np.sqrt(values.shape[1]), # calculate a weighted matrix + a_min=weighted_z_floor, + a_max=weighted_z_ceiling, + ).astype(float) - merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1) - weighted_matrix = pd.DataFrame( - { - "ensembl_gene_id": matrix["ensembl_gene_id"], - "combine_z": weighted_matrix, - }, - ) - stack_df = pd.melt( - merge_df, - id_vars=["ensembl_gene_id"], - # Get all columns except ensembl_gene_id - value_vars=[col for col in merge_df.columns if col not in GeneIdentifier._member_map_], - var_name="source", - value_name="zscore", - ) - if len(stack_df["source"].unique()) > 10: - stack_df = stack_df[stack_df["source"] == "combined"] + weighted_matrix = pd.DataFrame({"combine_z": weighted_matrix}, index=matrix.index) + # merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1) + # stack_df = pd.melt( + # merge_df, + # id_vars=["ensembl_gene_id"], + # # Get all 'data' columns + # value_vars=[col for col in merge_df.columns if col not in GeneIdentifier._member_map_], + # var_name="source", + # value_name="zscore", + # ) + # if len(stack_df["source"].unique()) > 10: + # stack_df = stack_df[stack_df["source"] == "combined"] # graph_zscore_distribution( # stack_df, # title=f"Combined Z-score Distribution for {context_name} - batch #{batch.batch_num}", From 526da0d163c901a20086f6d483043ffdcbd49fc4 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:24:58 -0600 Subject: [PATCH 08/52] chore: import required modules Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 73c1d3e1..fa04a386 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -1,7 +1,8 @@ from __future__ import annotations -import asyncio +from collections.abc import Sequence from pathlib import Path +from typing 
import cast import numpy as np import pandas as pd @@ -17,10 +18,7 @@ _OutputCombinedSourceFilepath, _SourceWeights, ) -from como.graph import z_score_distribution as graph_zscore_distribution -from como.utils import ( - _num_columns, -) +from como.utils import LogLevel, _log_and_raise_error, _num_columns def _combine_z_distribution_for_batch( From 512fa4b03eaaeceda8e5d060c8d9f563092a4dc8 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:25:18 -0600 Subject: [PATCH 09/52] fix: set index name + single column name Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index fa04a386..042fcbaf 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -86,8 +86,8 @@ def _combine_z_distribution_for_batch( # / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_zscore_distribution.pdf", # ) - weighted_matrix.columns = ["ensembl_gene_id", batch.batch_num] - weighted_matrix.to_csv(output_combined_matrix_filepath, index=False) + weighted_matrix.columns = [batch.batch_num] + weighted_matrix.to_csv(output_combined_matrix_filepath, index=True) return weighted_matrix From c319e8e3b8793afaab3f50e8cb7e5e3085994a23 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:32:15 -0600 Subject: [PATCH 10/52] chore: more explicit variable name Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 042fcbaf..840521a7 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -94,7 +94,7 @@ def _combine_z_distribution_for_batch( def _combine_z_distribution_for_source( merged_source_data: pd.DataFrame, context_name: str, - num_replicates: int, + replicates_per_batch: Sequence[int], output_combined_matrix_filepath: Path, output_figure_filepath: Path, weighted_z_floor: int = -6, @@ -267,7 +267,7 @@ async def _begin_combining_distributions( merged_source_results: pd.DataFrame = _combine_z_distribution_for_source( merged_source_data=merged_batch_results, context_name=context_name, - num_replicates=sum(batch.num_samples for batch in batch_names[source.value]), + replicates_per_batch=replicates_per_batch, output_combined_matrix_filepath=( output_filepaths[source.value].parent / f"{context_name}_{source.value}_combined_zscore_distribution.csv" ), From 86ea37650dd28e9c6d2419272c0aa8fcda5d1b0b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:32:37 -0600 Subject: [PATCH 11/52] chore: update docstring Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 840521a7..9e5ac3e7 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -103,13 +103,14 @@ def _combine_z_distribution_for_source( """Combine z-score distributions across batches for a single source. Args: - merged_source_data: DataFrame with 'ensembl_gene_id' and batch columns. - context_name: Name of the context (e.g., tissue or condition). - num_replicates: Number of replicates (samples) for weighting. - output_combined_matrix_filepath: Path to save the combined z-score matrix. 
- output_figure_filepath: Path to save the z-score distribution figure. - weighted_z_floor: Minimum z-score value after combining. - weighted_z_ceiling: Maximum z-score value after combining. + merged_source_data: DataFrame with columns: ["ensembl_gene_id", <batch 1>, <batch 2>, ...]. Each batch column contains the within-batch combined z for that gene. + context_name: tissue or condition name + replicates_per_batch: vector of replicate counts per batch, aligned exactly to the order of + batch columns in `merged_source_data.iloc[:, 1:]`. + output_combined_matrix_filepath: filepath to write the combined z-score matrix + output_figure_filepath: filepath to write the z-score distribution figure + weighted_z_floor: lower bound to clip z-scores + weighted_z_ceiling: upper bound to clip z-scores Returns: A pandas dataframe of the weighted z-distributions From d8179bb520779a642a199ceed9182ec70773c215 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:32:47 -0600 Subject: [PATCH 12/52] chore: do not modify input dataframe inplace Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index fa04a386..042fcbaf 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -115,10 +115,12 @@ def _combine_z_distribution_for_source( Returns: A pandas dataframe of the weighted z-distributions """ - if _num_columns(merged_source_data) <= 2: - logger.warning("A single source exists, returning matrix as-is because no additional combining can be done") - merged_source_data.columns = ["ensembl_gene_id", "combine_z"] - return merged_source_data + # If only one batch column exists, return as-is (rename like R pass-through).
+ if _num_columns(merged_source_data) <= 1: + logger.warning("A single batch exists for this source; returning matrix as-is.") + out_df = merged_source_data.copy() + out_df.columns = ["combine_z"] + return out_df output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) From 838947f1e6508b760d7255de9c1ad30b1d7081e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:33:30 -0600 Subject: [PATCH 13/52] fix: use a per-replicate count for weighting instead of a single weight across the study Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 133 +++++++++++++++++------------ 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 91221ca8..c152d69f 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -125,39 +125,50 @@ def _combine_z_distribution_for_source( output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) - logger.trace(f"Found {_num_columns(merged_source_data) - 1} samples for context '{context_name}' to combine") - values = merged_source_data.iloc[:, 1:].values - mask = ~np.isnan(values) - masked_values = np.where(mask, values, 0) # Replace NaN with 0 - masked_num_replicates = np.where(mask, num_replicates, 0) - - weights = masked_num_replicates / np.sum(masked_num_replicates, axis=1, keepdims=True) - numerator = np.sum(weights * masked_values, axis=1) - denominator = np.sqrt(np.sum(weights**2, axis=1)) - weighted_matrix = numerator / denominator - weighted_matrix = np.clip(weighted_matrix, weighted_z_floor, weighted_z_ceiling) - logger.trace("Finished combining z-distribution") - # merge_df = pd.concat([merged_source_data, pd.Series(weighted_matrix, name="combined")], axis=1) - weighted_matrix = pd.DataFrame( - { - "ensembl_gene_id": merged_source_data["ensembl_gene_id"], - "combine_z": weighted_matrix, - } + # validate alignment between columns and replicate vector + batch_column_names: list[str] = list(merged_source_data.columns) + expected_num_batches = len(batch_column_names) + if expected_num_batches != len(replicates_per_batch): + raise ValueError( + f"[{context_name}] Mismatch between number of batch columns " + f"({expected_num_batches}: {batch_column_names}) and length of " + f"replicates_per_batch ({len(replicates_per_batch)}: {list(replicates_per_batch)})."
+ ) + + logger.trace( + f"[{context_name}] Combining {expected_num_batches} batches with replicate counts: {dict(zip(batch_column_names, replicates_per_batch))}" ) - # stack_df = pd.melt( - # merge_df, - # id_vars=["ensembl_gene_id"], - # value_vars=merge_df.columns[1:], # all other columns are values - # var_name="source", - # value_name="zscore", - # ) - # graph_zscore_distribution( - # df=stack_df, - # title=f"Combined Z-score Distribution for {context_name}", - # output_filepath=output_figure_filepath, - # ) - return weighted_matrix + # extract values and build masks (shape is in gene x batch) + gene_by_batch_values: np.ndarray = merged_source_data.to_numpy(dtype=float) + present_mask: np.ndarray = ~np.isnan(gene_by_batch_values) + + # per-batch weights from replicate counts, normalized over all batches + replicates_per_batch_array: np.ndarray = np.asarray(replicates_per_batch, dtype=float) + total_replicates: float = replicates_per_batch_array.sum() + if total_replicates <= 0.0: + raise ValueError(f"[{context_name}] Sum of replicates_per_batch must be > 0; got {total_replicates}.") + base_weights_per_batch: np.ndarray = replicates_per_batch_array / total_replicates # shape: (B,) + + # drop NA columns per gene by zeroing their weights + masked_weights_per_batch: np.ndarray = np.where(present_mask, base_weights_per_batch, 0.0) # (G,B) + masked_values: np.ndarray = np.where(present_mask, gene_by_batch_values, 0.0) # (G,B) + + # weighted z combine per gene: sum(w*x)/sqrt(sum(w^2)), with NA rows → NaN + numerator_per_gene: np.ndarray = np.sum(masked_weights_per_batch * masked_values, axis=1) # (G,) + denominator_per_gene: np.ndarray = np.sqrt(np.sum(masked_weights_per_batch**2, axis=1)) # (G,) + combined_z_per_gene: np.ndarray = np.divide( + numerator_per_gene, + denominator_per_gene, + out=np.full_like(numerator_per_gene, np.nan, dtype=float), + where=denominator_per_gene != 0.0, + ) + + combined_z_per_gene = np.clip(combined_z_per_gene, weighted_z_floor, weighted_z_ceiling) + + logger.trace(f"[{context_name}] Finished combining batch z-distributions for source.") + + return pd.DataFrame({"combine_z": combined_z_per_gene}, index=merged_source_data.index) def _combine_z_distribution_for_context( @@ -171,14 +182,30 @@ def _combine_z_distribution_for_context( logger.warning("No zscore results exist, returning empty dataframe") return pd.DataFrame({"ensembl_gene_id": [], "combine_z": []}) - z_matrices = [ - res.z_score_matrix.set_index("ensembl_gene_id").rename(columns=dict.fromkeys(res.z_score_matrix.columns[1:], res.type.value)) - for res in zscore_results - ] - z_matrix = pd.concat(z_matrices, axis=1, join="outer").reset_index() + z_matrices: list[pd.DataFrame] = [] + for res in zscore_results: + matrix = res.z_score_matrix.copy() + if len(matrix.columns) > 1: + _log_and_raise_error( + f"Expected a single column for combined z-score dataframe for data '{res.type.value.lower()}'. Got '{len(matrix.columns)}' columns", + error=ValueError, + level=LogLevel.ERROR, + ) + + matrix.columns = [res.type.value.lower()] + matrix = cast(pd.DataFrame, matrix.loc[matrix.index != "-"]) + + # if the matrix has duplicate ids (i.e., multiple entrez ids map to a single ensembl id, etc.) 
+ # take the mean value of them to remove the duplicates + if not matrix.index.is_unique: + # matrix = cast(pd.DataFrame, matrix.groupby(level=0).mean()) + matrix = cast(pd.DataFrame, matrix.groupby(level=0).max()) + z_matrices.append(matrix) + + z_matrix = pd.concat(z_matrices, axis="columns", join="outer", ignore_index=False) if _num_columns(z_matrix) <= 1: logger.trace(f"Only 1 source exists for '{context}', returning dataframe as-is becuase no data exists to combine") - z_matrix.columns = ["ensembl_gene_id", "combine_z"] + z_matrix.columns = ["combine_z"] return z_matrix values = z_matrix.iloc[:, 1:].values @@ -191,7 +218,7 @@ def _combine_z_distribution_for_context( numerator = np.sum(normalized_weights * masked_values, axis=1) denominator = np.sqrt(np.sum(normalized_weights**2, axis=1)) combined_z_matrix = numerator / denominator - combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling) + combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling, dtype=float) combined_z_matrix_df = pd.DataFrame( { "ensembl_gene_id": z_matrix["ensembl_gene_id"], @@ -199,21 +226,21 @@ def _combine_z_distribution_for_context( } ) - stack_df = pd.melt( - z_matrix, - id_vars=["ensembl_gene_id"], - value_vars=z_matrix.columns[1:], - var_name="source", - value_name="zscore", - ) - combined_df = pd.DataFrame( - { - "ensembl_gene_id": z_matrix["ensembl_gene_id"], - "zscore": combined_z_matrix, - "source": "combined", - } - ) - stack_df = pd.concat([stack_df, combined_df]) + # combined_df = pd.DataFrame( + # { + # "ensembl_gene_id": z_matrix.index, + # "zscore": combined_z_matrix, + # "source": "combined", + # } + # ) + # stack_df = pd.melt( + # z_matrix, + # id_vars=["ensembl_gene_id"], + # value_vars=z_matrix.columns[1:], + # var_name="source", + # value_name="zscore", + # ) + # stack_df = pd.concat([stack_df, combined_df]) # graph_zscore_distribution( # df=stack_df, # title=f"Combined Z-score Distribution for {context}", From ae0797f8ff8e8273ac11243bc0b6f62ffce1a854 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:33:49 -0600 Subject: [PATCH 14/52] fix: use index value for ensembl ids Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index c152d69f..42c34354 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -221,7 +221,7 @@ def _combine_z_distribution_for_context( combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling, dtype=float) combined_z_matrix_df = pd.DataFrame( { - "ensembl_gene_id": z_matrix["ensembl_gene_id"], + "ensembl_gene_id": z_matrix.index, "combine_z": combined_z_matrix, } ) From f423d1bc9baafa6ca5eba4a99c998777ed39c180 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:34:28 -0600 Subject: [PATCH 15/52] refactor: do not use async Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 57 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 42c34354..9b87ea47 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -272,27 +272,54 @@ async def _begin_combining_distributions( logger.critical(f"Invalid source; got '{source.value}', expected 'trna', 'mrna', 'scrna', or 'proteomics'.") raise ValueError("Invalid 
source") - batch_results = await asyncio.gather( - *[ + batch_results: list[pd.DataFrame] = [] + for batch in batch_names[source.value]: + batch: _BatchEntry + matrix_subset = cast(pd.DataFrame, matrix[[GeneIdentifier.ensembl_gene_id.value, *batch.sample_names]]) + matrix_subset = matrix_subset.set_index(keys=[GeneIdentifier.ensembl_gene_id.value], drop=True) + + output_fp = output_filepaths[source.value].parent / f"{source.value}_batch{batch.batch_num}_combined_z_distrobution_{context_name}.csv" + batch_results.append( _combine_z_distribution_for_batch( context_name=context_name, batch=batch, - matrix=matrix[[GeneIdentifier.ENSEMBL_GENE_ID.value, *batch.sample_names]], + matrix=matrix_subset, source=source, - output_combined_matrix_filepath=( - output_filepaths[source.value].parent / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_z_distribution.csv" - ), + output_combined_matrix_filepath=output_fp, output_figure_dirpath=output_figure_dirpath, weighted_z_floor=weighted_z_floor, weighted_z_ceiling=weighted_z_ceiling, ) - for batch in batch_names[source.value] - ] - ) + ) - merged_batch_results = pd.DataFrame() - for df in batch_results: - merged_batch_results = df if merged_batch_results.empty else merged_batch_results.merge(df, on="ensembl_gene_id", how="outer") + index_name: str = ( + "ensembl_gene_id" + if all(df.index.name == "ensembl_gene_id" for df in batch_results) + else "entrez_gene_id" + if all(df.index.name == "entrez_gene_id" for df in batch_results) + else "gene_symbol" + if all(df.index.name == "gene_symbol" for df in batch_results) + else "" + ) + if not index_name: + _log_and_raise_error( + f"Unable to find common gene identifier across batches for source '{source.value}' in context '{context_name}'", + error=ValueError, + level=LogLevel.ERROR, + ) + merged_batch_results = pd.concat(batch_results, axis="columns") + merged_batch_results.index.name = index_name + + replicates_by_batch_num_map: dict[str, int] = {str(batch.batch_num): batch.num_samples for batch in batch_names[source.value]} + batch_column_names: list[str] = list(merged_batch_results.columns) + replicates_per_batch: list[int] = [] + for col in batch_column_names: + key = str(col) + if key not in replicates_by_batch_num_map: + raise KeyError( + f"Missing replicate count for batch column '{col}' in source '{source.value}'. 
Known: {list(replicates_by_batch_num_map.keys())}" + ) + replicates_per_batch.append(replicates_by_batch_num_map[key]) merged_source_results: pd.DataFrame = _combine_z_distribution_for_source( merged_source_data=merged_batch_results, @@ -312,7 +339,7 @@ async def _begin_combining_distributions( weight=source_weights[source.value], ) ) - merged_source_results.to_csv(output_filepaths[source.value], index=False) + merged_source_results.to_csv(output_filepaths[source.value], index=True) logger.success(f"Wrote z-scores for source '{source.value}' in context '{context_name}' to '{output_filepaths[source.value]}'") logger.trace(f"Combining z-score distributions for all sources in context '{context_name}'") @@ -321,5 +348,5 @@ async def _begin_combining_distributions( zscore_results=z_score_results, output_graph_filepath=output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf", ) - merged_context_results.to_csv(output_final_model_scores, index=False) - logger.success(f"Finished combining z-scores for context '{context_name}'") + merged_context_results.to_csv(output_final_model_scores, index=True) + logger.success(f"Wrote combined z-scores for context '{context_name}' to {output_final_model_scores}") From 7f99af70be47703884869eced5a5dd05b1ec89d8 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:34:37 -0600 Subject: [PATCH 16/52] refactor: do not use async Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 225 ++++++++++++--------- 1 file changed, 124 insertions(+), 101 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 8985603a..cd771c4b 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -1,12 +1,14 @@ from __future__ import annotations +import asyncio import collections +import os import re import sys -from collections.abc import Sequence +from collections.abc import Coroutine, Sequence from io import TextIOWrapper from pathlib import Path -from typing import Literal, TextIO, cast +from typing import Any, Literal, TextIO, cast import cobra import cobra.util.array @@ -15,14 +17,16 @@ import pandas as pd from cobra import Model from cobra.flux_analysis import pfba +from fast_bioservices.common import Taxon +from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol from loguru import logger from troppo.methods.reconstruction.fastcore import FASTcore, FastcoreProperties from troppo.methods.reconstruction.gimme import GIMME, GIMMEProperties from troppo.methods.reconstruction.imat import IMAT, IMATProperties from troppo.methods.reconstruction.tINIT import tINIT, tINITProperties -from como.data_types import Algorithm, BoundaryReactions, BuildResults, CobraCompartments, LogLevel, Solver -from como.utils import _log_and_raise_error, read_file, set_up_logging, split_gene_expression_data +from como.data_types import Algorithm, CobraCompartments, LogLevel, Solver, _BoundaryReactions, _BuildResults +from como.utils import _log_and_raise_error, _read_file, _set_up_logging, split_gene_expression_data def _correct_bracket(rule: str, name: str) -> str: @@ -287,6 +291,7 @@ async def _map_expression_to_reaction( recon_algorithm: Algorithm, low_thresh: float, high_thresh: float, + taxon: int | str | Taxon, ) -> collections.OrderedDict[str, int]: """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules. 
@@ -298,6 +303,7 @@ async def _map_expression_to_reaction( recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT) low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) + taxon: Taxon ID or Taxon object for gene ID conversion. Returns: An ordered dictionary mapping reaction IDs to their corresponding expression values. @@ -305,7 +311,7 @@ async def _map_expression_to_reaction( Raises: ValueError: If neither 'entrez_gene_id' nor 'ensembl_gene_id' columns are found in the gene expression file. """ - expression_data = await read_file(gene_expression_file) + expression_data = await _read_file(gene_expression_file) identifier_column = next((col for col in ("entrez_gene_id", "ensembl_gene_id") if col in expression_data.columns), "") if not identifier_column: @@ -317,6 +323,14 @@ async def _map_expression_to_reaction( identifier_column=cast(Literal["ensembl_gene_id", "entrez_gene_id"], identifier_column), recon_algorithm=recon_algorithm, ) + + # convert ensembl IDs to entrez ids to map expression data to reactions in the reference model + if identifier_column == "ensembl_gene_id": + conversion = await ensembl_to_gene_id_and_symbol(ids=gene_activity.index.tolist(), taxon=taxon) + gene_activity = gene_activity.merge(conversion, how="left", left_index=True, right_on="ensembl_gene_id") + gene_activity = gene_activity.set_index(keys=["entrez_gene_id"]) + gene_activity = gene_activity.drop(columns=["ensembl_gene_id", "gene_symbol"]) + reaction_expression = collections.OrderedDict() # fmt: off @@ -343,7 +357,7 @@ async def _map_expression_to_reaction( activity = gene_activity.at[gene_id, "active"] if gene_id in gene_activity.index else f"{default_expression!s}" # replace gene_id with activity, using optional whitespace before and after the gene id # Do not replace the whitespace (if it exists) before and after the gene ID - gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=activity, string=gene_reaction_rule) + gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=str(activity), string=str(gene_reaction_rule)) try: # We are using eval here because ast.literal_eval is unable to process an evaluable such as `max(-4, 0, 1)` @@ -359,7 +373,24 @@ async def _map_expression_to_reaction( return reaction_expression -async def _build_model( # noqa: C901 +def _read_reference_model(filepath: Path) -> cobra.Model: + match filepath.suffix: + case ".mat": + reference_model = cobra.io.load_matlab_model(filepath) + case ".xml" | ".sbml": + reference_model = cobra.io.read_sbml_model(filepath) + case ".json": + reference_model = cobra.io.load_json_model(filepath) + case _: + _log_and_raise_error( + f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'", + error=ValueError, + level=LogLevel.ERROR, + ) + return reference_model + + +async def _build_model( general_model_file: Path, gene_expression_file: Path, recon_algorithm: Algorithm, @@ -373,9 +404,10 @@ async def _build_model( # noqa: C901 low_thresh: float, high_thresh: float, output_flux_result_filepath: Path, + taxon: int | str | Taxon, *, force_boundary_rxn_inclusion: bool, -) -> BuildResults: +) -> _BuildResults: """Seed a context specific reference_model. Core reactions are determined from GPR associations with gene expression logicals. 
@@ -397,25 +429,13 @@ async def _build_model( # noqa: C901 low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) output_flux_result_filepath: Path to save flux results (for iMAT only) + taxon: Taxon ID or Taxon object for gene ID conversion. force_boundary_rxn_inclusion: If True, ensure that all boundary reactions are included in the final model. Returns: A _BuildResults object containing the context-specific model, list of expression indices used, and a DataFrame of infeasible reactions. """ - reference_model: cobra.Model - match general_model_file.suffix: - case ".mat": - reference_model = cobra.io.load_matlab_model(general_model_file) - case (".xml", ".sbml"): - reference_model = cobra.io.read_sbml_model(general_model_file) - case ".json": - reference_model = cobra.io.load_json_model(general_model_file) - case _: - _log_and_raise_error( - f"Reference model format must be .xml, .mat, or .json; found '{general_model_file.suffix}'", - error=ValueError, - level=LogLevel.ERROR, - ) + reference_model: cobra.Model = _read_reference_model(general_model_file) if objective not in force_reactions: force_reactions.append(objective) @@ -426,21 +446,19 @@ async def _build_model( # noqa: C901 # inconsistent_reactions, cobra_model = _feasibility_test(cobra_model, "before_seeding") inconsistent_reactions = [] s_matrix = cobra.util.array.create_stoichiometric_matrix(reference_model, array_type="dense") - lower_bounds = [] - upper_bounds = [] - reaction_ids = [] - for reaction in reference_model.reactions: + lower_bounds: list[int] = [] + upper_bounds: list[int] = [] + reaction_ids: list[str] = [] + for i, reaction in enumerate(reference_model.reactions): + # if reaction.id in boundary_reactions: + # lower_bounds.append() lower_bounds.append(reaction.lower_bound) upper_bounds.append(reaction.upper_bound) reaction_ids.append(reaction.id) # get expressed reactions reaction_expression: collections.OrderedDict[str, int] = await _map_expression_to_reaction( - reference_model, - gene_expression_file, - recon_algorithm, - high_thresh=high_thresh, - low_thresh=low_thresh, + reference_model, gene_expression_file, recon_algorithm, high_thresh=high_thresh, low_thresh=low_thresh, taxon=taxon ) expression_vector: npt.NDArray[float] = np.array(list(reaction_expression.values()), dtype=float) @@ -483,6 +501,16 @@ async def _build_model( # noqa: C901 expression_threshold = (low_thresh, high_thresh) match recon_algorithm: + case Algorithm.IMAT: + context_model_cobra: cobra.Model = _build_with_imat( + reference_model=reference_model, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + expr_vector=expression_vector, + expr_thresh=expression_threshold, + force_reaction_indices=force_reaction_indices, + solver=solver, + ) case Algorithm.GIMME: context_model_cobra: cobra.Model = _build_with_gimme( reference_model=reference_model, @@ -500,22 +528,12 @@ async def _build_model( # noqa: C901 exp_idx_list=expression_vector_indices, solver=solver, ) - case Algorithm.IMAT: - context_model_cobra: cobra.Model = _build_with_imat( - reference_model=reference_model, - lower_bounds=lower_bounds, - upper_bounds=upper_bounds, - expr_vector=expression_vector, - expr_thresh=expression_threshold, - force_gene_indices=force_reaction_indices, - solver=solver, - ) context_model_cobra.objective = objective flux_sol: cobra.Solution = context_model_cobra.optimize() fluxes: pd.Series = flux_sol.fluxes model_reactions: list[str] = 
[reaction.id for reaction in context_model_cobra.reactions] reaction_intersections: set[str] = set(fluxes.index).intersection(model_reactions) - flux_df: pd.DataFrame = fluxes[~fluxes.index.isin(reaction_intersections)] + flux_df: pd.DataFrame = cast(pd.DataFrame, fluxes[~fluxes.index.isin(reaction_intersections)]) flux_df.dropna(inplace=True) flux_df.to_csv(output_flux_result_filepath) case Algorithm.TINIT: @@ -549,7 +567,7 @@ async def _build_model( # noqa: C901 axis=0, ) - return BuildResults( + return _BuildResults( model=context_model_cobra, expression_index_list=expression_vector_indices, infeasible_reactions=inconsistent_and_infeasible_reactions, @@ -559,7 +577,7 @@ async def _build_model( # noqa: C901 async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: if path.suffix not in {".csv", ".tsv"}: raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'") - df: pd.DataFrame = await read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) + df: pd.DataFrame = await _read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) if not isinstance(df, pd.DataFrame): _log_and_raise_error( @@ -573,7 +591,7 @@ async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.Dat return df -async def _collect_boundary_reactions(path: Path) -> BoundaryReactions: +async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: df: pd.DataFrame = await _create_df(path, lowercase_col_names=True) for column in df.columns: if column not in [ @@ -609,39 +627,48 @@ async def _collect_boundary_reactions(path: Path) -> BoundaryReactions: shorthand_compartment = CobraCompartments.get_shorthand(reaction_compartment[i]) reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" - return BoundaryReactions( + return _BoundaryReactions( reactions=reactions, lower_bounds=df["minimum reaction rate"].tolist(), upper_bounds=df["maximum reaction rate"].tolist(), ) -def _write_model_to_disk(context_name: str, model: cobra.Model, output_filepaths: list[Path]) -> None: +async def _write_model_to_disk( + context_name: str, + model: cobra.Model, + output_filepaths: list[Path], + mat_suffix: set[str], + json_suffix: set[str], + xml_suffix: set[str], +) -> None: + tasks: set[Coroutine[Any, Any, None]] = set() for path in output_filepaths: path.parent.mkdir(parents=True, exist_ok=True) - if path.suffix == ".mat": - cobra.io.save_matlab_model(model=model, file_name=path) - elif path.suffix == ".json": - cobra.io.save_json_model(model=model, filename=path, pretty=True) - elif path.suffix in {".sbml", ".xml"}: - cobra.io.write_sbml_model(cobra_model=model, filename=path) + if path.suffix in mat_suffix: + tasks.add(asyncio.to_thread(cobra.io.save_matlab_model, model=model, file_name=path)) + elif path.suffix in json_suffix: + tasks.add(asyncio.to_thread(cobra.io.save_json_model, model=model, filename=path, pretty=True)) + elif path.suffix in xml_suffix: + tasks.add(asyncio.to_thread(cobra.io.write_sbml_model, model=model, filename=path)) else: _log_and_raise_error( f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. 
Got '{path.suffix}'", error=ValueError, level=LogLevel.ERROR, ) - logger.success(f"Saved metabolic model for context '{context_name}' to '{path}'") + logger.success(f"Will save metabolic model for context '{context_name}' to: '{path}'") + await asyncio.gather(*tasks) async def create_context_specific_model( # noqa: C901 context_name: str, + taxon: int | str | Taxon, reference_model: Path, active_genes_filepath: Path, output_infeasible_reactions_filepath: Path, output_flux_result_filepath: Path, output_model_filepaths: Path | list[Path], - output_filetypes: list[str] | None = None, output_fastcore_expression_index_filepath: Path | None = None, objective: str = "biomass_reaction", boundary_rxns_filepath: str | Path | None = None, @@ -660,12 +687,12 @@ async def create_context_specific_model( # noqa: C901 Args: context_name: Name of the context-specific model. + taxon: NCBI taxonomy ID or name for the organism of interest. reference_model: Path to the general genome-scale metabolic model file (.xml, .mat, or .json). active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel). output_infeasible_reactions_filepath: Path to save infeasible reactions (csv). output_flux_result_filepath: Path to save flux results (csv). output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json). - output_filetypes: List of file types to save the model as ('xml', 'mat', 'json'). output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). Required if using Fastcore. objective: Objective function reaction ID. boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel). @@ -682,22 +709,9 @@ async def create_context_specific_model( # noqa: C901 Raises: ImportError: If Gurobi solver is selected but gurobipy is not installed. """ + _set_up_logging(level=log_level, location=log_location) boundary_rxns_filepath: Path | None = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None - set_up_logging(level=log_level, location=log_location) output_model_filepaths = [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths - for path in output_model_filepaths: - if path.suffix not in {".mat", ".xml", ".sbml", ".json"}: - _log_and_raise_error( - f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. Got '{path.suffix}'", - error=ValueError, - level=LogLevel.ERROR, - ) - if len(output_model_filepaths) != len(output_model_filepaths): - _log_and_raise_error( - "The number of output model filepaths must be the same as the number of output flux result filepaths", - error=ValueError, - level=LogLevel.ERROR, - ) if not reference_model.exists(): _log_and_raise_error( @@ -724,15 +738,6 @@ async def create_context_specific_model( # noqa: C901 level=LogLevel.ERROR, ) - output_filetypes = ["mat"] if output_filetypes is None else output_filetypes - for output_type in output_filetypes: - if output_type not in {"xml", "mat", "json"}: - _log_and_raise_error( - f"Output file type {output_type} not recognized. Must be one of: 'xml', 'mat', 'json'", - error=ValueError, - level=LogLevel.ERROR, - ) - if algorithm not in Algorithm: _log_and_raise_error( f"Algorithm {algorithm} not supported. 
Use one of {', '.join(a.value for a in Algorithm)}",
             error=ValueError,
             level=LogLevel.ERROR,
         )
@@ -747,6 +752,16 @@ async def create_context_specific_model(  # noqa: C901
             level=LogLevel.ERROR,
         )
 
+    mat_suffix, json_suffix, xml_suffix = {".mat"}, {".json"}, {".sbml", ".xml"}
+    if any(path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix} for path in output_model_filepaths):
+        invalid_suffix = "\n".join(str(path) for path in output_model_filepaths if path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix})
+        _log_and_raise_error(
+            f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}",
+            error=ValueError,
+            level=LogLevel.ERROR,
+        )
+
+
     boundary_reactions = None
     if boundary_rxns_filepath:
         boundary_reactions = await _collect_boundary_reactions(boundary_rxns_filepath)
@@ -776,42 +791,43 @@ async def create_context_specific_model(  # noqa: C901
 
     # Test that gurobi is using a valid license file
     if solver == Solver.GUROBI:
-        # test if gurobi is available
-        try:
-            import gurobipy as gp
-        except ImportError as e:
-            logger.error(
-                "The gurobi solver requires the gurobipy package to be installed. "
-                "Please install gurobipy and try again. "
-                "This can be done by installing the 'gurobi' optional dependency."
+        from importlib.util import find_spec
+
+        gurobi_present = find_spec("gurobipy")
+        if not gurobi_present:
+            _log_and_raise_error(
+                message=(
+                    "The gurobi solver requires the gurobipy package to be installed. "
+                    "Please install gurobipy and try again. "
+                    "This can be done by installing the 'gurobi' optional dependency."
+                ),
+                error=ImportError,
+                level=LogLevel.ERROR,
             )
-            raise ImportError from e
 
-        env = gp.Env()
-        if env.getParam("WLSACCESSID") == "" or env.getParam("WLSSECRET") == "":
+        if not Path(f"{os.environ['HOME']}/gurobi.lic").exists():
             logger.critical(
                 "Gurobi solver requested, but license information cannot be found. "
                 "COMO will continue, but it is HIGHLY unlikely the resulting model will be valid."
) - # remove gurobi-related information, it is no longer required - del env, gp - logger.info(f"Creating '{context_name}' model using '{algorithm.value}' reconstruction and '{solver.value}' solver") - build_results: BuildResults = await _build_model( + logger.info(f"context='{context_name}', reconstruction method='{algorithm.value}', solver='{solver.value}'") + build_results: _BuildResults = await _build_model( general_model_file=reference_model, gene_expression_file=active_genes_filepath, recon_algorithm=algorithm, objective=objective, - boundary_reactions=boundary_reactions.reactions, + boundary_reactions=boundary_reactions.reactions if boundary_reactions else [], exclude_reactions=exclude_rxns, force_reactions=force_rxns, - lower_bounds=boundary_reactions.lower_bounds, - upper_bounds=boundary_reactions.upper_bounds, + lower_bounds=boundary_reactions.lower_bounds if boundary_reactions else [], + upper_bounds=boundary_reactions.upper_bounds if boundary_reactions else [], solver=solver.value.lower(), low_thresh=low_threshold, high_thresh=high_threshold, output_flux_result_filepath=output_flux_result_filepath, force_boundary_rxn_inclusion=force_boundary_rxn_inclusion, + taxon=taxon, ) build_results.infeasible_reactions.dropna(inplace=True) @@ -822,7 +838,14 @@ async def create_context_specific_model( # noqa: C901 fastcore_df.dropna(inplace=True) fastcore_df.to_csv(output_fastcore_expression_index_filepath, index=False) - _write_model_to_disk(context_name=context_name, model=build_results.model, output_filepaths=output_model_filepaths) - logger.debug(f"Number of Genes: {len(build_results.model.genes):,}") - logger.debug(f"Number of Metabolites: {len(build_results.model.metabolites):,}") - logger.debug(f"Number of Reactions: {len(build_results.model.reactions):,}") + await _write_model_to_disk( + context_name=context_name, + model=build_results.model, + output_filepaths=output_model_filepaths, + mat_suffix=mat_suffix, + json_suffix=json_suffix, + xml_suffix=xml_suffix, + ) + logger.info( + f"context={context_name}, reactions={len(build_results.model.reactions)}, genes={len(build_results.model.genes)}, metabolites={len(build_results.model.metabolites)}" + ) From 926c1393a01da47e748710bc59f463a323169368 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:34:57 -0600 Subject: [PATCH 17/52] refactor: use lowercase identifier names; provide default values Signed-off-by: Josh Loecker --- main/como/data_types.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/main/como/data_types.py b/main/como/data_types.py index 8b439d47..a004f862 100644 --- a/main/como/data_types.py +++ b/main/como/data_types.py @@ -40,9 +40,9 @@ class FilteringTechnique(str, Enum): class GeneIdentifier(str, Enum): - ENSEMBL_GENE_ID = "ENSEMBL_GENE_ID" - ENTREZ_GENE_ID = "ENTREZ_GENE_ID" - GENE_SYMBOL = "GENE_SYMBOL" + ensembl_gene_id = "ensembl_gene_id" + entrez_gene_id = "entrez_gene_id" + gene_symbol = "gene_symbol" class LogLevel(int, Enum): @@ -72,15 +72,15 @@ class Solver(str, Enum): class SourceTypes(str, Enum): - TRNA = "TRNA" - MRNA = "MRNA" - SCRNA = "SCRNA" - PROTEOMICS = "PROTEOMICS" + trna = "trna" + mrna = "mrna" + scrna = "scrna" + proteomics = "proteomics" class PeakIdentificationParameters(NamedTuple): - height: float - distance: float + height: float = 0.02 + distance: float = 1.0 class CobraCompartments: From 4149eb95dc862913e756f83d3030c4d7f77f8306 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:35:17 -0600 Subject: [PATCH 18/52] 
refactor: import required modules Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index a89219ef..3c3d4f43 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import re import sys from pathlib import Path from typing import TextIO, cast @@ -10,9 +11,7 @@ from fast_bioservices.biothings.mygene import MyGene from loguru import logger -from como.combine_distributions import ( - _begin_combining_distributions, -) +from como.combine_distributions import _begin_combining_distributions from como.data_types import ( AdjustmentMethod, LogLevel, From f56b85312fbbe52233f737218b6e478de92dfe85 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:35:42 -0600 Subject: [PATCH 19/52] fix: use `.loc[]` to prevent copy-on-write warning Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 3c3d4f43..f1bac265 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -106,7 +106,7 @@ def _merge_logical_table(df: pd.DataFrame): """ # step 1: get all plural ENTREZ_GENE_IDs in the input table, extract unique IDs df.dropna(subset=["entrez_gene_id"], inplace=True) - df["entrez_gene_id"] = df["entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str) + df.loc[:, "entrez_gene_id"] = df.loc[:, "entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str) id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist() # Collect "single" ids, like "123" multiple_entrez_ids: list[str] = df.loc[ @@ -287,6 +287,8 @@ async def _merge_xomics( expression_list.append(expressed_sourcetype) high_confidence_list.append(high_expressed_sourcetype) matrix.rename(columns={"expressed": expressed_sourcetype, "high": high_expressed_sourcetype}, inplace=True) + matrix = cast(pd.DataFrame, matrix[matrix["entrez_gene_id"] != "-"]) + matrix.loc[:, "entrez_gene_id"] = matrix.loc[:, "entrez_gene_id"].astype(int) merge_data = matrix if merge_data.empty else merge_data.merge(matrix, on="entrez_gene_id", how="outer") logger.trace(f"Shape of merged data before merging logical tables: {merge_data.shape}") From 6105b98c16cdda6c4289f168eaabe70973c28584 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:36:01 -0600 Subject: [PATCH 20/52] fix: explicitly check column names Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index f1bac265..91bd83c4 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -367,16 +367,25 @@ async def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> if matrix is not None: # fmt: off existing_data = ( - "gene_symbol" if "gene_symbol" in matrix - else "entrez_gene_id" if "entrez_gene_id" in matrix + "gene_symbol" if "gene_symbol" in matrix.columns + else "entrez_gene_id" if "entrez_gene_id" in matrix.columns else "ensembl_gene_id" ) # fmt: on logger.trace(f"Merging conversion data for {matrix_name}, existing id column is: {existing_data}") + # input_matrices[matrix_name][existing_data] = input_matrices[matrix_name][existing_data].astype(str) + if existing_data == "entrez_gene_id": + 
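+                # Entrez IDs may be read in as strings; the following lines cast both merge keys
+                # (the matrix column and the conversion index) to int so the left_on/right_index join aligns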
input_matrices[matrix_name]["entrez_gene_id"] = input_matrices[matrix_name]["entrez_gene_id"].astype(int) + conversion.index = conversion.index.astype(int) + input_matrices[matrix_name] = ( - input_matrices[matrix_name].merge(conversion, how="left", on=[existing_data]).dropna().reset_index(drop=True) + input_matrices[matrix_name].merge(conversion, how="left", left_on=existing_data, right_index=True).reset_index(drop=True) ) + # input_matrices[matrix_name] = ( + # input_matrices[matrix_name].merge(conversion, how="left", left_index=True, right_index=True).dropna().reset_index(drop=True) + # ) + logger.debug("Updated missing genomic data") return input_matrices From 1219cc40b23f128731c62f3baced4e93dccd362f Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:36:18 -0600 Subject: [PATCH 21/52] refactor: raise error if adjustment method not found Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 91bd83c4..4580b0eb 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -450,6 +450,12 @@ async def _process( adjusted_expression_requirement = expression_requirement - (4 - num_sources) elif adjust_method == AdjustmentMethod.FLAT: adjusted_expression_requirement = expression_requirement + else: + _log_and_raise_error( + message=f"Unknown `adjust_method`: {adjust_method}.", + error=ValueError, + level=LogLevel.ERROR, + ) logger.debug(f"Adjusted expression requirement: {adjusted_expression_requirement}") if adjusted_expression_requirement != expression_requirement: From 9c7175473e870b38f9f896f2d16f9a8027ebfe3d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:37:02 -0600 Subject: [PATCH 22/52] refactor: validate batch in loop Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 4580b0eb..381f66d6 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -513,7 +513,16 @@ def _build_batches( continue metadata: pd.DataFrame # Re-assign type to assist in type hinting - for batch_num, study in enumerate(sorted(metadata["study"].unique()), start=1): + for study in sorted(metadata["study"].unique()): + batch_search = re.search(r"\d+", study) + if not batch_search: + _log_and_raise_error( + message=f"Unable to find batch number in study name. 
Expected a digit in the study value: {study}", + error=ValueError, + level=LogLevel.ERROR, + ) + + batch_num = int(batch_search.group(0)) # ty: ignore[possibly-missing-attribute] study_sample_names = metadata[metadata["study"] == study]["sample_name"].tolist() batch_names[source.value].append(_BatchEntry(batch_num=batch_num, sample_names=study_sample_names)) logger.debug(f"Found {len(study_sample_names)} sample names for study '{study}', batch number {batch_num}") From 4142156e8a11ea0b457ce611bf54e3f8bc561002 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:37:11 -0600 Subject: [PATCH 23/52] refactor: use lowercase names Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 381f66d6..87b5ba62 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -596,10 +596,10 @@ async def merge_xomics( # noqa: C901 # fmt: off source_data = { - SourceTypes.TRNA: (trna_matrix_or_filepath, trna_boolean_matrix_or_filepath, trna_metadata_filepath_or_df, output_trna_activity_filepath), - SourceTypes.MRNA: (mrna_matrix_or_filepath, mrna_boolean_matrix_or_filepath, mrna_metadata_filepath_or_df, output_mrna_activity_filepath), - SourceTypes.SCRNA: (scrna_matrix_or_filepath, scrna_boolean_matrix_or_filepath, scrna_metadata_filepath_or_df, output_scrna_activity_filepath), # noqa: E501 - SourceTypes.PROTEOMICS: (proteomic_matrix_or_filepath, proteomic_boolean_matrix_or_filepath, proteomic_metadata_filepath_or_df, output_proteomic_activity_filepath), # noqa: E501 + SourceTypes.trna: (trna_matrix_or_filepath, trna_boolean_matrix_or_filepath, trna_metadata_filepath_or_df, output_trna_activity_filepath), + SourceTypes.mrna: (mrna_matrix_or_filepath, mrna_boolean_matrix_or_filepath, mrna_metadata_filepath_or_df, output_mrna_activity_filepath), + SourceTypes.scrna: (scrna_matrix_or_filepath, scrna_boolean_matrix_or_filepath, scrna_metadata_filepath_or_df, output_scrna_activity_filepath), # noqa: E501 + SourceTypes.proteomics: (proteomic_matrix_or_filepath, proteomic_boolean_matrix_or_filepath, proteomic_metadata_filepath_or_df, output_proteomic_activity_filepath), # noqa: E501 } # fmt: on for source in source_data: From 89f05aa35d5e3253efa4050e82c4f66be8a6674e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:37:42 -0600 Subject: [PATCH 24/52] refactor: import required modules Signed-off-by: Josh Loecker --- main/como/proteomics_gen.py | 8 +++++--- main/como/rnaseq_gen.py | 8 ++++---- main/como/rnaseq_preprocess.py | 3 ++- main/como/utils.py | 1 + 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py index 012486a4..e0d9b078 100644 --- a/main/como/proteomics_gen.py +++ b/main/como/proteomics_gen.py @@ -1,15 +1,17 @@ from __future__ import annotations +import itertools import sys -from io import TextIOWrapper from pathlib import Path +from typing import TextIO, cast import numpy as np import pandas as pd -from fast_bioservices.biodbnet import BioDBNet, Input, Output +from bioservices import BioDBNet +from fast_bioservices.biodbnet import Input, Output # BioDBNet from loguru import logger -from como.data_types import LOG_FORMAT, LogLevel +from como.data_types import LogLevel from como.project import Config from como.proteomics_preprocessing import protein_transform_main from como.utils import _log_and_raise_error, _set_up_logging, return_placeholder_data diff --git 
a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 99121f8f..bad4f70e 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -1,5 +1,6 @@ from __future__ import annotations +import itertools import multiprocessing import sys import time @@ -8,9 +9,8 @@ from concurrent.futures import Future, ProcessPoolExecutor, as_completed from dataclasses import dataclass, field from enum import Enum -from io import TextIOWrapper from pathlib import Path -from typing import NamedTuple +from typing import NamedTuple, TextIO, cast import matplotlib.pyplot as plt import numpy as np @@ -22,11 +22,11 @@ from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id from loguru import logger from pandas import DataFrame -from scipy.signal import find_peaks -from sklearn.neighbors import KernelDensity from como.data_types import FilteringTechnique, LogLevel, PeakIdentificationParameters, RNAType +from como.density import density from como.migrations import gene_info_migrations +from como.peak_finder import find_peaks from como.project import Config from como.utils import _log_and_raise_error, _num_columns, _read_file, _set_up_logging diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 142be019..5ef58fb9 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import functools import json import re import sys @@ -14,7 +15,7 @@ import numpy as np import pandas as pd from fast_bioservices.biothings.mygene import MyGene -from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id +from fast_bioservices.pipeline import gene_symbol_to_ensembl_and_gene_id from loguru import logger from como.data_types import PATH_TYPE, LogLevel, RNAType diff --git a/main/como/utils.py b/main/como/utils.py index ececc902..05629032 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -2,6 +2,7 @@ import contextlib import io +import itertools import sys from collections.abc import Iterator from pathlib import Path From 60a883a81a1ede81b768799c47064d0d91c91b94 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:39:17 -0600 Subject: [PATCH 25/52] feat: replicate R's zFPKM module Signed-off-by: Josh Loecker --- main/como/approx.py | 310 +++++++++++++++++++++++++++++++++++++ main/como/density.py | 325 +++++++++++++++++++++++++++++++++++++++ main/como/peak_finder.py | 191 +++++++++++++++++++++++ main/como/rnaseq_gen.py | 116 ++++---------- 4 files changed, 859 insertions(+), 83 deletions(-) create mode 100644 main/como/approx.py create mode 100644 main/como/density.py create mode 100644 main/como/peak_finder.py diff --git a/main/como/approx.py b/main/como/approx.py new file mode 100644 index 00000000..3ef69a74 --- /dev/null +++ b/main/como/approx.py @@ -0,0 +1,310 @@ +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import Any + +import numpy as np + + +def _coerce_to_float_array(a): + """Helper to ensure input is a 1D float array.""" + arr = np.asarray(a, dtype=float) + if arr.ndim != 1: + arr = arr.ravel() + return arr + + +def _regularize_values(x: np.ndarray, y: np.ndarray, ties, na_rm: bool) -> dict[str, Any]: + """Minimal reimplementation of R's regularize.values() used by approx(). + + - Removes NA pairs if na_rm is True (else requires x to have no NA). + - Sorts by x (stable). + - Collapses duplicate x via `ties` aggregator. 
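+    - For example (illustrative values): with ``ties="mean"``, x = [2, 1, 2, 3] and y = [20, 10, 40, 30]
+      are regularized to x = [1, 2, 3] and y = [10, 30, 30].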
+ Returns dict with: + x: sorted unique x + y: aggregated y aligned with x + not_na: boolean mask of y that are not NaN + kept_na: True iff any NaN remained in y after regularization + """ + if na_rm: + # Remove pairs where x or y is NA + keep = ~np.isnan(x) & ~np.isnan(y) + x = x[keep] + y = y[keep] + kept_na = False + else: + # R's C code errors if na_rm=False and any x is NA + if np.isnan(x).any(): + raise ValueError("approx(x,y, .., na.rm=False): NA values in x are not allowed") + kept_na = np.isnan(y).any() + + if x.size == 0: + # THIS IS THE CORRECTED LINE: + return {"x": x, "y": y, "not_na": np.array([], dtype=bool), "kept_na": kept_na} + + # Use a stable sort (mergesort) to match R's order() + order = np.argsort(x, kind="mergesort") + x_sorted = x[order] + y_sorted = y[order] + + # Handle the 'ties' argument + if callable(ties): + agg_fn = ties + else: + ties_str = "mean" if ties is None else str(ties).lower() + if ties_str in ("mean", "avg", "average"): + agg_fn = np.mean + elif ties_str in ("first", "left"): + + def agg_fn(a): + return a[0] + elif ties_str in ("last", "right"): + + def agg_fn(a): + return a[-1] + elif ties_str == "min": + agg_fn = np.min + elif ties_str == "max": + agg_fn = np.max + elif ties_str == "median": + agg_fn = np.median + elif ties_str == "sum": + agg_fn = np.sum + else: + raise ValueError("Unsupported `ties`; use a callable or one of 'mean', 'first', 'last', 'min', 'max', 'median', 'sum'.") + + # Find unique x values and their indices/counts + unique_x, start_idx, counts = np.unique(x_sorted, return_index=True, return_counts=True) + + # Aggregate y values for tied x values + y_agg = np.empty(unique_x.shape, dtype=float) + for k, (start, count) in enumerate(zip(start_idx, counts, strict=True)): + seg = y_sorted[start : start + count] + # Note: aggregators like np.mean will return NaN if `seg` contains NaN, + # matching R's default (mean(..., na.rm = FALSE)). + y_agg[k] = agg_fn(seg) + + not_na = ~np.isnan(y_agg) + # Check if any NaNs remain in the *aggregated* y values + kept_na_final = np.any(~not_na) if np.any(np.isnan(y_agg)) else False + return {"x": unique_x, "y": y_agg, "not_na": not_na, "kept_na": kept_na_final} + + +def approx( + x: Sequence[float], + y: Sequence[float] | None = None, + xout: Sequence[float] | None = None, + method: str | int = "linear", + n: int = 50, + yleft: float | None = None, + yright: float | None = None, + rule: int | Sequence[int] = 1, + f: float = 0.0, + ties: str | Callable[[np.ndarray], float] = "mean", + na_rm: bool = True, +) -> dict[str, np.ndarray]: + """Faithful Python port of R's `approx` function. + + This implementation aims to replicate the behavior of R's `approx` + function, including its handling of `ties`, `rule`, `na_rm`, and + `method`. + + Args: + x: Coordinates, or y-values if `y` is None. + y: y-coordinates. If None, `x` is treated as `y` and `x` becomes `1..n`. + xout: Points at which to interpolate. + method: Interpolation method. "linear" (1) or "constant" (2). + n: If `xout` is not provided, interpolation happens at `n` + equally spaced points spanning the range of `x`. + yleft: Value to use for extrapolation to the left. + Defaults to `NA` (np.nan) if `rule` is 1. + yright: Value to use for extrapolation to the right. + Defaults to `NA` (np.nan) if `rule` is 1. + rule: Extrapolation rule. + - 1: Return `np.nan` for points outside the `x` range. + - 2: Use `yleft` and `yright` for points outside the range. + f: For `method="constant"`, determines the split point. 
+ `f=0` is left-step, `f=1` is right-step, `f=0.5` is midpoint. + ties: How to handle duplicate `x` values. + Can be 'mean', 'first', 'last', 'min', 'max', 'median', 'sum', + or a callable function. + na_rm: If True, `NA` pairs are removed before interpolation. + If False, `NA`s in `x` cause an error, `NA`s in `y` + are propagated. + + Returns: + dict with: + - 'x': numpy array of xout used + - 'y': numpy array of interpolated values + """ + # One-argument form: approx(y) -> x := 1..n, y := y + if y is None: + y_arr = _coerce_to_float_array(x) + x_arr = np.arange(1.0, y_arr.size + 1.0, dtype=float) + else: + x_arr = _coerce_to_float_array(x) + y_arr = _coerce_to_float_array(y) + if x_arr.size != y_arr.size: + raise ValueError("x and y must have same length") + + # --- Method normalization --- + # (matches R's pmatch() result: 1=linear, 2=constant) + if isinstance(method, str): + m = method.strip().lower() + if m.startswith("lin"): + method_code = 1 + elif m.startswith("const"): + method_code = 2 + else: + raise ValueError("invalid interpolation method") + else: + if method in (1, 2): + method_code = int(method) + else: + raise ValueError("invalid interpolation method") + + # --- Rule normalization --- + if isinstance(rule, list | tuple | np.ndarray): + rlist = list(rule) + if not (1 <= len(rlist) <= 2): + raise ValueError("`rule` must have length 1 or 2") + if len(rlist) == 1: + rlist = [rlist[0], rlist[0]] + else: + rlist = [rule, rule] + + # --- Regularize values --- + # Sort by x, collapse ties, and handle NAs like R's regularize.values() + r = _regularize_values(x_arr, y_arr, ties, na_rm) + x_reg = r["x"] + y_reg = r["y"] + not_na_mask = r["not_na"] + # no_na is True if we don't have to worry about NAs in y_reg + no_na = na_rm or (not r["kept_na"]) + # nx is the number of *valid* (non-NA) points for interpolation + nx = x_reg.size if no_na else int(np.count_nonzero(not_na_mask)) + + if np.isnan(nx): + raise ValueError("invalid length(x)") + + # R's C_ApproxTest logic + if nx <= 1: + if method_code == 1: + raise ValueError("need at least two non-NA values to interpolate") + if nx == 0: + raise ValueError("zero non-NA points") + + # --- Set extrapolation values (yleft/yright) --- + # This logic matches R's C code (R_approxtest) + if no_na: + y_first = y_reg[0] + y_last = y_reg[-1] + else: + # Find first and last non-NA y values + y_valid = y_reg[not_na_mask] + y_first = y_valid[0] + y_last = y_valid[-1] + + yleft_val = (np.nan if int(rlist[0]) == 1 else y_first) if yleft is None else float(yleft) + yright_val = (np.nan if int(rlist[1]) == 1 else y_last) if yright is None else float(yright) + + # --- Fabricate xout if missing --- + if xout is None: + if n <= 0: + raise ValueError("'approx' requires n >= 1") + if no_na: + x_first = x_reg[0] + x_last = x_reg[nx - 1] + else: + x_valid = x_reg[not_na_mask] + x_first = x_valid[0] + x_last = x_valid[-1] + xout_arr = np.linspace(x_first, x_last, num=int(n), dtype=float) + else: + xout_arr = _coerce_to_float_array(xout) + + # --- C_ApproxTest checks --- + if method_code == 2 and (not np.isfinite(f) or f < 0.0 or f > 1.0): + raise ValueError("approx(): invalid f value") + if not no_na: + # R re-filters x and y here if NAs remained + x_reg = x_reg[not_na_mask] + y_reg = y_reg[not_na_mask] + + # --- Interpolation --- + # This section vectorized the logic from R's approx1 and R_approxfun + yout = np.empty_like(xout_arr, dtype=float) + mask_nan_xout = np.isnan(xout_arr) + yout[mask_nan_xout] = np.nan + mask_valid = ~mask_nan_xout + + if x_reg.size == 
0: + # No valid points to interpolate against + yout[mask_valid] = np.nan + return {"x": xout_arr, "y": yout} + + xv = xout_arr[mask_valid] + left_mask = xv < x_reg[0] + right_mask = xv > x_reg[-1] + mid_mask = ~(left_mask | right_mask) + + res = np.empty_like(xv) + res[left_mask] = yleft_val + res[right_mask] = yright_val + + if np.any(mid_mask): + xm = xv[mid_mask] + + # Find indices + # j_right[k] = index of first x_reg > xm[k] + j_right = np.searchsorted(x_reg, xm, side="right") + # j_left[k] = index of first x_reg >= xm[k] + j_left = np.searchsorted(x_reg, xm, side="left") + + # Points that exactly match an x_reg value + eq_mask = j_left != j_right + # Points that fall between x_reg values + in_interval_mask = ~eq_mask + + res_mid = np.empty_like(xm) + + if np.any(eq_mask): + # For exact matches, use the corresponding y_reg value + # R's C code uses the 'j_left' index here + res_mid[eq_mask] = y_reg[j_left[eq_mask]] + + if np.any(in_interval_mask): + # R's C code sets i = j-1, where j is the 'right' index + j = j_right[in_interval_mask] + i = j - 1 + + # Get the surrounding x and y values + xi = x_reg[i] + xj = x_reg[j] + yi = y_reg[i] + yj = y_reg[j] + + if method_code == 1: # linear + t = (xm[in_interval_mask] - xi) / (xj - xi) + res_mid[in_interval_mask] = yi + (yj - yi) * t + else: # constant + # This matches R_approxfun's constant logic + if f == 0.0: + res_mid[in_interval_mask] = yi + elif f == 1.0: + res_mid[in_interval_mask] = yj + else: + # R computes (1-f)*yi + f*yj, but carefully + f1 = 1.0 - f + f2 = f + part = np.zeros_like(yi) + if f1 != 0.0: + part = part + yi * f1 + if f2 != 0.0: + part = part + yj * f2 + res_mid[in_interval_mask] = part + + res[mid_mask] = res_mid + + yout[mask_valid] = res + return {"x": xout_arr, "y": yout} diff --git a/main/como/density.py b/main/como/density.py new file mode 100644 index 00000000..0f75623a --- /dev/null +++ b/main/como/density.py @@ -0,0 +1,325 @@ +from __future__ import annotations + +import sys +from collections.abc import Sequence +from typing import Literal, NamedTuple, cast + +import numpy as np +import numpy.typing as npt +from scipy.interpolate import interp1d + +from como.approx import approx + + +class DensityResult(NamedTuple): + x: npt.NDArray[float] + y: npt.NDArray[float] + x_grid: npt.NDArray[float] + y_grid: npt.NDArray[float] + bw: float + n: int + + +NUMBER = int | float | np.number + + +def bin_distance(x: npt.NDArray[float], weights: npt.NDArray[float], lo: NUMBER, up: NUMBER, n: int) -> npt.NDArray[float]: + """Bin weighted distances.""" + ixmin: int = 0 + ixmax: int = n - 2 + delta: float = (up - lo) / (n - 1) + + y: npt.NDArray[float] = np.zeros((2 * n,), dtype=float) + for i in range(x.size): + i: int + xpos: float = (x[i] - lo) / delta + if xpos > sys.maxsize or xpos < -sys.maxsize: # avoid integer overflows (taken from R's massdist.c) + continue + ix: int = np.floor(xpos).astype(int) + fx: float = xpos - ix + wi: float = weights[i] + if ixmin <= ix <= ixmax: + y[ix] += (1 - fx) * wi + y[ix + 1] += fx * wi + elif ix == -1: + y[0] += fx * wi + elif ix == ixmax + 1: + y[ix] += (1 - fx) * wi + return y + + +def dnorm(x: float, mean: NUMBER = 0.0, sd: NUMBER = 1.0, log: bool = False, fast_dnorm: bool = False) -> float: + """Density function for the normal distribution. + + This is a reproduction of R's `density` function. + Neither SciPy nor NumPy are capable of producing KDE values that align with R, and as a result, + a manual translation of R's KDE implementation was necessary. 
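+
+    In effect this helper mirrors R's ``dnorm`` (the normal probability density) evaluated at a single
+    point; for example, ``dnorm(0.0)`` returns ``1 / sqrt(2 * pi)``, roughly 0.3989.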
+ + References: + 1) Documentation: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/Normal + 2) Source code (2025-OCT-30): https://github.com/wch/r-source/blob/3f7e2528990351bc4b0d1f1b237714668ab4c0c5/src/nmath/dnorm.c + + Args: + x: Value at which to evaluate the density. + mean: Mean of the normal distribution. + sd: Standard deviation of the normal distribution. + log: If True, return the log density. + fast_dnorm: If True, use a faster but less accurate calculation for small `x`. + + + """ + # Constants + m_ln2: float = 0.693147180559945309417232121458 # ln(2) + m_1_sqrt_2pi: float = 0.398942280401432677939946059934 # 1/sqrt(2pi) + m_ln_sqrt_2pi: float = 0.918938533204672741780329736406 # log(sqrt(2*pi)) + r_d__0 = float(-np.inf) if log else 0.0 # R's R_D__0: (log_p ? ML_NEGINF : 0.) + + # 11 bits exponent, where one bit is used to indicate special numbers (e.g. NaN and Infinity), + # so the maximum exponent is 10 bits wide (2^10 == 1024). + # dbl_min_exp = -1022 + dbl_min_exp = -1074 + + # The mantissa is 52 bits wide, but because numbers are normalized the initial binary 1 is represented + # implicitly (the so-called "hidden bit"), which leaves us with the ability to represent 53 bits wide mantissa. + dbl_mant_dig = 53 + + mean: float = float(mean) + sd: float = float(sd) + + if np.isnan(x) or np.isnan(mean) or np.isnan(sd): + return x + mean + sd + if sd < 0.0: + return float(np.nan) + if not np.isfinite(sd): + return r_d__0 + if not np.isfinite(x) and x == mean: + return float(np.nan) + if sd == 0.0: + return float(np.inf) if x == mean else r_d__0 + + # From this point on, dividing by `sd` is safe because we know it is not 0 + z = (x - mean) / sd + if (not np.isfinite(z)) and (x == mean): + return float(np.nan) + + if not np.isfinite(z): + return r_d__0 + + a = np.fabs(z) + if a >= 2 * np.sqrt(np.finfo(float).max): + return r_d__0 + if log: + return -(m_ln_sqrt_2pi + 0.5 * a * a + np.log(sd)) + if a < 5 or fast_dnorm: # for `x < 5`, this is more accurate but less fast + return m_1_sqrt_2pi * np.exp(-0.5 * a * a) / sd + + # underflow boundary + boundary = np.sqrt(-2.0 * m_ln2 * (dbl_min_exp + 1 - dbl_mant_dig)) + if a > boundary: + return float(0.0) + + # Now, to get full accuracy, split x into two parts, + # x = x1+x2, such that |x2| <= 2^-16. + # Assuming that we are using IEEE doubles, that means that + # x1*x1 is error free for x<1024 (but we have x < 38.6 anyway). + # If we do not have IEEE this is still an improvement over the naive formula. + a1 = np.ldexp(np.rint(np.ldexp(a, 16)), -16) + a2 = a - a1 + return (m_1_sqrt_2pi / sd) * (np.exp(-0.5 * a1 * a1) * np.exp((-a1 * a2) - (0.5 * a2 * a2))) + + +def nrd0(x: npt.NDArray[float]) -> float: + """Calculate nrd0 from R source. + + This bandwidth calculation matches R's + + References: + 1) Documentation: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/bandwidth + 2) Source code (as of 2025-OCT-30, copied below): https://github.com/wch/r-source/blob/trunk/src/library/stats/R/bandwidths.R + ```R + bw.nrd0 <- function (x) + { + if(length(x) < 2L) stop("need at least 2 data points") + hi <- sd(x) + if(!(lo <- min(hi, IQR(x)/1.34)))# qnorm(.75) - qnorm(.25) = 1.34898 + (lo <- hi) || (lo <- abs(x[1L])) || (lo <- 1.) 
+ 0.9 * lo * length(x)^(-0.2) + } + ``` + """ + if x.size < 2: + raise ValueError("need at least 2 data points") + + hi = np.std(x, ddof=1) + q75, q25 = cast(tuple[float, float], cast(object, np.percentile(x, [75, 25]))) + iqr_over_134 = (q75 - q25) / 1.34 + + # We are using a cascading series of checks on `lo` to make sure it is a non-zero value + lo = min(hi, iqr_over_134) + if lo == 0: + if hi != 0: + lo = hi + elif abs(x[0]) != 0: + lo = abs(x[0]) + else: + lo = 1.0 + + return float(0.9 * lo * x.size ** (-1 / 5)) + + +def density( + x: Sequence[NUMBER] | npt.NDArray[NUMBER], + bw: NUMBER | Literal["nrd0"] = "nrd0", + adjust: NUMBER = 1, + kernel: Literal["gaussian", "epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"] = "gaussian", + weights: Sequence[NUMBER] | None = None, + n: int = 512, + from_: NUMBER | None = None, + to_: NUMBER | None = None, + cut: int = 3, + ext: int = 4, + remove_na: bool = False, + kernel_only: bool = False, +) -> DensityResult: + """Compute kernel density estimates (KDE) using FFT method. + + This is a reproduction of R's `density` function. + Neither SciPy nor NumPy are capable of producing KDE values that align with R, and as a result, + a manual translation of R's KDE implementation was necessary. + + Args: + x: Input data points. + bw: Bandwidth for the kernel. If "nrd0", uses R's nrd0 method. + adjust: Adjustment factor for the bandwidth. + kernel: Kernel type to use. + weights: Optional weights for each data point. + n: Number of points in the output grid. + from_: Start of the grid (calculated automatically if not provided). + to_: End of the grid (calculated automatically if not provided). + cut: Number of bandwidths to extend the grid on each side. + ext: Number of bandwidths to extend the grid for FFT calculation. + remove_na: Whether to remove NA values from `x`. + kernel_only: If True, returns only the integral of the kernel function. + + Returns: + DensityResult: A named tuple containing: + x: The x-coordinates of the density estimate. + y: The estimated density values at the x-coordinates. + x_grid: The grid of x-coordinates used for FFT calculation. + y_grid: The density values on the FFT grid. + bw: The bandwidth used. + n: The number of points in the output grid. 
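+
+    Example:
+        An illustrative sketch (not part of the original module; values are made up)::
+
+            import numpy as np
+            rng = np.random.default_rng(0)
+            sample = rng.normal(size=1_000)
+            result = density(sample)  # bandwidth chosen via nrd0, evaluated on a 512-point grid
+            # result.x and result.y are analogous to R's density(sample)$x and $y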
+ """ + if not isinstance(x, Sequence | np.ndarray) or (len(x) < 2 and bw == "nrd0"): + raise ValueError("Need at at least two points to select a bandwidth automatically using 'nrd0'") + if kernel_only: + if kernel == "gaussian": + return 1 / (2 * np.sqrt(np.pi)) + elif kernel == "epanechnikov": + return 3 / (5 * np.sqrt(5)) + elif kernel == "rectangular": + return np.sqrt(3) / 6 + elif kernel == "triangular": + return np.sqrt(6) / 9 + elif kernel == "biweight": + return 5 * np.sqrt(7) / 49 + elif kernel == "cosine": + return 3 / 4 * np.sqrt(1 / 3 - 2 / np.pi**2) + elif kernel == "optcosine": + return np.sqrt(1 - 8 / np.pi**2) * np.pi**2 / 16 + + x: npt.NDArray[float] = np.asarray(x, dtype=float) + + has_weights = weights is not None + weights: npt.NDArray[float] | None = np.asarray(weights, float) if weights is not None else None + if has_weights and (weights is not None and weights.size != n): + raise ValueError(f"The length of provided weights does not match the length of x: {weights.size} != {n}") + + x_na: npt.NDArray[np.bool_] = np.isnan(x) + if np.any(x_na): + if remove_na: + x: npt.NDArray[float] = x[~x_na] + if has_weights and weights is not None: + true_d = weights.sum().astype(float) == 1 + weights = weights[~x_na] + if true_d: + weights = weights / weights.sum() + else: + raise ValueError("NA values found in 'x'. Set 'remove_na=True' to ignore them.") + + nx = n + x_finite = np.isfinite(x) + if np.any(~x_finite): + x = x[x_finite] + nx = x.size + if not has_weights: + weights: npt.NDArray[float] = np.full(shape=nx, fill_value=1 / nx, dtype=float) + total_mass = nx / n + else: + weights = np.asarray(weights) + if not np.all(np.isfinite(weights)): + raise ValueError("Non-finite values found in 'weights'.") + if np.any(weights < 0): + raise ValueError("Negative values found in 'weights'.") + wsum: float = weights.sum() + if np.any(~x_finite): + weights = weights[x_finite] + total_mass = float(weights.sum() / wsum) + else: + total_mass = float(1) + + n_user: int = n + n = max(n, 512) + if n > 512: # round n up to the next power of 2 (i.e., 2^8=512, 2^9=1024) + n: int = int(2 ** np.ceil(np.log2(n))) + + if isinstance(bw, str) and bw != "nrd0": + raise TypeError("Bandwidth 'bw' must be a number or 'nrd0'.") + elif isinstance(bw, str) and bw == "nrd0": + bw_calc = nrd0(x) + else: + bw_calc = float(bw) + if not np.isfinite(bw_calc): + raise ValueError("Calculated bandwidth 'bw' is not finite.") + bw_calc *= adjust + + if bw_calc <= 0: + raise ValueError("Bandwidth 'bw' must be positive.") + + from_ = float(from_ or min(x) - cut * bw_calc) + to_ = float(to_ or max(x) + cut * bw_calc) + + if not np.isfinite(from_): + raise ValueError("'from_' is not finite.") + if not np.isfinite(to_): + raise ValueError("'to_' is not finite.") + + lo = float(from_ - ext * bw_calc) + up = float(to_ + ext * bw_calc) + + y: npt.NDArray[float] = bin_distance(x, weights, lo, up, n) * total_mass + kords: npt.NDArray[float] = np.linspace(start=0, stop=((2 * n - 1) / (n - 1) * (up - lo)), num=2 * n, dtype=float) + kords[n + 1 : 2 * n] = -kords[n:1:-1] # mirror/negate tail: R's kords[n:2] will index from the reverse if `n`>2 + + # Initial diverge here (inside dnorm calculation) + kords: npt.NDArray[float] = np.asarray([dnorm(i, sd=bw_calc) for i in kords], dtype=float) + + fft_y: npt.NDArray[np.complex128] = np.fft.fft(y) + conj_fft_kords: npt.NDArray[np.complex128] = np.conjugate(np.fft.fft(kords)) + # Must multiply by `kords.size` because R does not produce a normalize inverse FFT, but NumPy normalizes by 
`1/size` + kords: npt.NDArray[np.complex128] = np.fft.ifft(fft_y * conj_fft_kords) * kords.size + kords: npt.NDArray[float] = (np.maximum(0, kords.real)[0:n]) / y.size # for values of kords, get 0 or kords[i], whichever is larger + xords: npt.NDArray[float] = np.linspace(lo, up, num=n, dtype=float) + + # xp=known x-coords, fp=known y-cords, x=unknown x-coords; returns interpolated (e.g., unknown) y-coords + interp_x: npt.NDArray[float] = np.linspace(from_, to_, num=n_user) + interp_y: npt.NDArray[float] = approx(xords, kords, interp_x) + + return DensityResult( + x=interp_x, + y=interp_y["y"], + x_grid=xords, + y_grid=kords, + bw=bw_calc, + n=n, + ) diff --git a/main/como/peak_finder.py b/main/como/peak_finder.py new file mode 100644 index 00000000..230db014 --- /dev/null +++ b/main/como/peak_finder.py @@ -0,0 +1,191 @@ +import re +from collections.abc import Sequence +from typing import Literal, cast, overload + +import numpy as np +import numpy.typing as npt +import pandas as pd + +from como.utils import _num_rows + + +def _validate_args( + x: npt.NDArray[np.number], + nups: int, + ndowns: int, + zero: Literal["0", "+", "-"], + min_peak_height: float, + min_peak_distance: int, + threshold: float, +): + """Validate input arguments to `find_peaks` function. + + Function created to reduce the complexity of `find_peaks` (ruff linting rule C901) + """ + if x.ndim != 1: + raise ValueError(f"Expected a 1D array, got {x.ndim}D array instead.") + if np.any(np.isnan(x)): + raise ValueError("Input x contains NaNs") + if nups < 0: + raise ValueError("Argument 'nups' must be non-negative") + if ndowns < 0: + raise ValueError("Argument 'ndowns' must be non-negative") + if zero not in {"0", "+", "-"}: + raise ValueError("Argument 'zero' must be '0', '+', or '-'") + if min_peak_height < 0: + raise ValueError("Argument 'min_peak_height' must be non-negative") + if min_peak_distance < 0: + raise ValueError("Argument 'minpeakdistance' must be non-negative") + if threshold < 0: + raise ValueError("Argument 'threshold' must be non-negative") + + +def _encode_signs(x: npt.NDArray[np.number], zero: str) -> str: + """Encode the signs of the differences in `x` into a string. + + Function created to reduce the complexity of `find_peaks` (ruff linting rule C901) + """ + diff_signs = np.sign(np.diff(x)) + chars = ["+" if d > 0 else "-" if d < 0 else "0" for d in diff_signs] + x_chars = "".join(chars) + if zero != "0": + x_chars = x_chars.replace("0", zero) + return x_chars + + +@overload +def _enforce_minimum_peak_distance(df: pd.DataFrame, min_peak_distance: int, inplace: Literal[True] = True) -> None: ... + + +@overload +def _enforce_minimum_peak_distance(df: pd.DataFrame, min_peak_distance: int, inplace: Literal[False] = False) -> pd.DataFrame: ... + + +def _enforce_minimum_peak_distance(df: pd.DataFrame, min_peak_distance: int, inplace: bool = True) -> None | pd.DataFrame: + """Enforces a minimum peak distance between identified peaks. 
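+
+    As used by ``find_peaks``, the dataframe is already sorted by peak height in descending order,
+    so suppression is greedy: each retained peak removes any smaller peak whose index lies within
+    ``min_peak_distance`` of its own.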
+ + Function created to reduce the complexity of `find_peaks` (ruff linting rule C901) + + Modifies `df` inplace + """ + good_peaks = np.ones(_num_rows(df), dtype=bool) + for i in range(_num_rows(df)): + if not good_peaks[i]: + continue + dpos = np.abs(df.at[i, "peak_idx"] - df["peak_idx"].values) + close = (dpos > 0) & (dpos < min_peak_distance) + good_peaks[close] = False + + if inplace: + df.drop(df.index[~good_peaks], inplace=True) + df.reset_index(drop=True, inplace=True) + return None + return cast(pd.DataFrame, df.iloc[good_peaks, :].reset_index(drop=True)) + + +def find_peaks( + x: Sequence[float] | Sequence[int] | npt.NDArray[np.number], + nups: int = 1, + ndowns: int | None = None, + zero: Literal["0", "+", "-"] = "0", + peak_pattern: str | None = None, + min_peak_height: float = 0.02, # default for R's zFPKM `peak_parameters` + min_peak_distance: int = 1, # default for R's zFPKM `peak_parameters` + threshold: float = 0.0, + npeaks: int = 0, + sortstr: bool = False, +) -> pd.DataFrame: + """Identify peaks in a given time series. + + This function is modelled after R's `pracma::findpeaks` function. + SciPy's `scipy.signal.find_peaks` provides different results than `pracma::findpeaks`, + resulting in the requirement for this translation. + + References: + 1) pracma::findpeaks: https://rdrr.io/cran/pracma/man/findpeaks.html + + Args: + x: numerical vector taken as a time series (no NA values allowed) + nups: minimum number of increasing steps before a peak is reached + ndowns: minimum number of decreasing steps after the peak (defaults to the same value as `nups`) + zero: can be '+', '-', or '0'; how to interprete succeeding steps of the same value: increasing, decreasing, or special + peak_pattern: define a peak as a regular pattern, such as the default pattern `[+]{1,}[-]{1,}` + If a pattern is provided, parameters `nups` and `ndowns` are not taken into account + min_peak_height: the minimum (absolute) height a peak has to have before being recognized + min_peak_distance: the minimum distance (in indices) between peaks before they are counted + threshold: the minimum difference in height between a peak and its surrounding values + npeaks: the number of peaks to return (<=0 returns all) + sortstr: should the peaks be returned in decreasing order of their peak height? 
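+
+    Example:
+        An illustrative call (input values invented for demonstration)::
+
+            find_peaks([0, 1, 3, 1, 0, 2, 5, 2, 0])
+            # two peaks are reported: height 3.0 at peak_idx 2 and height 5.0 at peak_idx 6,
+            # each spanning its rise and fall region via start_idx and end_idx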
+ + Returns: + A dataframe with the columns: + height: the height of the peak + peak_idx: the index (from `x`) of the identified peak + start_idx: the starting index (from `x`) of the identified peak + end_idx: the ending index (from `x`) of the identified peak + If the dataframe is empty, no peaks could be identified + """ + x = np.asarray(x, dtype=float) if not isinstance(x, np.ndarray) else x + npeaks: int = max(npeaks, 0) + ndowns: int = ndowns or nups + peak_pattern: str = peak_pattern or rf"[+]{{{nups},}}[-]{{{ndowns},}}" + _validate_args( + x=x, + nups=nups, + ndowns=ndowns, + zero=zero, + min_peak_height=min_peak_height, + min_peak_distance=min_peak_distance, + threshold=threshold, + ) + + # find peaks by regex matching the sign pattern + derivative_chars = _encode_signs(x=x, zero=zero) + matches: list[re.Match[str]] = list(re.finditer(peak_pattern, derivative_chars)) + if not matches: + return pd.DataFrame(columns=["height", "peak_idx", "start_idx", "end_idx"]) + + pattern_start_index: npt.NDArray[int] = np.asarray([m.start() for m in matches], dtype=int) + pattern_end_index: npt.NDArray[int] = np.asarray([m.end() for m in matches], dtype=int) + + num_matches: int = len(pattern_start_index) + peak_index: npt.NDArray[int] = np.zeros(num_matches, dtype=int) + peak_height: npt.NDArray[float] = np.zeros(num_matches, dtype=float) + + # for each match region, find the local max + for i in range(num_matches): + segment: npt.NDArray[np.uint64] = x[pattern_start_index[i] : pattern_end_index[i] + 1] + segment_max_idx: np.uint64 = np.argmax(segment) + peak_index[i] = pattern_start_index[i] + segment_max_idx + peak_height[i] = segment[segment_max_idx] + + # filter for values that are too low or below the threshold difference + x_left: float = x[pattern_start_index] + x_right: float = x[np.minimum(pattern_end_index, x.size - 1)] + valid_peaks: list[int] = list(np.where((peak_height >= min_peak_height) & ((peak_height - np.maximum(x_left, x_right)) >= threshold))[0].astype(int)) # noqa: E501 # fmt: skip + + if len(valid_peaks) == 0: + return pd.DataFrame(columns=["height", "peak_idx", "start_idx", "end_idx"]) + + out_df: pd.DataFrame = pd.DataFrame( + data={ + "height": peak_height[valid_peaks], + "peak_idx": peak_index[valid_peaks], + "start_idx": pattern_start_index[valid_peaks], + "end_idx": pattern_end_index[valid_peaks], + } + ) + + # sort by peak height in descending order (largest to smallest) + if sortstr or min_peak_distance > 1: + out_df.sort_values(by=["height"], ascending=False, ignore_index=True, inplace=True) + + # if provided, enforce a minimum distance between the peaks (removes smaller peaks next to larger ones) + if min_peak_distance > 1 and len(out_df) > 1: + _enforce_minimum_peak_distance(df=out_df, min_peak_distance=min_peak_distance) + + # if provided, limit the number of peaks returned + if 0 < npeaks < _num_rows(out_df): + out_df = cast(pd.DataFrame, out_df.iloc[:npeaks, :].reset_index(drop=True)) + + return out_df diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index bad4f70e..308cd9d9 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -99,7 +99,7 @@ class _ZFPKMResult(NamedTuple): density: Density mu: float std_dev: float - max_fpkm: float + fpkm_at_mu: float class _ReadMatrixResults(NamedTuple): @@ -257,7 +257,7 @@ def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: return metrics -def _calculate_fpkm(metrics: NamedMetrics, scale: int = 1e6) -> NamedMetrics: +def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> 
NamedMetrics: """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary. Args: @@ -318,92 +318,42 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: int = 1e6) -> NamedMetrics: return metrics -def _zfpkm_calculation( - column: pd.Series, - peak_parameters: PeakIdentificationParameters, - bandwidth: float = 1.0, - epsilon: float = 1e-10, -) -> _ZFPKMResult: - """Log2 Transformations. - - Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting - - Kernel Density Estimation (kde) - - SciKit Learn: https://scikit-learn.org/stable/modules/density.html - - Non-parametric method to estimate the probability density function (PDF) of a random variable - - Estimates the distribution of log2-transformed FPKM values - - Bandwidth parameter controls the smoothness of the density estimate - - KDE Explanation - - A way to smooth a histogram to get a better idea of the underlying distribution of the data - - Given a set of data points, we want to understand how they are distributed. - Histograms can be useful, but are sensitive to bin size and number - - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point - - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel, - and less weight to points farther away - - Kernel functions are summed along each point on the x-axis - - A smooth curve is created that represents the estimated density of the data - - Peak Finding - - Identifies that are above a certain height and separated by a minimum distance - - Represent potential local maxima in the distribution - - Peak Selection - - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu) - of the "inactive" gene distribution - - The peak representing unexpressed or inactive genes should be at a lower FPKM - value compared to the peak representing expressed genes - - Standard Deviation Estimation - - The mean of log2-FPKM values are greater than the calculated mu - - Standard deviation is estimated based on the assumption that the right tail of the distribution - This represents expressed genes) can be approximated by a half-normal distribution - - zFPKM Transformation - - Centers disbribution around 0 and scales it by the standard deviation. - This makes it easier to compare gene expression across different samples - - Represents the number of standard deviations away from the mean of the "inactive" gene distribution - - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak - - A zFPKM value of 0 represents the mean of the "inactive" distribution - - Research shows that a zFPKM value of -3 or greater can be used as - a threshold for calling a gene as "active" and/or "expressed" - : https://doi.org/10.1186/1471-2164-14-778 +def _zfpkm_calculation(col_fpkm: pd.Series, min_peak_height: float, min_peak_distance: int): + """ZFPKM Transformations. + + This function reproduces R's `zFPKM::zFPKM` function. + + References: + 1) zFPKM implementation in R: https://github.com/ronammar/zFPKM + 2) zFPKM publication: https://doi.org/10.1186/1471-2164-14-778 Args: - column: A pandas Series representing a single sample's FPKM values. - peak_parameters: Parameters for peak identification. - bandwidth: The bandwidth for the Kernel Density Estimation (KDE). 
- epsilon: A small value to add to FPKM values to prevent log2-divide-by-0 errors + col_fpkm: The raw FPKM values to perform zFPKM on + min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function + min_peak_height: Minimum height of peaks; passed on to `find_peaks` function Returns: - A named tuple containing the zFPKM values, density estimate, mean (mu), standard deviation, and maximum FPKM value. - + A named tuple containing the zFPKM values, density estimate, mean (mu), standard deviation, and maximum FPKM value. """ - log2values: npt.NDArray[float] = np.log2(column.values + epsilon) - - # 1D KDE *always* has one feature and many samples. scikit-learn expects data in the format of (n_samples, n_features) - # Thus, we use `.reshape(-1, 1)` because we know there is a single feature - # Even though the data is FPKM of many genes for a single sample, it is still one feature over many samples - # `-1` indicates the unknown dimension (the number of samples) - # `1` indicates the known dimension (the number of genes [also known as features]) - # https://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html#sphx-glr-auto-examples-neighbors-plot-kde-1d-py - kde: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(log2values.reshape(-1, 1)) - - x_range: npt.NDArray[float] = np.linspace(log2values.min(), log2values.max(), 2000).reshape(-1, 1) - density: npt.NDArray[float] = np.exp(kde.score_samples(x_range)) - peaks, _ = find_peaks(density, height=peak_parameters.height, distance=peak_parameters.distance) - peak_positions = x_range[peaks] - - mu = 0 - max_fpkm = 0 - stddev = 1 - if len(peaks) != 0: - mu = peak_positions.max() - max_fpkm = density[peaks[np.argmax(peak_positions)]] - u = log2values[log2values > mu].mean() - stddev = (u - mu) * np.sqrt(np.pi / 2) - zfpkm = pd.Series((log2values - mu) / stddev, dtype=float, name=column.name) - - return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) + # Ignore np.log2(0) errors; we know this will happen, and are removing non-finite values in the density calculation + # This is required in order to match R's zFPKM calculations, as R's `density` function removes NA values. 
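(Illustrative aside, not part of the patch: the body below follows the referenced zFPKM publication. `mu` is the log2-FPKM position of the right-most identified density peak, `fpkm_at_mu` is the density value at that peak, the standard deviation is estimated from the right tail under a half-normal assumption, and zFPKM is the centred and scaled log2-FPKM. A toy numeric sketch with invented values:)

    import numpy as np

    log2fpkm = np.array([-6.0, -4.5, -4.0, 1.5, 3.0, 4.2])
    mu = -4.5                                  # right-most density peak (assumed for this sketch)
    u = log2fpkm[log2fpkm > mu].mean()         # mean of the values to the right of the peak
    sd = (u - mu) * np.sqrt(np.pi / 2)         # half-normal estimate of the spread
    zfpkm = (log2fpkm - mu) / sd               # standard deviations above the "inactive" peak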
+ with np.errstate(divide="ignore", invalid="ignore"): + log2fpkm: npt.NDArray[float] = np.log2(col_fpkm.values).astype(float) + d = density(log2fpkm) + + peaks: pd.DataFrame = find_peaks(d.y_grid, min_peak_height=min_peak_height, min_peak_distance=min_peak_distance) + peak_positions = d.x_grid[peaks["peak_idx"].astype(int).tolist()] + + sd = 1.0 + mu = 0.0 + fpkm_at_mu = 0.0 + if peak_positions.size > 0: + mu = float(peak_positions.max()) + u = float(log2fpkm[log2fpkm > mu].mean()) + fpkm_at_mu = float(d.y_grid[int(peaks.loc[np.argmax(peak_positions), "peak_idx"])]) + sd = float((u - mu) * np.sqrt(np.pi / 2)) + zfpkm = pd.Series((log2fpkm - mu) / sd, dtype=float, name=col_fpkm.name, index=col_fpkm.index) + return _ZFPKMResult(zfpkm=zfpkm, density=Density(d.x_grid, d.y_grid), mu=mu, std_dev=sd, fpkm_at_mu=fpkm_at_mu) def zfpkm_transform( From 95dc2db16a69feaeeee43bdc0580eee5a1e0c771 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:40:24 -0600 Subject: [PATCH 26/52] refactor: do not drop na values, keep as much data for as long as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 308cd9d9..dec2e78d 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -166,18 +166,18 @@ async def _build_matrix_results( Returns: A dataclass `ReadMatrixResults` """ - matrix.dropna(subset="ensembl_gene_id", inplace=True) conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) - # If any one column was - if any(conversion[col].eq("-").all() for col in conversion.columns): + # If all columns are empty, it is indicative that the incorrect taxon id was provided + if all(conversion[col].eq("-").all() for col in conversion.columns): logger.critical(f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - is '{taxon}' the correct taxon ID for this data?") - conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") - conversion = conversion.explode("ensembl_gene_id") - conversion.reset_index(inplace=True, drop=True) - conversion = conversion[conversion["entrez_gene_id"] != "-"] # drop missing entrez IDs - conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int) # float32 is needed because np.nan is a float + # 2025-NOV-3: commented out `conversion` types to evaluate if it can be skipped + # conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") + # conversion = conversion.explode("ensembl_gene_id") + # conversion.reset_index(inplace=True, drop=True) + # conversion = conversion[conversion["entrez_gene_id"] != "-"] # drop missing entrez IDs + # conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int) # float32 is needed because np.nan is a float # merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol" merge_on: list[str] = list(set(matrix.columns).intersection(conversion.columns)) @@ -195,11 +195,11 @@ async def _build_matrix_results( matrix = matrix.merge(conversion, on=merge_on, how="left") # drop rows that have `0` in `entrez_gene_id` column - matrix = matrix[matrix["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) - gene_info = gene_info[gene_info["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) + # matrix = matrix[matrix["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) + # gene_info = 
gene_info[gene_info["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) gene_info = gene_info_migrations(gene_info) - gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) + # gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) counts_matrix = matrix.merge( gene_info[["entrez_gene_id", "ensembl_gene_id"]], From bb1449a69e98addeef68003d06f78198c8b1a747 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:41:15 -0600 Subject: [PATCH 27/52] refactor: explicit type cast Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index dec2e78d..2c3e07d2 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -219,7 +219,7 @@ async def _build_matrix_results( study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist() layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist() metrics[study] = _StudyMetrics( - count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], + count_matrix=cast(pd.DataFrame, counts_matrix[counts_matrix.columns.intersection(study_sample_names)]), fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values.astype(float), sample_names=study_sample_names, layout=[LayoutMethod(layout) for layout in layouts], @@ -393,6 +393,7 @@ def zfpkm_transform( zfpkm_series: list[pd.Series] = [] results: dict[str, _ZFPKMResult] = {} + slim_fpkm_df: pd.DataFrame = cast(pd.DataFrame, fpkm_df[fpkm_df.index != "-"] if remove_na else fpkm_df) with ProcessPoolExecutor(max_workers=cores) as pool: futures: list[Future[_ZFPKMResult]] = [ pool.submit( @@ -523,7 +524,7 @@ def cpm_filter( for sample, metric in metrics.items(): counts: pd.DataFrame = metric.count_matrix entrez_ids: list[str] = metric.entrez_gene_ids - library_size: pd.DataFrame = counts.sum(axis=1) + library_size: pd.DataFrame = cast(pd.DataFrame, counts.sum(axis=1)) # For library_sizes equal to 0, add 1 to prevent divide by 0 # This will not impact the final counts per million calculation because the original counts are still 0 @@ -595,8 +596,8 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO # Only keep `entrez_gene_ids` that pass `min_genes` metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes, strict=True) if keep] metric.gene_sizes = np.array(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep) - metric.count_matrix = metric.count_matrix.iloc[min_genes, :] - metric.normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] + metric.count_matrix = cast(pd.DataFrame, metric.count_matrix.iloc[min_genes, :]) + metric.normalization_matrix = cast(pd.DataFrame, metrics[sample].normalization_matrix.iloc[min_genes, :]) keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep] metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep] From 54802460f7b10798d73b89865747dc8ff6f3c418 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:42:02 -0600 Subject: [PATCH 28/52] refactor!: provide default min peak height and distance BREAKING-CHANGE: instead of providing a type `PeakIdentificationParameters`, simply provide the min peak height and distance directly Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 17 +++++++++-------- 1 file changed, 9 
insertions(+), 8 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 2c3e07d2..e3d43ab8 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -358,16 +358,16 @@ def _zfpkm_calculation(col_fpkm: pd.Series, min_peak_height: float, min_peak_dis def zfpkm_transform( fpkm_df: pd.DataFrame, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, + min_peak_height: float = 0.02, + min_peak_distance: int = 1, update_every_percent: float = 0.1, ) -> tuple[dict[str, _ZFPKMResult], DataFrame]: """Perform zFPKM calculation/transformation. Args: fpkm_df: A DataFrame containing FPKM values with genes as rows and samples as columns. - peak_parameters: Parameters for peak identification in zFPKM calculation. - bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + min_peak_height: Minimum height of peaks; passed on to `find_peaks` function. + min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function. update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). Returns: @@ -398,12 +398,13 @@ def zfpkm_transform( futures: list[Future[_ZFPKMResult]] = [ pool.submit( _zfpkm_calculation, - column=fpkm_df[column], - peak_parameters=peak_parameters, - bandwidth=bandwidth, + col_fpkm=fpkm_df[column], + min_peak_height=min_peak_height, + min_peak_distance=min_peak_distance, ) - for column in fpkm_df + for column in slim_fpkm_df ] + for i, future in enumerate(as_completed(futures)): result = future.result() key = str(result.zfpkm.name) From 038a5feeca61aba05f5812725e6d4fc0128bfeca Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:42:32 -0600 Subject: [PATCH 29/52] feat: remove NA values by default (replicates R functionality) Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index e3d43ab8..87c82a6a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -361,6 +361,7 @@ def zfpkm_transform( min_peak_height: float = 0.02, min_peak_distance: int = 1, update_every_percent: float = 0.1, + remove_na: bool = True, ) -> tuple[dict[str, _ZFPKMResult], DataFrame]: """Perform zFPKM calculation/transformation. @@ -369,6 +370,7 @@ def zfpkm_transform( min_peak_height: Minimum height of peaks; passed on to `find_peaks` function. min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function. update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). + remove_na: Whether to remove NaN & blank values from the input DataFrame before processing. Returns: A tuple containing: From 31cb4d4f590a035f8bcd0c6cb71e0e18df4952fd Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:42:47 -0600 Subject: [PATCH 30/52] fix: do not build a list of `None` Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 87c82a6a..26803c4e 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -440,7 +440,7 @@ def zfpkm_plot(results: dict[str, _ZFPKMResult], *, output_png_dirpath: Path, pl subplot_titles: Whether to display facet titles (sample names). 
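(Illustrative aside, not part of the patch: a sketch of calling `zfpkm_transform` with the keyword arguments introduced by this patch; the tiny input frame is invented and only shows the shape of the call.)

    import pandas as pd
    from como.rnaseq_gen import zfpkm_transform

    fpkm_df = pd.DataFrame({"sample_a": [0.0, 1.2, 8.5, 30.1], "sample_b": [0.1, 0.9, 7.7, 25.4]})
    results, zfpkm_df = zfpkm_transform(fpkm_df, min_peak_height=0.02, min_peak_distance=1)
    # `results` maps each sample name to its _ZFPKMResult; `zfpkm_df` holds the per-sample zFPKM columns.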
""" - to_concat: list[pd.DataFrame] = [None] * len(results) # type: ignore # ignoring because None is not of type pd.DataFrame + to_concat: list[pd.DataFrame] = [] for name, result in results.items(): stddev: float = float(result.std_dev) x: npt.NDArray[float] = result.density.x.flatten() From 2f1727908158dec6a0538eae5d6efabbbdb25f5b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:43:14 -0600 Subject: [PATCH 31/52] fix: plot gaussian distribution with a peak of the fpkm value at `mu`, not at the maximum fpkm distribution Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 26803c4e..250e429f 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -447,13 +447,13 @@ def zfpkm_plot(results: dict[str, _ZFPKMResult], *, output_png_dirpath: Path, pl y: npt.NDArray[float] = result.density.y.flatten() fitted: npt.NDArray[float] = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) - max_fpkm: float = float(y.max()) + fpkm_at_mu: float = result.fpkm_at_mu max_fitted: float = float(fitted.max()) - scale_fitted: float = fitted * max_fpkm / max_fitted - to_concat.append(pd.DataFrame({"sample_name": name, "log2fpkm": x, "fpkm_density": y, "fitted_density_scaled": scale_fitted})) + scale_fitted: float = fitted * fpkm_at_mu / max_fitted + to_concat.append(pd.DataFrame({"sample_name": name, "log2fpkm": x, "fpkm_density": y, "zfpkm_density": scale_fitted})) mega_df = pd.concat(to_concat, ignore_index=True) - mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) + mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "zfpkm_density"]) mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") fig: plt.Figure From 2d6b20f396ccb49bdf7600503d6dec078d069f59 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:44:10 -0600 Subject: [PATCH 32/52] refactor!: provide default min peak height and distance Signed-off-by: Josh Loecker --- main/como/proteomics_preprocessing.py | 18 +++---- main/como/rnaseq_gen.py | 46 ++++++++++-------- .../proteomics_data_inputs_paper.xlsx | Bin 6136 -> 10147 bytes 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/main/como/proteomics_preprocessing.py b/main/como/proteomics_preprocessing.py index 39d4c85c..44c21f6d 100644 --- a/main/como/proteomics_preprocessing.py +++ b/main/como/proteomics_preprocessing.py @@ -21,11 +21,11 @@ class ZResult: """Dataclass to store results of Z-score calculation.""" zfpkm: pd.DataFrame - x_range: pd.DataFrame # npt.NDArray[np.float64] - density: pd.DataFrame # npt.NDArray[np.float64] - mu: npt.NDArray[np.float64] - std_dev: npt.NDArray[np.float64] - max_fpkm_peak: npt.NDArray[np.float64] + x_range: pd.DataFrame # npt.NDArray[float] + density: pd.DataFrame # npt.NDArray[float] + mu: npt.NDArray[float] + std_dev: npt.NDArray[float] + max_fpkm_peak: npt.NDArray[float] def z_score_calc(abundance: pd.DataFrame, min_thresh: int) -> ZResult: @@ -42,11 +42,11 @@ def z_score_calc(abundance: pd.DataFrame, min_thresh: int) -> ZResult: log_abundance_filt = np.log2(values[values > min_thresh]).reshape((len(abundance), len(abundance.columns))) log_abundance = np.log2(values) - # np.zeros((1000, len(abundance.columns)), dtype=np.float64), + # np.zeros((1000, len(abundance.columns)), dtype=float), z_result = ZResult( - 
zfpkm=pd.DataFrame(data=np.nan * np.ones_like(values), index=abundance.index, columns=abundance.columns, dtype=np.float64), - x_range=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=np.float64), - density=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=np.float64), + zfpkm=pd.DataFrame(data=np.nan * np.ones_like(values), index=abundance.index, columns=abundance.columns, dtype=float), + x_range=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=float), + density=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=float), mu=np.zeros(len(abundance.columns)), std_dev=np.zeros(len(abundance.columns)), max_fpkm_peak=np.zeros(len(abundance.columns)), diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 250e429f..b760041c 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -616,9 +616,9 @@ def zfpkm_filter( filtering_options: _FilteringOptions, calculate_fpkm: bool, force_zfpkm_plot: bool, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, - output_png_filepath: Path | None, + min_peak_height: float, + min_peak_distance: int, + output_png_dirpath: Path | None, ) -> NamedMetrics: """Apply zFPKM filtering to the FPKM matrix for a given sample. @@ -627,9 +627,9 @@ def zfpkm_filter( filtering_options: Options for filtering the count matrix. calculate_fpkm: Whether to calculate FPKM from counts. force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - peak_parameters: Parameters for peak identification in zFPKM calculation. - bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. - output_png_filepath: Optional filepath to save the zFPKM plot. + min_peak_height: Minimum peak height for zFPKM peak identification. + min_peak_distance: Minimum peak distance for zFPKM peak identification. + output_png_dirpath: Optional directory path to save zFPKM plots. Returns: A dictionary of filtered study metrics. @@ -643,9 +643,15 @@ def zfpkm_filter( metric: _StudyMetrics # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix - matrix = matrix[matrix.sum(axis=1) > 0] # remove rows (genes) that have no counts across all samples - results, zfpkm_df = zfpkm_transform(matrix, peak_parameters=peak_parameters, bandwidth=bandwidth) + # TODO: 2025-OCT-31: Re-evaluate whether to remove rows with all 0 counts + # matrix = matrix[matrix.sum(axis=1) > 0] # remove rows (genes) that have no counts across all samples + + results, zfpkm_df = zfpkm_transform( + fpkm_df=matrix, + min_peak_height=min_peak_height, + min_peak_distance=min_peak_distance, + ) zfpkm_df[(matrix == 0) | (zfpkm_df.isna())] = -4 if len(results) > 10 and not force_zfpkm_plot: @@ -683,8 +689,8 @@ def filter_counts( filtering_options: _FilteringOptions, prep: RNAType, force_zfpkm_plot: bool, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, + zfpkm_min_peak_height: float, + zfpkm_min_peak_distance: int, output_zfpkm_plot_dirpath: Path | None = None, ) -> NamedMetrics: """Filter the count matrix based on the specified technique. @@ -696,8 +702,8 @@ def filter_counts( filtering_options: Options for filtering the count matrix. prep: The RNA preparation type. 
force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - peak_parameters: Parameters for peak identification in zFPKM calculation. - bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. + zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. Returns: @@ -714,8 +720,8 @@ def filter_counts( filtering_options=filtering_options, calculate_fpkm=True, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=peak_parameters, - bandwidth=bandwidth, + min_peak_height=zfpkm_min_peak_height, + min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, ) case FilteringTechnique.UMI: @@ -725,8 +731,8 @@ def filter_counts( filtering_options=filtering_options, calculate_fpkm=False, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=peak_parameters, - bandwidth=bandwidth, + min_peak_height=zfpkm_min_peak_height, + min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, ) case _: @@ -751,8 +757,8 @@ async def _process( technique: FilteringTechnique, cut_off: int | float, force_zfpkm_plot: bool, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, + zfpkm_min_peak_height: float, + zfpkm_min_peak_distance: int, output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, output_zfpkm_plot_dirpath: Path | None, @@ -792,8 +798,8 @@ async def _process( filtering_options=filtering_options, prep=prep, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=peak_parameters, - bandwidth=bandwidth, + zfpkm_min_peak_height=zfpkm_min_peak_height, + zfpkm_min_peak_distance=zfpkm_min_peak_distance, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, ) diff --git a/main/data/config_sheets/proteomics_data_inputs_paper.xlsx b/main/data/config_sheets/proteomics_data_inputs_paper.xlsx index ceedca03f451c0c4d4a10c92c9c5f7699d69f2c9..79c1f230d7c5772b9ba31e53cc6ca196ec00277e 100644 GIT binary patch literal 10147 zcmeHN^P`?ujhEq z_b<5j{bBFfv*%f#J@5NGYpuQ3t0)5thXX(WAOQdXG5{9-@{$P*0Dun<0AK@<9_fgJ zZJj{2PWq~Db|6PxW>*_)(hqQtXtMy1puhj$_%EJ;p15(_E>^U@!`So3YUL_KtE_U^ z0p0h+Jh_#K1B9X!QwpM_|U!C@zO{kk?ZjC}Fq;%e$O>^@VnQ zmN!vE4V4R#?;FGxwE5I0CruljjZm;lmWc_ZV0GU@6=I-dg0ax?xdewE&Wc`KmR>{t z!C}zlF}ztmeyLmsQ6I2sI(zAA!65NsC@(}plv^oUPqiE0!Dh1{Dh%Tguh%g8v9wTX zGD^+I1ayt84xordDNJFdxOSKXLH)MiYNoGJexA4C`*TmJOR>mKA$#Bo|NHN5UUV@d z(S8&*jH`Lx-dF-#uS<}wBgkF8^}={rlB}=Z@P`PJx;XexmsEgfxcM;{;iAzzd8}sh z9^P!w*}a*|PUj$W3aXcNv^j&y@8JO!p!gSY)~c~moIx2U3)LPfRGj(_AZte!<{!uZ ziSoZ#iGTX*r7>~}-K=PVhmyBJ0~a$(?}0C6Ttp86Z1f=PV7w!r`Z=b14f%n|81DQEbekJWj z`=&LDzNDcrTV`;DLSp(@tP(iRtU-v3oktw_L?GEmyH8GQ+32zyX6B{Jepz5eJx|t7 z?0Bm8Y(oBbbfI8A*}bV`+yO@;v$=B50c$eI6``7nIiE#^QHH~FayNYw>yA^g)OPGU zPbT@a0cCPdtV{OsSN&935O3`o&g0>9w;mqk?vjC{LEkWv!ewZj`nyPOD+h=1U;zLa zL;wH-Iy0`;EG}RNDPPg-Ra;S5UYeH<^|xeD_)`g3R4g=0*pm;I$aOEHJ9l+~cy5z*TzHOB@s(7H=i=!NDn2R^z`xP*vyggO?|GXErkhb`u_DD{^o znPpCxCB=uOXUQ@6Ql?85%*5j=o@n*%-TDHo&Rk^ya1425PmS1@hp(ox(mrZ3EukYd z`YG7sNVKw^@r&SNe{WD4IZjr8?9_0&o0DQyVHSW8ki&3a@FY;w(5J0K179nlBCpk? 
zT(~xxj#`M3SHa6lN7BLK44lCyV~V$p7f~oVMof{c&QRHAQ#=-fmjVicKOrdwC1b$* zXx4ZxK-%qL=BdBOxqV0Th0NU&nQW&liIxjvUX{mx@dQSvhz1rTeO++lw!>+xE{OR7$DKh!&@+Z-;N*gMzBp*~$gCWs~Y6rTCQRE5lQw@r)ZS!YWLT9M&f#ZzTQ zba1NisZU3i6$~>;ztRh$J|&=>IZM`=6HwOUPNafYmof0nggY~xF(Zqb@Dtp7GbZF+ zG|#&7%bjZxzD(N(1p13dE@V~M7ZvzIJTh5TJZ(na}(*U&q!pgJN1Ai+R2^hd1u zvy%QDcVM8&AXJI}-J>OD)Ut;a&Hpf<&2O^RKIUF==(z`7Kx^FsB5Ng9kesz~k&xR> zO56c=MQ-9m4khO)+lc86-`V<83g=}y+A3nWruo#dS`@tt1&s^R;98ERi^sU(k`-ml zEZpN?tmG9;nuW=Wc$E#3yZz!+b%;4Zf>|<&ef=VigvJbhmDHfsW zN){&|#r+tfohcdb@L7$-p@(bnc-3u!H(f z%xPR_o3nwk<`U|C@u8IXm-lrv2Z5X%S$>|_ez@VZ*oDXiQefaA1LP_2f=!T8eKxdu z=fHBNwW!z@V-_yT7>AYly-G=&2cj}fv;%e-$QW(pV!Y$<>Q0ZL3Vt{DyHyGm9n}Qt z^njPz^x;Yd4GFD%#s#`Yeo)G@j>((e8<*4cN~>8IS6~0-nVM6tLVU*$u%o(02;c1U zxzrt_GT-zr8u;+ANirl9SPXI|3J74)d!TmLl;Iqi^hk&vMgdVnJw{fX9qcIN!gTCX z79gfcNI8gNjCnEl!0b44rPfk+<11yur5o!Z_&J>xN6%0t>PUS$3%XMKdEo$U)RN&${(m@xj|3r*ojvO;@g?>tlt=InrR+OXuA0Z>{rc5aC63Ne=?uqHBuZz019(E8^=KAcbTNgV9^)xIeJG(K-C^G`O%l*k{GSf zj#1nSahf?oE9eOCb<&5(Fhh|bRF?W9B$PxUgS!I_)jsB-s@MaN_sAOG{B0PO%)#np zVP@5_E-)|GCjlJSjz1Dpm6i_@Jc2 zw!N&b?*qF|ldEQijb_u2kNFiR*4^w}PvYmE<1Hm3`Rs#}M8vwOmslyt)mK?aKzcrcRpCi0`dCb}Ur6 z)t=G%D;cHmwA^z1@yLx>Ra)*G{(j8M#L2m&T>11HBf<-B_N{i9(+{v^t?^m)y1>zM1E1Ho58>KAKb)jNb~@m{Up zX@=&WVanJzy+BixE~U@ZNI!M3s5oeChz{Ko(BoU~l`;+1r`0NJK`jp<_#x5Z6Eo|+ zTtcZth<*Sj&q%j=0j#I^CRj#F_LcfQ3d1r|^HW2MX3xe+XYq%V2OHlU$)}li(M{Fo zQOsMlec1A9m>6>6LZK&K(sM?Fm)iw_y1c2eLSu7aX~N4iG_#2z;<7Z^CMVnU1fd+> zSGE9H`ndo};<_UMLgLE!#CAI(RoVwMBImJfn$d#J&-yR4h5#62w^Pp6v9wfNoDRn8 zT8bvqY_~J}ffzSGL0{+~RDIKM9P%bun)l;37mO;OtSBQFol(-11YGvom9L^CFt{uo2&}r*78i zS;Jtl1^3fEfmWz5>`9os4sJu=J$MH)1fikwCT3|@g-<8 zU{lFx>n5k=XBz*<6GkmTMTr)mPjv#XVZM)0C|VN_M6&PZMAswDv9+4~6urwHQb2?5 zE&2iPvP;R+L0)isnfuZpFWZat>1S=>rh(6MSDQ*@ok(2!Y59Aon#@ySYUjG0Rc2~$ zyy=+cQ9xS-DO%e-xg#{ouRcwtrkG6FG}ubiM?Gd`^l#OsIzT;Qq))(+CE5ZL`yYH( zfibf{*}@h7er1l;CDd_1E|To1w)AT1r02P}GdBN==?{ z@glY)DI1lXa#Ly+_2BXu`C$0CMb)0W0`*tv1ySWSlSl#u$ypuj?{6IrzeXLb!{SCK zk-k^hfH`oFR{kzhOK-*+zWFpG2F^>z4$~OE-bgDQ)f3s>SRMUsyH*67+>5wX{7-_~ zjNihuLe#%6Ezpv%HOR$%g5{tdtFbha!r!}nF@e;W082dqZ>o#2m3%sk+^GD_NR$3D zd-rN+UtTGPIp^J*Bx9MZy{YWxX%tKDw%qo*ZoY4vH-<}7VBzHZ>jRER{v5;ZE+WN1 z@ZPG2*O$Xjj4d#r%dN+4f;+Wc(E&VJ!IK-FcuUr`3!}uX0gSx-PF?(?`74B5M~~nq zDo=A_-B)RJW?RUI6`>w3ivVxsgafwRw~?YpSZZVZ9K-)8RY zZorI@7)Z=gR(c&`2WS?8l4EdkYO@{y?N+RfO`eq&*Lz6V_JiATFCI(A1M1UBP%gV# z!XlY`0;rMX{=Wa=4&w5y{o#HAL;rp-A`x_3;(dQVQ@?T-=dNiPK&N+qRidHq4LM=) zzPu;Ve_nF|myxiBO^6*LKS1R^V(cnxjFKvp9GWU6OD@ZVOzdMDv=bf5xH?G4fmg0z zjV*}#9`w57ro`;YY_()5j`PSZYbety+ji-Az%&2@@|p@&L49yapiwj4K8mf)Zd9Nn zWOH?rKHyARmmU$b(maDbjkbjnSxNGtZyeeF#gbW^_8WnGj!r6l+A{+2x3cNf$rDAm zk^vKB5uGencp`SSpwzDgh2zP)ML`=~5?xgtBO+celR$kpu6cHQ%6y${3`Dm`B?ICo z)=Z|LMM?Ujvl!M$2}klgnXTw|+f00SWPv?fPB&~i{oKm5?gvwD{(LMHLARd3HC|_j zd@0pFFZyvZF}CNML-0ad=O~jGtB7fP&KD}uj4thhm@jokz>Kf;n@^N!J&xu@a5)oTqLSj>F2A+<-di z1BFHtJm&CSK=86g#Frtu`U*UoH3Zu5Qg@pC!QIeIY(}{DTQ|#040T8YZo5otHOoh0 zy%LA>HLrl$QQ*fIl}8JMw(CtyHnuWEhYxKvbM*;e)RYC^t-37xR^PYpee(+qvs&o1 z1@e!v5BF~vX^qV+M+c%g%Sezva-FiFrDVie$lBh#vw%yyb~AW{T95N(fV6W4To^h= zp7zq4mQ{BoZ$yoSO1ja%E}DS*PSruK-a@c!-B@sZK0UvE!;Csk6ZfkZETO_2Z%wgk zpl#iW$S7EJsX{kJeK+u3LMw8}lnHl8il3Mj$0Z?FljFIUrV}yay!Kt68h<{(Qs-lb z2aj((qk?Z>P4V@#TuX&#DaAx`@d~gU8P}Zxf;KDiE8dZ?Vj9SZcR3qNS zONGji>VF(hcVq$w?sjuZxH%BQwc~*}7uXevE2J7#6QU57od+G&%9GDJz{(6I`%-!QmVIf#XWstWgkyNC)`8ND)`JuI~}_KGGy6sgC2 z5ps44?)IUg4l&tyUg6|jwDGca)yra|W4A7G&`89zc<*8=xX$SfDxILrlQI2>%LTt$ zN`0T^@efy2DOK^V;bc98Wx^gV@>IkmaLFaxIAh&)$)&;VR{Sa%cV(jGicQ0r1ZYiGUEh@tD-K)s+q^!0PGC6im#?(Ywh#xW9#~(my}80F zF~Y0W1U25ckL_E>JcTQEOm)3zZ5Jy`8u^eOYMI 
zZie_RLJDX2d-5wdeZY^$hRqf{uhC-qf%2FdjQsni$cxES>;kxb^(un;+ljKKA=Iah z%+l4W_eqSvbbaU62{pd0pnf;54lLo+6g>4W>JwV~bH47?!{XKO&A4MjU#uX-OfB(W zh^uV_4b$3HUaz_O9d5U(+B7ohI@ zrgh~yLhmh*8v(a{314lewn^WD$Guki<76Rxj>Ia)&DyrGg1G&}K$|Ciw*^pVdqpVe zI2}y7KI$`@P7qbH==oKVYnf+77;fzzOxh7G%s>nesla4gtRZR-!gaCHayKh3fs{b!`TTSEm8a(}D_6~q?$gpf=OdcsYJoW=K#F{6 zUHjU>quB!TBtw&@YPh)Cm1rbMy*yVpE=A6WozLpS`QU&+fAq#C;fx?Uzcz>*iaEpv$!fkdC`J9aK*cXvr9gat)R z1(w4r+Tq>UgmAyNu^Jyni-&)Qbl z%Joq!1=xMQI2?3Z^La>1btm0{lo|-psBl%kZl=FwA=z`;EV}bKR~4r0p;W+)FTDVx z7jy+#(l_#+b$&q{V&Zyob`W)ibc`Go7J&Ua*zx*{nXhn$uY4qd+B?6<^XY7u0Gt|C zjOqQG{)33@AX<3|M9D-}h1q%4i7wH#0>umzb$(LIOVeGA0;0?ET1QgH4D~eInh#xx zQ+{dav;`(FUtDuPsOuZlb1Cy{NnR*|@Dcj-y9WJVMw&*=@;~zq|7_e&@T3+(B}f`+ zuK!-J;FaDEn)kUQU6yzx(7g3)mpufr>z!6@H?Gs&`fa##qT|>j9;M2Z1AmX_4R2A3Ra>9-7tm+uJWu8Fo76yam=F{$G=** zrivLVz7A4iO`g(XP1U0ji55Gye6F=jBhJ2_(Q+byB!vTE38TqmBuLH>SdFr}8_L@Hv*HdT!=kvTP6Ec5lbL_KL;bN^MFiPz<;1HmOtV%921Zq|I9 zB57?~^(O{bkGCyN0lu->W8oQ!op|hfXPW^g^5XY0r3R98i^!e(@5YY$5Ob}U-#Srm zvYq>iicKHCOIoDescOL-)AaoS!KQ+Dj!Yl$lD)Kpn|V@WQUaQaus7^6`>Gh4n|GQ0 z>J;Y=C*gg#!0KIFkzR40rTWI9{adf`^Ohz(`ZonRW^K~RSM)|(yLauNz3*Z~cMsfR zg75F4!u|UjwX&a|WDs;dzCopp2`!hJfQ=O$z;=!-#$X4~4{1ZYbN{Q7LdoK(JPd6a z16wq9gv8nqX?&DT0!V9dgHUp%-)Td(=206c>p-cqE3@{bKdzSuqD;+rNS+q6&-+xq z93+$GM-CNc?+tj3K4~NviAY-&DJ?HUWf&Z3Fj?5pKAcP~OW?{O?PveUdLurUN)}^s zo==Xo&Ke!J_UZWhXRqm1l(uE6yHcvqbz#P0a-$gR;#6MQGvV*5W2RH`Sy4eg$BDe- zK%-J6P4Bg!>BTFDRhbhKg4+nyLU#G&l^?rgWiv)t0U{|i54pK(t3E^^TJ{bV%8%8; z=Xz2#779b}Fcyqm?oZyISl~UkKg+{9TeHvwI|6VX;uO|#edh;~FyH3+^o|F;Jt#(- z2g0C(vZ~O(t~Te)sno%3;BZUyQNzrDW+xu6wZmKn%%A(I0p{6Nqg3n^f!m>A`$&J` zKn!D8Y!L6$k!gPuffzKwT`lnaEFWjYj>(a?I7?!W7C7Uv6@Hs`+Mv?~_lkxC}P)8Hi?8l{;Ue%?S5TAiP8a)ONxK%#O@cMd zyKFJ&ZNS8m*CyZO3QJe;*0XQjkCjY%RvGL%jmaV<*R|W<=G3t{;m?tSq`Fkd2)Ar0 zsFD$MveXD~IEzHBeN>1p*gw^Bxe|@Ygk@|#AOA@C4F7Gwj5rT+sFMBksg-z%0fP6L z=CfuK%@VAIF_;|nlh+A8o$+-^4z6HXY3eSdj3GW3y@d<+x8%7-Qp9i)6c!aVn%Zw9 zS9Y1i?lZr}22CEC_xtT1pDwO!#LdIE*&WUDv}--cV(1ds28G>^IUyJ8!eW*<1=5`& zE4t8)eM5*&MYw%dB9!Kh*xi1_+xGaaTkX)w=Jy5;*}+r(iFWAf{K09MM@-O&^v@fa z{=J6(KL5kECPkUQ8u;sGi+=-uoYSE~`P0^m-+{k3pnpNTpwZ*EcJ%Myzjoe!K>+{+ zl%L@Lrw{kLo!|Q~zbpx({pTV6)|L6)%I~G(Usk?CTkHSO+xb~I{@u#&<*i>6Av02Jf!l z{r}~9@Au7lo;`b>b7tP>otgJHGd~r13``&Z7Z(>GoKda;xF!V1|AvlcwoW|UsNdq4 zemRW$q=CEekSLg2HH8$oqA_S7t{9dpR= z@Hdbt_e-g3W>PEF+w!=b!jy11#-6{Q4{}JW)u;$d`sgBB7wDKKLB8f+a^)mciz^d( z)exijt=n?+(;&O01VBX|2iLsNw`Lx>Z4q<;^7-!^LxTLm9l3^`qm7B3oej4;#5P>L z6FSXH+IY-~ISaXGU`>;N0g8wQ%#apzCdbI!v~84I-Bl386)`3f`8_^d zH(00NbIzPquqBTSw4J8u$A(ref%zF$Q2CqyS;wx5fjkk)rvUsl-o^}3R>CJhV8 zRDqLoqABJo%A!?i`-itRN+R#)P^6Xif!sXFgc2%0?HloJ*&Z`iv$osbP-0$XQf5eb z+-3_@(PIje)^5$R-&h7{l-8*}z0w3hEoR2mp_w7-Cy=M{wj@HDuELKU>HH}WEbZoO zUySL$zkS@$S3T-teHc07!#sRHQ`W3f*vE)Nd=MtR)sjz#(FRH*Z;c6habX8*{Ho0u zt?qrvh)bg*xPTM710%~CalU#Qq?U`V_f3d`&(2P&r|BaDXbH?0@p6!l_ON*mI#N$Z z2x|N>^HPytS*z3e(hvAe#p00V9z685S1E`?z(TDJp}97s`u$EJT*TaqJ-8=1{JRV-C3~Np7%*Or*cxv-KfXvXPF&Hy#1Lxa~9~O0-d$QZz|v1yz3T2s+G` z#;lrfPV%J&o?%QFqoTFRy-`PEw2iaS{MUN&I-WektCy_xBe}tB>=;j`Kh++~t(8r_ zCfH_*<5#;Ydjb(_!CH8|NPm`{zD%t^u0%l4{3br$rnnG;-K6-&fUeWm0fV~TPkU4= z7wy|$>rMF2Nu{*gj+9~{0Gwuz-46x`f@k^%<|U~?qw-T|8PjoO&YikqaShdin#zEx?g9n9mO9wO#&@ zf;iVm>W_>p1s6LEhj_68Pe{3mkP1%n=R$W?guQx!<+!kmxZzD}ToTTzS?t}8o@Jv| zB7TvKV!!17yYzcnJI)W~sX(w5wpq?cv2aJHtBhXEmd*N)JtRB(gC*l9n(wzg*Vh<| zKl+Kl2jU&~BsaQ++c1E21*?SKZl(6(Z@*r~ICSSm`a+uHZ}=7+>&` z6&B_2@3Lbvj0y||nD~ECtP5&yS&)q*=YAS#pJm{OX4OURiY1%OZzURJ;!BxZ%5Shv zy&c?yA9=UDL&HY=UWm3=-e(#`n7fcN(DGR?qfbNjSpRoX;h%TQztPFx7WUy@p-lRs?EY!FUHrbEQ5%f4jK@H*0>!1 z>Mb9WZ=wyf`krreN!IqlD;l9QIKqCH55IF_4L(fezbY1NT!_gUeOt~q+v0_{$dB)1 
zrv+Oud>t~1-Bv@z*IwNqz$(w6;6S z(A+E8LPaX29d&g)2uvhWL!>E5D|eDCDrkGBus=Q;ka!`zWuktcV5X@w)MJ(j6r(8$%*OCT+^k2F0ljSqB7$>eN|o|z#U;X5rKE;7B(diHqRgC zb-OG`RcrgI_>(ViDxnNrQ4Ydbnsj>GELDxokZ{vvbs0~0F<#znbk!&nymG|5v|z;N zfPebss)F@3mkHLnWSzcv8T(|b@ai1XLZdfxQ0SaINk#R-R()O?4s^_N=H6?E$_h-* zNM(4W>Jk~#=IgVQa6}}mwViGQ9@*JF!fvMkA7U2RR@sGJ_?{hxBa~sS=BRguDjALU zf9OU|$OgWg%aIqehL;B{4i({#fAJ#}!4a4NOmXk8r5@7W=kvh+Nxm+20}3Iu4xpyIw%BR{<-&^2yI$~&p&h)I`3 z{TiS)RtNuQHns72OWDl%>U`#R0ot2WRzQ{Xd>wWx+N)?ah=JEP&?B6)h0WB&5Bo<_bIV8=Y15z)r3=zF7(Sz+`*A@bs+q~e?*|N_)|2S2Sl^OPmp;2` zgryp(c}hl?nDjM-0P0F?n$+5NnugEEO{i|@W1`P@`@vIv*)iMBo|>U=C>Y691WADk8FXcTVZo3YBUBffm+7UUsprMC~l=h$H!7kB_UdlG8Ape z$9kE@Uz8wkMLkoQ0}>`g-n>eMji|cB!JuUb_{=Z)ULjaAL8isn0=`+zAa^h05f~?a zDQ|45)EH~eQhIE6K&vy)`1D@zrK2^_SW@MKF@5ifuG@AE_F2I@+>I$;-Hw^#e1Y+u zFGj^SY*7IWsc&c0cOvGub}1<_&0m{_{2It&fTH=8%@j*t|3>=orTgON-!?&a0|6 zvpJIyJ*l7%0pea3>r3=yu^8*{(9X>P_FY2w5QghZK%?D0!8&M{@bWFr))wI9+~Tc) z0_3|XiO#^lMmxtKu?^3Pyu~3dIV6bCqCf;*pu1b&lZ9EY4=Z@J*w08r=9EgDwDDC^ zaFH_ohyn#7!Fo>r1`#FYd>6kdo_%aL<{@X#p{u9~3QF+zhl)GY^Rth!jf6MfLdG?N z6|=$Cof*j=w}pxBEXmPKyF%ZYtW*a!RXutHZ?r8v;w^*T>T1g`izg9^B>)2miUe=W zx992V`0|C=+sbl29_k>F^GN`8G9m6$Ck{+d{?&t|`-*T$HB>lR)@~y)v19$IXXx`0eB@X5TLQ zXp3xRe7+!lMLcpNwdaRK_sAL;$}LQ3hgZ*@)nL|r$~8U5_fCdk+u-~c&f2e(eVan= zzx*HPR^Y#<4ohQ4GqAd|qZQQR*YFWP^jWEukraErUU{%*Ca<_XiKr}?ZL`;Jw1R27 zmh}!s_khft!3f(+zC&Eko}aOdB)ibUn_;$McPw2WYCLOX;7S8Fh%)flWi5g&d1Ki% z_IwC&;7rNt@ufF7Db?{|OtlPyRI{hA;*@7So!M5*MDd8pdOtCnIL2CTp2-NB;uX&J z#{nyaT-adg`&Sk@R+947dR~EBLP_ElYj?sN9uK5B7&n8KR6=s|PO^XQ1+K5Duyjsf z?HbgtBJONK`^;; zO~Wt6l!kJ#Ud~rn@TbFtQs}@GTc31P-_f$FhM}tztv-ntda2vnatFUCCuF!#D=LRs z=K&{_FCXt6k!XsZp98w>e0UtI!kXn$DjVhVYBX8MGDTPH=G3zd(Fd$!aqz(#_S79U zI=(rn!XY#*DkKr>;Lb@SL`}Xmkzk9859SJ&QZ{Br6>(UlHbZ4kfEvErh`r`itlybY z8{^mA<3Xv#nN7;Q)^O?y z(W}`584}DNgLWNQ$K6%1c#u*Hh-QR#<`&{W0~ZAl#*hd4ZUex*3>dcKd>|D>mf3>w?{=NElOD#dAQqlJwxtJRVgCm zlJg&s<2t;-4D_wlWOAs1A85qfX1jwym!WefI`9pQ}B! z4f`%2nrNfO0#w3VqB~s147NQbf`yAqN)aKxU?#qFlq95)ixquEf#|C4?pw~GNHY7X zE}^K`9Gh5y6gvuoA3C?kPwtom2vGWbqH2uu0~upO zeCAe;bjt!s!xGS)ZhIC;FgIM3Wa^J<9cxxhSv}%e>i9rYUtLR0c|c&u7+w8i!(mJ& zo+xLWSB3|%Z-?^JkOJY|4Wt)dB7ODuO%*xvi|0@6pw4DcXG0ASdow3}l&jh_M3Al` zojvEgUCQA)jm8vLjS)$9SJfQ6C~rb70j|%xJWf@ATX+MVf=nR+e%x3zY*rCo82eC= zCp;xEb#9VXyQAuewl^Hna>KF32mDN77HZ9s1mdwBDK7XA{D#NCCp zLe%j4jR|)_QH#1kH0Nheml=n*sPUT$pE_22UQ6R{mUXP!3{@j(SQ9&!wqK35YetjAg)!ja)of7&R4c&P|i?+?--kM7w`{GNXk5Z1rn^NJ*KG zSQ|^&QEGvrm3oEtQbK<_m(_7i&D-ScC?VX~Qs9%fHN>@h6p|_rAxed{BXjHLWyPP5 zi)W**7(FodiiBjuQYH>E)=e*z#a&I^1WCLf_%4DcbecS%AYvjBE8Nxe_da;j;QQU@vZ;Wf}8x1Nv7CWU64UnOk!$O4z4IK#h zO|M?(7g6-;f66to^-sm?v=WMb{4G?-rT!oz|5Uy{%cGd5-|`&EQX!Rp^H6`PUT@%0 zZTfFv#{U0x|7-OB)5`S%{-0K?@c*ZkU#)@@wA! 
From 8076c76b26514b6b93d8ec51ffee0599ad1614ed Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:44:48 -0600 Subject: [PATCH 33/52] refactor: pythonic method to collect merged gene z-scores Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index b760041c..fd247d0d 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -803,27 +803,14 @@ async def _process( output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, ) - merged_zscore_df = pd.DataFrame() - expressed_genes: list[str] = [] - top_genes: list[str] = [] - for metric in metrics.values(): - expressed_genes.extend(metric.entrez_gene_ids) - top_genes.extend(metric.high_confidence_entrez_gene_ids) - - merged_zscore_df = ( - metric.z_score_matrix - if merged_zscore_df.empty - else merged_zscore_df.merge( - metric.z_score_matrix, - how="outer", - left_index=True, - right_index=True, - ) - ) - merged_zscore_df[merged_zscore_df.isna()] = -4 + merged_zscore_df = pd.concat([m.z_score_matrix[m.z_score_matrix.index != "-"] for m in metrics.values()], axis="columns") + merged_zscore_df.fillna(-4, inplace=True) + expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) + top_genes: list[str] = list(itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values())) # If any of the normalization metrics are not empty, write the normalized metrics to disk if not all(metric.normalization_matrix.empty for metric in metrics.values()): + merged_zscore_df: pd.DataFrame = merged_zscore_df.reindex(columns=sorted(merged_zscore_df)) merged_zscore_df.to_csv(output_zscore_normalization_filepath, index=True) logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") else: From 9bcda5f901d61c78c790f4d14a80f7b8ac6a9869 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:44:58 -0600 Subject: [PATCH 34/52] refactor: do not drop na values, keep as much data for as long as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index fd247d0d..f0c35326 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -840,7 +840,8 @@ async def _process( expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) - boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) + # TODO: 2025-OCT-31: commented out dropping entrez ids, should this be kept? 
+ # boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) boolean_matrix.to_csv(output_boolean_activity_filepath, index=False) logger.info(f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed") logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}") From 6fde5bdd49091a98d23925365abe579a5d42f8ea Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:45:12 -0600 Subject: [PATCH 35/52] refactor: use new min zfpkm peak height/distance Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index f0c35326..a3e496fe 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -861,9 +861,8 @@ async def rnaseq_gen( # noqa: C901 batch_ratio: float = 0.5, high_batch_ratio: float = 0.75, technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, - zfpkm_peak_height: float = 0.02, - zfpkm_peak_distance: float = 1.0, - zfpkm_bandwidth: float = 1.0, + zfpkm_min_peak_height: float = 0.02, + zfpkm_min_peak_distance: int = 1, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, log_level: LogLevel = LogLevel.INFO, @@ -892,9 +891,8 @@ async def rnaseq_gen( # noqa: C901 :param high_batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked "highly confident" in its expression :param technique: The filtering technique to use - :param zfpkm_peak_height: The height of the zFPKM peak - :param zfpkm_peak_distance: The distance of the zFPKM peak - :param zfpkm_bandwidth: The bandwidth of the zFPKM + :param zfpkm_min_peak_height: The height of the zFPKM peak + :param zfpkm_min_peak_distance: The distance of the zFPKM peak :param cutoff: The cutoff value to use for the provided filtering technique :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway? 
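(Illustrative aside, not part of the patch: how the renamed zFPKM keywords would appear in a call to `rnaseq_gen`; all other arguments are unchanged by this patch and are deliberately left out rather than guessed at, and the `FilteringTechnique` import location is assumed.)

    zfpkm_kwargs = dict(
        technique=FilteringTechnique.ZFPKM,
        zfpkm_min_peak_height=0.02,   # replaces zfpkm_peak_height
        zfpkm_min_peak_distance=1,    # replaces zfpkm_peak_distance; the zfpkm_bandwidth argument is removed
    )
    # await rnaseq_gen(<context and output-path arguments as before>, **zfpkm_kwargs)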
:param log_level: The level of logging to output @@ -988,8 +986,8 @@ async def rnaseq_gen( # noqa: C901 technique=technique, cut_off=cutoff, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=PeakIdentificationParameters(height=zfpkm_peak_height, distance=zfpkm_peak_distance), - bandwidth=zfpkm_bandwidth, + zfpkm_min_peak_height=zfpkm_min_peak_height, + zfpkm_min_peak_distance=zfpkm_min_peak_distance, output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, From 74ca5de85d56a18d0deb7f09ee1fa9dfa47b8a88 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:45:53 -0600 Subject: [PATCH 36/52] refactor: do not drop na values, keep as much data for as long as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 5ef58fb9..52385101 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -74,7 +74,6 @@ async def build_from_tab(cls, filepath: Path) -> _STARinformation: "second_read_transcription_strand", ], ) - df = df[~df["ensembl_gene_id"].isna()] return _STARinformation( num_unmapped=num_unmapped, num_multimapping=num_multimapping, From 666fb84b3197ec8eb070b7c8808d671786f36520 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:46:07 -0600 Subject: [PATCH 37/52] chore: remove `__main__` function Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 52385101..125e10ff 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -849,28 +849,3 @@ async def rnaseq_preprocess( cache=cache, create_gene_info_only=create_gene_info_only, ) - - -async def _main(): - context_name = "notreatment" - taxon = 9606 - como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/naiveB") - output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info_fixed.csv") - output_trna_metadata_filepath = Path("/Users/joshl/Projects/COMO/main/data/config_sheets/trna_config.xlsx") - output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv") - - await rnaseq_preprocess( - context_name=context_name, - taxon=taxon, - como_context_dir=como_context_dir, - input_matrix_filepath=None, - output_gene_info_filepath=output_gene_info_filepath, - output_trna_metadata_filepath=output_trna_metadata_filepath, - output_trna_count_matrix_filepath=output_trna_count_matrix_filepath, - cache=False, - log_level="INFO", - ) - - -if __name__ == "__main__": - asyncio.run(_main()) From 4de54212d15002109a1131b0a9c85857eace79cb Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:46:24 -0600 Subject: [PATCH 38/52] fix: convert identifiers to lowercase Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 125e10ff..a5fea646 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -359,12 +359,12 @@ async def _write_counts_matrix( """ study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) counts: list[pd.DataFrame] = await 
asyncio.gather(*[_create_sample_counts_matrix(metric) for metric in study_metrics]) - rna_specific_sample_names = set(config_df.loc[config_df["library_prep"] == rna.value, "sample_name"].tolist()) + rna_specific_sample_names = set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()) final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts) final_matrix.fillna(value=0, inplace=True) final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(np.uint64) - final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] + final_matrix = cast(pd.DataFrame, final_matrix[["ensembl_gene_id", *rna_specific_sample_names]]) output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) final_matrix.to_csv(output_counts_matrix_filepath, index=False) @@ -459,7 +459,6 @@ async def _create_config_df( # noqa: C901 fragment_label = f"{context_name}_{label}_fragment_size.txt" frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label] - print(f"HERE: {prep}") if not frag_paths and prep.lower() != RNAType.TRNA.value.lower(): logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).") mean_frag = 100.0 From b917ac1ac433fe6a7e40578d9ab25ae5dca1300b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:47:37 -0600 Subject: [PATCH 39/52] refactor: more robust data handling when creating the gene info file Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 70 ++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index a5fea646..60457fb9 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -664,15 +664,13 @@ async def _create_gene_info_file( The gene information file will be created by reading each matrix filepath in the provided list """ - async def read_counts(file: Path) -> list[str]: + async def read_ensembl_gene_ids(file: Path) -> list[str]: data = await _read_file(file, h5ad_as_df=False) - + if isinstance(data, pd.DataFrame): + data: pd.DataFrame + return data["ensembl_gene_id"].tolist() try: - conversion = await ( - ensembl_to_gene_id_and_symbol(ids=data["ensembl_gene_id"].tolist(), taxon=taxon) - if isinstance(data, pd.DataFrame) - else gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) - ) + conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) except json.JSONDecodeError: _log_and_raise_error( f"Got a JSON decode error for file '{counts_matrix_filepaths}'", @@ -681,32 +679,48 @@ async def read_counts(file: Path) -> list[str]: ) # Remove NA values from entrez_gene_id dataframe column - return conversion["entrez_gene_id"].tolist() + return conversion["ensembl_gene_id"].tolist() logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection") - genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) - gene_data = await MyGene(cache=cache).query(items=list(genes), taxon=taxon, scopes="entrezgene") + + ensembl_ids: set[str] = set(chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths]))) + gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query( + items=list(ensembl_ids), 
+ taxon=taxon, + scopes="ensemblgene", + ) gene_info: pd.DataFrame = pd.DataFrame( data=None, - columns=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"], - index=list(range(len(gene_data))), + columns=pd.Index(data=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"]), + index=pd.Index(data=list(range(len(ensembl_ids)))), ) - for i, data in enumerate(gene_data): - ensembl_ids = data.get("genomic_pos.ensemblgene", pd.NA) - if isinstance(ensembl_ids, list): - ensembl_ids = ensembl_ids[0] - - start_pos = data.get("genomic_pos.start", 0) - start_pos: int = int(sum(start_pos) / len(start_pos)) if isinstance(start_pos, list) else int(start_pos) - end_pos = data.get("genomic_pos.end", 0) - end_pos: int = int(sum(end_pos) / len(end_pos)) if isinstance(end_pos, list) else int(end_pos) - - gene_info.at[i, "gene_symbol"] = data.get("symbol", pd.NA) - gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", pd.NA) - gene_info.at[i, "ensembl_gene_id"] = ensembl_ids - gene_info.at[i, "size"] = end_pos - start_pos - gene_info.sort_values(by="ensembl_gene_id", inplace=True) + for i, data in enumerate(gene_data): + data: dict[str, str | int | list[str] | list[int] | None] + ensembl_genes: str | list[str] = cast(str | list[str], data.get("ensembl.gene", "-")) + start_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.start", 0)) + end_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.end", 0)) + + avg_start: int | float = sum(start_pos) / len(start_pos) if isinstance(start_pos, list) else start_pos + avg_end: int | float = sum(end_pos) / len(end_pos) if isinstance(end_pos, list) else end_pos + size: int = int(avg_end - avg_start) + + gene_info.at[i, "gene_symbol"] = data.get("symbol", "-") + gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", "-") + gene_info.at[i, "ensembl_gene_id"] = ",".join(ensembl_genes) if isinstance(ensembl_genes, list) else ensembl_genes + gene_info.at[i, "size"] = size if size > 0 else -1 + + gene_info["size"] = gene_info["size"].astype(str) # replace no-length values with "-" to match rows where every value is "-" + gene_info["size"] = gene_info["size"].replace("-1", "-") + gene_info = cast(pd.DataFrame, gene_info[~(gene_info == "-").all(axis=1)]) # remove rows where every value is "-" + + gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",") # extend lists into multiple rows + gene_info = gene_info.explode(column=["ensembl_gene_id"]) + gene_info["size"] = gene_info["size"].astype(int) + # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, + # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer + + gene_info: pd.DataFrame = cast(pd.DataFrame, gene_info.sort_values(by="ensembl_gene_id")) output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) logger.success(f"Gene Info file written at '{output_filepath}'") @@ -728,7 +742,7 @@ async def _process_como_input( rna=rna, ) with pd.ExcelWriter(output_config_filepath) as writer: - subset_config = config_df[config_df["library_prep"] == rna.value] + subset_config = config_df[config_df["library_prep"].str.lower() == rna.value.lower()] subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) From e5127409905b2bc39e45dadc076f48196964832b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:48:01 -0600 Subject: [PATCH 40/52] refactor: include more robust error handling for 
dataframes when collecting gene data Signed-off-by: Josh Loecker --- main/como/utils.py | 58 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/main/como/utils.py b/main/como/utils.py index 05629032..dfb9ac1e 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -135,7 +135,7 @@ async def _format_determination( async def get_missing_gene_data(values: list[str] | pd.DataFrame, taxon_id: int | str | Taxon) -> pd.DataFrame: - if isinstance(values, list): + if isinstance(values, list) and not isinstance(values, pd.DataFrame): # second isinstance required for static type check to be happy gene_type = await determine_gene_type(values) if all(v == "gene_symbol" for v in gene_type.values()): return await gene_symbol_to_ensembl_and_gene_id(values, taxon=taxon_id) @@ -144,19 +144,51 @@ async def get_missing_gene_data(values: list[str] | pd.DataFrame, taxon_id: int elif all(v == "entrez_gene_id" for v in gene_type.values()): return await gene_id_to_ensembl_and_gene_symbol(ids=values, taxon=taxon_id) else: - logger.critical("Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)") - raise ValueError("Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)") - else: - values: pd.DataFrame # Re-define type to assist in type hinting - if "gene_symbol" in values: - return await get_missing_gene_data(values["gene_symbol"].tolist(), taxon_id=taxon_id) - elif "entrez_gene_id" in values: - return await get_missing_gene_data(values["entrez_gene_id"].tolist(), taxon_id=taxon_id) - elif "ensembl_gene_id" in values: - return await get_missing_gene_data(values["ensembl_gene_id"].tolist(), taxon_id=taxon_id) + _log_and_raise_error( + message="Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)", + error=ValueError, + level=LogLevel.CRITICAL, + ) + elif isinstance(values, pd.DataFrame): + # raise error if duplicate column names exist + if any(values.columns.duplicated(keep=False)): + duplicate_cols = values.columns[values.columns.duplicated(keep=False)].unique().tolist() + _log_and_raise_error( + message=f"Duplicate column names exist! This will result in an error processing data. 
Duplicates: {','.join(duplicate_cols)}", + error=ValueError, + level=LogLevel.CRITICAL, + ) + if "gene_symbol" in itertools.chain(values.columns, [values.index.name]): + return await get_missing_gene_data( + values["gene_symbol"].tolist() if "gene_symbol" in values.columns else values.index.tolist(), + taxon_id=taxon_id, + ) + elif "entrez_gene_id" in itertools.chain(values.columns, [values.index.name]): + return await get_missing_gene_data( + values["entrez_gene_id"].tolist() if "entrez_gene_id" in values.columns else values.index.tolist(), + taxon_id=taxon_id, + ) + elif "ensembl_gene_id" in itertools.chain(values.columns, [values.index.name]): + return await get_missing_gene_data( + values["ensembl_gene_id"].tolist() if "ensembl_gene_id" in values.columns else values.index.tolist(), + taxon_id=taxon_id, + ) else: - logger.critical("Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.") - raise ValueError("Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.") + _log_and_raise_error( + message="Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.", + error=ValueError, + level=LogLevel.CRITICAL, + ) + else: + _log_and_raise_error( + message=f"Values must be a list of strings or a pandas DataFrame, got: {type(values)}", + error=TypeError, + level=LogLevel.CRITICAL, + ) + + +@overload +async def _read_file(path: None, h5ad_as_df: Literal[True] | Literal[False], **kwargs) -> None: ... @overload From 8d505f7196bc40fd174022bb7732f9794b6aece5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:48:18 -0600 Subject: [PATCH 41/52] chore(dev): expand overloaded functions to add more type paths Signed-off-by: Josh Loecker --- main/como/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main/como/utils.py b/main/como/utils.py index dfb9ac1e..84df2aac 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -192,23 +192,23 @@ async def _read_file(path: None, h5ad_as_df: Literal[True] | Literal[False], **k @overload -async def _read_file(path: None, h5ad_as_df: bool, **kwargs) -> None: ... +async def _read_file(path: pd.DataFrame, h5ad_as_df: Literal[True] | Literal[False], **kwargs) -> pd.DataFrame: ... @overload -async def _read_file(path: pd.DataFrame, h5ad_as_df: bool, **kwargs) -> pd.DataFrame: ... +async def _read_file(path: sc.AnnData, h5ad_as_df: Literal[False] = False, **kwargs) -> sc.AnnData: ... @overload -async def _read_file(path: sc.AnnData, h5ad_as_df: bool = False, **kwargs) -> sc.AnnData: ... +async def _read_file(path: sc.AnnData, h5ad_as_df: Literal[True] = True, **kwargs) -> pd.DataFrame: ... -def _num_rows(item: pd.DataFrame | npt.NDArray) -> int: - return item.shape[0] +@overload +async def _read_file(path: Path, h5ad_as_df: Literal[False] = False, **kwargs) -> pd.DataFrame | sc.AnnData: ... @overload -async def _read_file(path: sc.AnnData, h5ad_as_df: bool = True, **kwargs) -> pd.DataFrame: ... +async def _read_file(path: Path, h5ad_as_df: Literal[True] = True, **kwargs) -> pd.DataFrame: ... 
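(Illustrative aside, not part of the patch: a standalone sketch of the pattern the expanded `_read_file` overloads rely on; `typing.overload` stubs combined with `Literal` flags let a type checker narrow the return type per call site. The names below are invented.)

    from typing import Literal, overload

    @overload
    def read(path: str, as_text: Literal[True] = True) -> str: ...
    @overload
    def read(path: str, as_text: Literal[False] = False) -> bytes: ...
    def read(path: str, as_text: bool = True) -> str | bytes:
        # Single implementation; the stubs above only guide the checker, so
        # read(p, as_text=False) is typed as bytes while read(p) is typed as str.
        data = open(path, "rb").read()
        return data.decode() if as_text else data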
async def _read_file( From 448627b0910e9674116ccd03cf5b388725f25ea7 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:49:35 -0600 Subject: [PATCH 42/52] feat(dev): added ty.toml file for type hints Signed-off-by: Josh Loecker --- main/como/proteomics_gen.py | 43 ++++++++++++++++++++++++++----------- ty.toml | 16 ++++++++++++++ 2 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 ty.toml diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py index e0d9b078..caeeb8a3 100644 --- a/main/como/proteomics_gen.py +++ b/main/como/proteomics_gen.py @@ -29,16 +29,25 @@ def process_proteomics_data(path: Path) -> pd.DataFrame: """ # Preprocess data, drop na, duplicate ';' in symbol, matrix: pd.DataFrame = pd.read_csv(path) - if "gene_symbol" not in matrix.columns: + gene_symbol_colname = [col for col in matrix.columns if "symbol" in col] + if len(gene_symbol_colname) == 0: _log_and_raise_error( "No gene_symbol column found in proteomics data.", error=ValueError, level=LogLevel.ERROR, ) - + if len(gene_symbol_colname) > 1: + _log_and_raise_error( + "Multiple gene_symbol columns found in proteomics data.", + error=ValueError, + level=LogLevel.ERROR, + ) + symbol_col = gene_symbol_colname[0] + matrix = matrix.rename(columns={symbol_col: "gene_symbol"}) matrix["gene_symbol"] = matrix["gene_symbol"].astype(str) matrix.dropna(subset=["gene_symbol"], inplace=True) - matrix = matrix.assign(gene_symbol=matrix["gene_symbol"].str.split(";")).explode("gene_symbol") + matrix["gene_symbol"] = matrix["gene_symbol"].str.split(":") + matrix = matrix.explode(column=["gene_symbol"]) return matrix @@ -56,12 +65,23 @@ async def load_gene_symbol_map(gene_symbols: list[str], entrez_map: Path | None if entrez_map and entrez_map.exists(): df = pd.read_csv(entrez_map, index_col="gene_symbol") else: - biodbnet = BioDBNet() - df = await biodbnet.async_db2db( - values=gene_symbols, - input_db=Input.GENE_SYMBOL, - output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID], - ) + dataframes: list[pd.DataFrame] = [] + step_size: int = 300 + biodbnet = BioDBNet(cache=True) + biodbnet.services.settings.TIMEOUT = 60 + for i in range(0, len(gene_symbols), step_size): + # Operations: Goes from gene_symbols=["A", "B;C", "D"] -> gene.split=[["A"], ["B", "C"], ["D"]] -> chain.from_iterable["A", "B", "C", "D"] + chunk: list[str] = list(itertools.chain.from_iterable(gene.split(";") for gene in gene_symbols[i : i + step_size])) + dataframes.append( + biodbnet.db2db( + input_values=chunk, + input_db=Input.GENE_SYMBOL.value, + output_db=[Output.GENE_ID.value, Output.ENSEMBL_GENE_ID.value], + taxon=9606, + ) + ) + df = pd.concat(dataframes, axis="columns") + print(df) df.loc[df["gene_id"].isna(), ["gene_id"]] = np.nan df.to_csv(entrez_map, index_label="gene_symbol") @@ -188,7 +208,7 @@ async def proteomics_gen( high_confidence_batch_ratio: float = 0.7, quantile: int = 25, log_level: LogLevel = LogLevel.INFO, - log_location: str | TextIOWrapper = sys.stderr, + log_location: str | TextIO = sys.stderr, ): """Generate proteomics data.""" _set_up_logging(level=log_level, location=log_location) @@ -229,13 +249,12 @@ async def proteomics_gen( config_df = pd.read_excel(config_filepath, sheet_name=context_name) matrix: pd.DataFrame = process_proteomics_data(matrix_filepath) - groups = config_df["group"].unique().tolist() for group in groups: indices = np.where([g == group for g in config_df["group"]]) sample_columns = [*np.take(config_df["sample_name"].to_numpy(), indices).ravel().tolist(), "gene_symbol"] - 
matrix = matrix.loc[:, sample_columns] + matrix = cast(pd.DataFrame, matrix.loc[:, sample_columns]) symbols_to_gene_ids = await load_gene_symbol_map( gene_symbols=matrix["gene_symbol"].tolist(), diff --git a/ty.toml b/ty.toml new file mode 100644 index 00000000..a78d595f --- /dev/null +++ b/ty.toml @@ -0,0 +1,16 @@ +[environment] +root = ["main"] + +[rules] +call-non-callable = "error" +invalid-argument-type = "error" +unresolved-reference = "error" +unsupported-operator = "error" + +ambiguous-protocol-member = "warn" +deprecated = "warn" +division-by-zero = "warn" +possibly-missing-attribute = "warn" +possibly-unresolved-reference = "warn" +unsupported-base = "warn" +unused-ignore-comment = "warn" From cebab4448f6ba15b9bf5e9f6b9c4bd2f28a4bcac Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:58:54 -0600 Subject: [PATCH 43/52] fix(test): lowercase names Signed-off-by: Josh Loecker --- tests/unit/test_data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_data_types.py b/tests/unit/test_data_types.py index 3b24f9a2..45939ba8 100644 --- a/tests/unit/test_data_types.py +++ b/tests/unit/test_data_types.py @@ -3,7 +3,7 @@ def test_source_types(): """Validate that source types always go in the order of 'trna', 'mrna', 'scrna', 'protemics'.""" - expected_order = ["TRNA", "MRNA", "SCRNA", "PROTEOMICS"] + expected_order = ["trna", "mrna", "scrna", "proteomics"] for expected, source in zip(expected_order, SourceTypes, strict=True): expected: str source: SourceTypes From 37be94ba55c5432fcb582a9c22b4a301e6d23d01 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 11:00:28 -0600 Subject: [PATCH 44/52] format: ruff formatting Signed-off-by: Josh Loecker --- tests/test_proteomics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_proteomics.py b/tests/test_proteomics.py index 0f38be27..defb0fef 100644 --- a/tests/test_proteomics.py +++ b/tests/test_proteomics.py @@ -2,11 +2,11 @@ import aioftp import pytest + from como.proteomics.Crux import MZMLtoSQT, RAWtoMZML, SQTtoCSV from como.proteomics.FileInformation import FileInformation from como.proteomics.FTPManager import Download, Reader, aioftp_client from como.proteomics.proteomics_preprocess import ParseCSVInput, PopulateInformation - from tests.fixtures.fixture_ftp_server import fixture_ftp_server, ftp_file_names From a19536bd7cf1cf32c08731137d7289e22cbfc066 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:14:08 -0600 Subject: [PATCH 45/52] feat(test): added `approx` tests Signed-off-by: Josh Loecker --- tests/unit/test_approx.py | 237 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 tests/unit/test_approx.py diff --git a/tests/unit/test_approx.py b/tests/unit/test_approx.py new file mode 100644 index 00000000..772dbc77 --- /dev/null +++ b/tests/unit/test_approx.py @@ -0,0 +1,237 @@ +import numpy as np +import pytest + +from como.approx import _coerce_to_float_array, _regularize_values, approx + + +class TestCoerceToFloatArray: + """Tests for the _coerce_to_float_array helper function.""" + + def test_converts_int_list_to_array(self): + result = _coerce_to_float_array([1, 2, 3]) + assert isinstance(result, np.ndarray) + assert result.dtype == float + np.testing.assert_array_equal(result, [1.0, 2.0, 3.0]) + + def test_converts_float_list_to_array(self): + result = _coerce_to_float_array([1.0, 2.0, 3.0]) + assert isinstance(result, np.ndarray) + assert result.dtype == float + 
np.testing.assert_array_equal(result, [1.0, 2.0, 3.0]) + + +class TestRegularizeValues: + """Tests for the _regularize_values function.""" + + def test_removes_na_pairs_when_na_rm_true(self): + x: list[float] = np.array([1.0, 2.0, np.nan, 4.0]) + y: list[float] = np.array([10.0, np.nan, 30.0, 40.0]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + np.testing.assert_array_equal(result.x, [1.0, 4.0]) + np.testing.assert_array_equal(result.y, [10.0, 40.0]) + + def test_raises_error_with_na_in_x_when_na_rm_false(self): + x: list[float] = np.array([1.0, np.nan, 3.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + with pytest.raises(ValueError, match="NA values in x are not allowed"): + _regularize_values(x, y, na_rm=False, ties="mean") + + def test_sorts_by_x(self): + x: list[float] = np.array([3.0, 1.0, 2.0]) + y: list[float] = np.array([30.0, 10.0, 20.0]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + np.testing.assert_array_equal(result.x, [1.0, 2.0, 3.0]) + np.testing.assert_array_equal(result.y, [10.0, 20.0, 30.0]) + + def test_aggregates_duplicates_with_mean(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [15.0, 30.0]) + + def test_aggregates_duplicates_with_first(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties="first") + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [10.0, 30.0]) + + def test_aggregates_duplicates_with_last(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties="last") + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [20.0, 30.0]) + + def test_handles_empty_arrays(self): + x: list[float] = np.array([]) + y: list[float] = np.array([]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + assert result.x.size == 0 + assert result.y.size == 0 + assert result.not_na.size == 0 + + def test_callable_ties_function(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties=np.sum) + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [30.0, 30.0]) + + +class TestApprox: + """Tests for the main approx function.""" + + def test_basic_linear_interpolation(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5, 2.5] + result = approx(x, y, xout=xout) + np.testing.assert_array_almost_equal(result.y, [15.0, 25.0]) + + def test_one_argument_form(self): + y: list[float] = [10.0, 20.0, 30.0] + result = approx(y, xout=[1.5, 2.5]) + np.testing.assert_array_almost_equal(result.y, [15.0, 25.0]) + + def test_default_n_points(self): + x: list[float] = [1.0, 5.0] + y: list[float] = [10.0, 50.0] + result = approx(x, y) + assert len(result.x) == 50 + assert len(result.y) == 50 + + def test_custom_n_points(self): + x: list[float] = [1.0, 5.0] + y: list[float] = [10.0, 50.0] + result = approx(x, y, n=10) + assert len(result.x) == 10 + + def test_extrapolation_rule_1(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 
3.5] + result = approx(x, y, xout=xout, rule=1) + assert np.isnan(result.y[0]) + assert np.isnan(result.y[1]) + + def test_extrapolation_rule_2(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 3.5] + result = approx(x, y, xout=xout, rule=2) + assert result.y[0] == 10.0 + assert result.y[1] == 30.0 + + def test_yleft_yright_parameters(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 3.5] + result = approx(x, y, xout=xout, yleft=5.0, yright=35.0) + assert result.y[0] == 5.0 + assert result.y[1] == 35.0 + + def test_constant_method_f_0(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, method="constant", f=0.0) + assert result.y[0] == 10.0 + + def test_constant_method_f_1(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, method="constant", f=1.0) + assert result.y[0] == 20.0 + + def test_constant_method_f_05(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, method="constant", f=0.5) + assert result.y[0] == 15.0 + + def test_exact_match_returns_exact_value(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [2.0] + result = approx(x, y, xout=xout) + assert result.y[0] == 20.0 + + def test_na_in_xout_returns_nan(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [np.nan, 2.0] + result = approx(x, y, xout=xout) + assert np.isnan(result.y[0]) + assert result.y[1] == 20.0 + + def test_method_numeric_codes(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + xout: list[float] = [1.5] + result1 = approx(x, y, xout=xout, method=1) + result2 = approx(x, y, xout=xout, method=2, f=0.0) + assert result1.y[0] == 15.0 + assert result2.y[0] == 10.0 + + def test_raises_error_different_length_xy(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0] + with pytest.raises(ValueError, match="x and y must have same length"): + approx(x, y) + + def test_raises_error_invalid_method(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + with pytest.raises(ValueError, match="invalid interpolation method"): + approx(x, y, method="invalid") + + def test_raises_error_invalid_method_code(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + with pytest.raises(ValueError, match="invalid interpolation method"): + approx(x, y, method=3) + + def test_raises_error_invalid_f(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + with pytest.raises(ValueError, match="invalid f value"): + approx(x, y, method="constant", f=2.0) + + def test_raises_error_need_two_points_for_linear(self): + x: list[float] = [1.0] + y: list[float] = [10.0] + with pytest.raises(ValueError, match="need at least two non-NA values"): + approx(x, y, method="linear") + + def test_raises_error_zero_points(self): + x: list[float] = [] + y: list[float] = [] + with pytest.raises(ValueError, match="zero non-NA points"): + approx(x, y, method="constant") + + def test_handles_ties_mean(self): + x: list[float] = [1.0, 1.0, 2.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, ties="mean") + assert result.y[0] == 22.5 + + def test_rule_as_list(self): + 
x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 3.5] + result = approx(x, y, xout=xout, rule=[1, 2]) + assert np.isnan(result.y[0]) + assert result.y[1] == 30.0 + + def test_na_rm_false_with_na_in_y(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, np.nan, 30.0] + xout: list[float] = [2.5] + result = approx(x, y, xout=xout, na_rm=False) + # After filtering out NA, should interpolate between 1.0->10.0 and 3.0->30.0 + assert result.y[0] == 25.0 From 0891f010066f7bfd42057636c8795ff924847371 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:14:15 -0600 Subject: [PATCH 46/52] feat(test): added `density` tests Signed-off-by: Josh Loecker --- tests/unit/test_density.py | 226 +++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 tests/unit/test_density.py diff --git a/tests/unit/test_density.py b/tests/unit/test_density.py new file mode 100644 index 00000000..c4d88721 --- /dev/null +++ b/tests/unit/test_density.py @@ -0,0 +1,226 @@ +from typing import Literal, cast + +import numpy as np +import numpy.typing as npt +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from como.density import DensityResult, bin_distance, density, dnorm, nrd0 + +KERNEL_TYPE = Literal["gaussian", "epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"] + + +class TestBinDistance: + def test_basic_binning(self): + x: npt.NDArray[float] = np.array([0.5, 1.5, 2.5]) + weights: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=3, n=4) + + assert result.shape == (8,) + assert np.all(result >= 0) + + def test_weighted_binning(self): + x: npt.NDArray[float] = np.array([1.0, 2.0]) + weights: npt.NDArray[float] = np.array([2.0, 1.0]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=3, n=4) + + assert result.shape == (8,) + assert result.sum() > 0 + + def test_empty_array(self): + x: npt.NDArray[float] = np.array([]) + weights: npt.NDArray[float] = np.array([]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=1, n=2) + + assert result.shape == (4,) + assert_array_equal(result, np.zeros(4)) + + def test_out_of_bounds_handling(self): + x: npt.NDArray[float] = np.array([10.0]) + weights: npt.NDArray[float] = np.array([1.0]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=1, n=2) + + assert result.shape == (4,) + + +class TestDnorm: + def test_standard_normal(self): + # dnorm(0) for standard normal should be ~0.3989 + result: float = dnorm(0.0, mean=0.0, sd=1.0) + expected: float = 1.0 / np.sqrt(2 * np.pi) + assert_allclose(result, expected, rtol=1e-10) + + def test_with_mean_and_sd(self): + result: float = dnorm(5.0, mean=5.0, sd=2.0) + expected: float = 1.0 / (2.0 * np.sqrt(2 * np.pi)) + assert_allclose(result, expected, rtol=1e-10) + + def test_log_density(self): + result: float = dnorm(0.0, mean=0.0, sd=1.0, log=True) + expected: float = np.log(1.0 / np.sqrt(2 * np.pi)) + assert_allclose(result, expected, rtol=1e-10) + + def test_nan_inputs(self): + assert np.isnan(dnorm(np.nan)) + assert np.isnan(dnorm(0.0, mean=np.nan)) + assert np.isnan(dnorm(0.0, sd=np.nan)) + + def test_negative_sd(self): + result: float = dnorm(0.0, sd=-1.0) + assert np.isnan(result) + + def test_infinite_sd(self): + result: float = dnorm(0.0, sd=np.inf) + assert result == 0.0 or result == -np.inf + + def test_zero_sd_at_mean(self): + result: float = dnorm(5.0, mean=5.0, 
sd=0.0) + assert np.isinf(result) + + def test_zero_sd_away_from_mean(self): + result: float = dnorm(5.0, mean=3.0, sd=0.0, log=False) + assert result == 0.0 + + def test_fast_dnorm_flag(self): + result_fast: float = dnorm(1.0, fast_dnorm=True) + result_slow: float = dnorm(1.0, fast_dnorm=False) + assert_allclose(result_fast, result_slow, rtol=1e-5) + + def test_large_values(self): + result: float = dnorm(100.0, mean=0.0, sd=1.0) + assert result >= 0.0 + assert result < 1e-10 + + +class TestNrd0: + def test_basic_calculation(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: float = nrd0(x) + assert result > 0 + assert np.isfinite(result) + + def test_with_constant_values(self): + x: npt.NDArray[float] = np.array([5.0, 5.0, 5.0, 5.0]) + result: float = nrd0(x) + assert result > 0 + assert np.isfinite(result) + + def test_single_nonzero_value(self): + x: npt.NDArray[float] = np.array([0.0, 7.0]) + result: float = nrd0(x) + assert result > 0 + + def test_all_zeros(self): + # if the input array is changed from a shape of `(3,)`, the result will change + # this is because `nrd0` takes the length of the input into account when calculating bandwidth + x: npt.NDArray[float] = np.array([0.0, 0.0, 0.0], dtype=float) + result: float = nrd0(x) + assert result == 0.7224674055842076 + + def test_insufficient_data(self): + with pytest.raises(ValueError, match="need at least 2 data points"): + nrd0(np.array([1.0])) + + def test_empty_array(self): + with pytest.raises(ValueError, match="need at least 2 data points"): + nrd0(np.array([])) + + +class TestDensity: + def test_basic_density(self): + x: npt.NDArray[float] = np.random.normal(0, 1, 100) + result: DensityResult = density(x) + + assert isinstance(result, DensityResult) + assert len(result.x) == 512 + assert len(result.y) == 512 + assert result.bw > 0 + assert np.all(result.y >= 0) + + def test_custom_bandwidth(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: DensityResult = density(x, bw=0.5) + + assert_allclose(result.bw, 0.5, rtol=1e-10) + + def test_with_weights(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + weights: npt.NDArray[float] = np.array([0.1, 0.2, 0.4, 0.2, 0.1]) + result: DensityResult = density(x, weights=weights, n=5) + + assert len(result.x) == 5 + assert np.all(result.y >= 0) + + def test_custom_grid_range(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: DensityResult = density(x, from_=0, to_=6, n=100) + assert result.x[0] == 0 + assert result.x[-1] == 6 + assert len(result.x) == 100 + + def test_different_kernels(self): + x: npt.NDArray[float] = np.random.normal(0, 1, 50) + + for kernel in ["epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"]: + with pytest.raises(NotImplementedError, match=f"Only 'gaussian' kernel is implemented; got '{kernel}'"): + density(x, kernel=cast(KERNEL_TYPE, kernel)) + + def test_kernel_only_mode(self): + result: npt.NDArray[float] = density([1, 2, 3], kernel="gaussian", kernel_only=True) + expected: float = 1 / (2 * np.sqrt(np.pi)) + assert isinstance(result, float) + assert_allclose(result, expected, rtol=1e-10) + + def test_na_removal(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.nan, 4.0, 5.0]) + result: DensityResult = density(x, remove_na=True) + + assert len(result.x) == 512 + assert np.all(np.isfinite(result.y)) + + def test_na_without_removal(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.nan, 4.0, 5.0]) + + with 
pytest.raises(ValueError, match="NA values found"): + density(x, remove_na=False) + + def test_insufficient_data_for_nrd0(self): + with pytest.raises(ValueError, match="at least two points"): + density([1.0], bw="nrd0") + + def test_adjust_parameter(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result1: DensityResult = density(x, bw=1.0, adjust=1.0) + result2: DensityResult = density(x, bw=1.0, adjust=2.0) + + assert_allclose(result2.bw, 2.0 * result1.bw, rtol=1e-10) + + def test_invalid_bandwidth_string(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + + with pytest.raises(TypeError, match="must be a number or 'nrd0'"): + density(x, bw="invalid") + + def test_negative_weights(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + weights: npt.NDArray[float] = np.array([1.0, -1.0, 1.0]) + + with pytest.raises(ValueError, match="Negative values found"): + density(x, weights=weights) + + def test_infinite_values_in_x(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.inf, 4.0, 5.0]) + result: DensityResult = density(x) + + assert np.all(np.isfinite(result.y)) + + def test_result_named_tuple_fields(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: DensityResult = density(x, n=100) + + assert hasattr(result, "x") + assert hasattr(result, "y") + assert hasattr(result, "x_grid") + assert hasattr(result, "y_grid") + assert hasattr(result, "bw") + assert hasattr(result, "n") From 330fc445639ecca62e697df4cd40daf4954e1683 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:14:29 -0600 Subject: [PATCH 47/52] feat(test): added `find_peaks` tests Signed-off-by: Josh Loecker --- tests/unit/test_peak_finder.py | 204 +++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 tests/unit/test_peak_finder.py diff --git a/tests/unit/test_peak_finder.py b/tests/unit/test_peak_finder.py new file mode 100644 index 00000000..7d2d730d --- /dev/null +++ b/tests/unit/test_peak_finder.py @@ -0,0 +1,204 @@ +import numpy as np +import numpy.typing as npt +import pandas as pd +import pytest + +from como.peak_finder import ( + _encode_signs, + _enforce_minimum_peak_distance, + _validate_args, + find_peaks, +) + + +class TestValidateArgs: + def test_multidimensional_array_raises_error(self): + x: npt.NDArray[float] = np.array([[1, 2], [3, 4]]) + with pytest.raises(ValueError, match="Expected a 1D array, got 2D array instead"): + _validate_args(x, 1, 1, "0", 0.0, 1, 0.0) + + def test_nan_values_raise_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.nan, 4.0]) + with pytest.raises(ValueError, match="Input x contains NaNs"): + _validate_args(x, 1, 1, "0", 0.0, 1, 0.0) + + def test_negative_nups_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'nups' must be non-negative"): + _validate_args(x, -1, 1, "0", 0.0, 1, 0.0) + + def test_negative_ndowns_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'ndowns' must be non-negative"): + _validate_args(x, 1, -1, "0", 0.0, 1, 0.0) + + def test_invalid_zero_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'zero' must be '0', '\\+', or '-'"): + _validate_args(x, 1, 1, "x", 0.0, 1, 0.0) # type: ignore[invalid-type-argument] + + def test_negative_min_peak_height_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with 
pytest.raises(ValueError, match="Argument 'min_peak_height' must be non-negative"): + _validate_args(x, 1, 1, "0", -1.0, 1, 0.0) + + def test_negative_min_peak_distance_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'minpeakdistance' must be non-negative"): + _validate_args(x, 1, 1, "0", 0.0, -1, 0.0) + + def test_negative_threshold_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'threshold' must be non-negative"): + _validate_args(x, 1, 1, "0", 0.0, 1, -1.0) + + def test_valid_args_does_not_raise(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + _validate_args(x, 1, 1, "0", 0.0, 1, 0.0) + + +class TestEncodeSigns: + def test_increasing_sequence(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0]) + result: str = _encode_signs(x, "0") + assert result == "+++" + + def test_decreasing_sequence(self): + x: npt.NDArray[float] = np.array([4.0, 3.0, 2.0, 1.0]) + result: str = _encode_signs(x, "0") + assert result == "---" + + def test_flat_sequence_with_zero(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "0") + assert result == "00" + + def test_flat_sequence_with_plus(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "+") + assert result == "++" + + def test_flat_sequence_with_minus(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "-") + assert result == "--" + + def test_flat_sequence_with_dollarsign(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "$") + assert result == "$$" + + def test_mixed_sequence(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 2.0, 3.0, 2.0]) + result: str = _encode_signs(x, "0") + assert result == "+0+-" + + +class TestEnforceMinimumPeakDistance: + def test_inplace_removes_close_peaks(self): + df: pd.DataFrame = pd.DataFrame( + { + "height": [10.0, 8.0, 5.0], + "peak_idx": [0, 2, 10], + "start_idx": [0, 1, 9], + "end_idx": [1, 3, 11], + } + ) + _enforce_minimum_peak_distance(df, min_peak_distance=5, inplace=True) + assert len(df) == 2 + assert df["peak_idx"].tolist() == [0, 10] + + def test_not_inplace_returns_new_dataframe(self): + df: pd.DataFrame = pd.DataFrame( + { + "height": [10.0, 8.0, 5.0], + "peak_idx": [0, 2, 10], + "start_idx": [0, 1, 9], + "end_idx": [1, 3, 11], + } + ) + result = _enforce_minimum_peak_distance(df, min_peak_distance=5, inplace=False) + assert len(result) == 2 + assert len(df) == 3 # original unchanged + assert result["peak_idx"].tolist() == [0, 10] + + def test_all_peaks_sufficiently_spaced(self): + df: pd.DataFrame = pd.DataFrame( + { + "height": [10.0, 8.0], + "peak_idx": [0, 10], + "start_idx": [0, 9], + "end_idx": [1, 11], + } + ) + _enforce_minimum_peak_distance(df, min_peak_distance=5, inplace=True) + assert len(df) == 2 + + +class TestFindPeaks: + def test_simple_peak(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 1 + assert result.iloc[0]["peak_idx"] == 3 + assert result.iloc[0]["height"] == 3.0 + + def test_multiple_peaks(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 3 + + def test_no_peaks(self): + x: npt.NDArray[float] = 
np.asarray([1.0, 2.0, 3.0, 4.0, 5.0], dtype=float) + result: pd.DataFrame = find_peaks(x) + assert len(result) == 0 + assert list(result.columns) == ["height", "peak_idx", "start_idx", "end_idx"] + + def test_min_peak_height_filter(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 0.0, 5.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=2.0) + assert len(result) == 1 + assert result.iloc[0]["height"] == 5.0 + + def test_nups_and_ndowns(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, nups=2, ndowns=2, min_peak_height=0.0) + assert len(result) == 1 + + def test_npeaks_limits_output(self): + x: npt.NDArray[float] = np.asarray([0.0, 5.0, 0.0, 3.0, 0.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, npeaks=2, min_peak_height=0.0) + assert len(result) == 2 + + def test_sortstr_orders_by_height(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 0.0, 5.0, 0.0, 3.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, sortstr=True, min_peak_height=0.0) + heights = result["height"].tolist() + assert heights == sorted(heights, reverse=True) + + def test_min_peak_distance(self): + x: npt.NDArray[float] = np.asarray([0.0, 10.0, 0.0, 8.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_distance=3, min_peak_height=0.0) + assert len(result) == 1 + assert result.iloc[0]["height"] == 10.0 + + def test_threshold_filter(self): + x: npt.NDArray[float] = np.asarray([1.0, 2.0, 1.5, 5.0, 1.0], dtype=float) + result: pd.DataFrame = find_peaks(x, threshold=2.0, min_peak_height=0.0) + assert len(result) == 1 + assert result.iloc[0]["height"] == 5.0 + + def test_accepts_list_input(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 1 + + def test_accepts_numpy_array(self): + x: npt.NDArray[float] = np.array([0.0, 1.0, 2.0, 1.0, 0.0]) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 1 + + def test_custom_peak_pattern(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, peak_pattern=r"[+]{3,}[-]{3,}", min_peak_height=0.0) + assert len(result) == 1 From 46178635ef66cec27df5d34afa9f93998e13d20f Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:15:41 -0600 Subject: [PATCH 48/52] feat(dev): use named tuples for return types Signed-off-by: Josh Loecker --- main/como/approx.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 3ef69a74..7bc8f53a 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -1,9 +1,22 @@ from __future__ import annotations from collections.abc import Callable, Sequence -from typing import Any +from typing import Literal, NamedTuple import numpy as np +import numpy.typing as npt + + +class RegularizedArray(NamedTuple): + x: npt.NDArray[np.number] + y: npt.NDArray[np.number] + not_na: npt.NDArray[bool] + kept_na: bool + + +class Approx(NamedTuple): + x: npt.NDArray[float] + y: npt.NDArray[float] def _coerce_to_float_array(a): @@ -39,8 +52,7 @@ def _regularize_values(x: np.ndarray, y: np.ndarray, ties, na_rm: bool) -> dict[ kept_na = np.isnan(y).any() if x.size == 0: - # THIS IS THE CORRECTED LINE: - return {"x": x, "y": y, "not_na": np.array([], dtype=bool), "kept_na": kept_na} + return 
RegularizedArray(x=x, y=y, not_na=np.array([], dtype=bool), kept_na=kept_na) # Use a stable sort (mergesort) to match R's order() order = np.argsort(x, kind="mergesort") @@ -88,6 +100,7 @@ def agg_fn(a): # Check if any NaNs remain in the *aggregated* y values kept_na_final = np.any(~not_na) if np.any(np.isnan(y_agg)) else False return {"x": unique_x, "y": y_agg, "not_na": not_na, "kept_na": kept_na_final} + return RegularizedArray(x=unique_x, y=y_agg, not_na=not_na, kept_na=kept_na_final) def approx( @@ -102,7 +115,7 @@ def approx( f: float = 0.0, ties: str | Callable[[np.ndarray], float] = "mean", na_rm: bool = True, -) -> dict[str, np.ndarray]: +) -> Approx: """Faithful Python port of R's `approx` function. This implementation aims to replicate the behavior of R's `approx` @@ -133,7 +146,7 @@ def approx( are propagated. Returns: - dict with: + `Approx` class with: - 'x': numpy array of xout used - 'y': numpy array of interpolated values """ @@ -179,6 +192,7 @@ def approx( x_reg = r["x"] y_reg = r["y"] not_na_mask = r["not_na"] + r: RegularizedArray = _regularize_values(x_arr, y_arr, na_rm=na_rm, ties=ties) # no_na is True if we don't have to worry about NAs in y_reg no_na = na_rm or (not r["kept_na"]) # nx is the number of *valid* (non-NA) points for interpolation @@ -307,4 +321,4 @@ def approx( res[mid_mask] = res_mid yout[mask_valid] = res - return {"x": xout_arr, "y": yout} + return Approx(x=xout_arr, y=yout) From 688ad8ed282873cb1101e88aece23e78a2c121a5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:17:34 -0600 Subject: [PATCH 49/52] style: update docstrings and types Signed-off-by: Josh Loecker --- main/como/approx.py | 192 +++++++++++++++++++++++--------------------- 1 file changed, 101 insertions(+), 91 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 7bc8f53a..14c69e3e 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -19,25 +19,46 @@ class Approx(NamedTuple): y: npt.NDArray[float] -def _coerce_to_float_array(a): - """Helper to ensure input is a 1D float array.""" +def _coerce_to_float_array(a: Sequence[int | float | np.number]): + """Coerce input to a 1D float array. + + Args: + a: the array to coerce + + Returns: + A floating point 1D array. + """ arr = np.asarray(a, dtype=float) if arr.ndim != 1: arr = arr.ravel() return arr -def _regularize_values(x: np.ndarray, y: np.ndarray, ties, na_rm: bool) -> dict[str, Any]: - """Minimal reimplementation of R's regularize.values() used by approx(). +def _regularize_values( + x: npt.NDArray[np.number], + y: npt.NDArray[np.number], + *, + na_rm: bool, + ties: Callable[[np.ndarray], float] | Literal["mean", "first", "last", "min", "max", "median", "sum"] = "mean", +) -> RegularizedArray: + """Reimplementation of R's regularize.values() used by approx(). - Removes NA pairs if na_rm is True (else requires x to have no NA). - Sorts by x (stable). - Collapses duplicate x via `ties` aggregator. - Returns dict with: - x: sorted unique x - y: aggregated y aligned with x - not_na: boolean mask of y that are not NaN - kept_na: True iff any NaN remained in y after regularization + + Args: + x: the x values to regularize + y: ties: the corresponding y values + na_rm: should NA values be removed? 
+ ties: how to handle duplicate x values; can be a string or a callable + + Returns: + A NamedTuple with: + - x: sorted unique x + - y: aggregated y aligned with x + - not_na: boolean mask of y that are not NaN + - kept_na: True iff any NaN remained in y after regularization """ if na_rm: # Remove pairs where x or y is NA @@ -98,8 +119,7 @@ def agg_fn(a): not_na = ~np.isnan(y_agg) # Check if any NaNs remain in the *aggregated* y values - kept_na_final = np.any(~not_na) if np.any(np.isnan(y_agg)) else False - return {"x": unique_x, "y": y_agg, "not_na": not_na, "kept_na": kept_na_final} + kept_na_final = bool(np.any(~not_na) if np.any(np.isnan(y_agg)) else False) return RegularizedArray(x=unique_x, y=y_agg, not_na=not_na, kept_na=kept_na_final) @@ -111,9 +131,9 @@ def approx( n: int = 50, yleft: float | None = None, yright: float | None = None, - rule: int | Sequence[int] = 1, + rule: int | Sequence[int] | npt.NDArray[int] = 1, f: float = 0.0, - ties: str | Callable[[np.ndarray], float] = "mean", + ties: Callable[[np.ndarray], float] | Literal["mean", "first", "last", "min", "max", "median", "sum"] = "mean", na_rm: bool = True, ) -> Approx: """Faithful Python port of R's `approx` function. @@ -127,23 +147,15 @@ def approx( y: y-coordinates. If None, `x` is treated as `y` and `x` becomes `1..n`. xout: Points at which to interpolate. method: Interpolation method. "linear" (1) or "constant" (2). - n: If `xout` is not provided, interpolation happens at `n` - equally spaced points spanning the range of `x`. - yleft: Value to use for extrapolation to the left. - Defaults to `NA` (np.nan) if `rule` is 1. - yright: Value to use for extrapolation to the right. - Defaults to `NA` (np.nan) if `rule` is 1. + n: If `xout` is not provided, interpolation happens at `n` equally spaced points spanning the range of `x`. + yleft: Value to use for extrapolation to the left. Defaults to `NA` (np.nan) if `rule` is 1. + yright: Value to use for extrapolation to the right. Defaults to `NA` (np.nan) if `rule` is 1. rule: Extrapolation rule. - 1: Return `np.nan` for points outside the `x` range. - 2: Use `yleft` and `yright` for points outside the range. - f: For `method="constant"`, determines the split point. - `f=0` is left-step, `f=1` is right-step, `f=0.5` is midpoint. - ties: How to handle duplicate `x` values. - Can be 'mean', 'first', 'last', 'min', 'max', 'median', 'sum', - or a callable function. - na_rm: If True, `NA` pairs are removed before interpolation. - If False, `NA`s in `x` cause an error, `NA`s in `y` - are propagated. + f: For `method="constant"`, determines the split point. `f=0` is left-step, `f=1` is right-step, `f=0.5` is midpoint. + ties: How to handle duplicate `x` values. Can be 'mean', 'first', 'last', 'min', 'max', 'median', 'sum', or a callable function. + na_rm: If True, `NA` pairs are removed before interpolation. If False, `NA`s in `x` cause an error, `NA`s in `y` are propagated. 
Returns: `Approx` class with: @@ -177,26 +189,24 @@ def approx( raise ValueError("invalid interpolation method") # --- Rule normalization --- - if isinstance(rule, list | tuple | np.ndarray): - rlist = list(rule) - if not (1 <= len(rlist) <= 2): + if isinstance(rule, Sequence | np.ndarray): + rule_list: list[int] = list(rule) # type: ignore[invalid-argument-type] # This is a valid argument type, not sure why ty is upset + if not (1 <= len(rule_list) <= 2): raise ValueError("`rule` must have length 1 or 2") - if len(rlist) == 1: - rlist = [rlist[0], rlist[0]] + if len(rule_list) == 1: + rule_list = [rule_list[0], rule_list[0]] else: - rlist = [rule, rule] + rule_list = [rule, rule] - # --- Regularize values --- # Sort by x, collapse ties, and handle NAs like R's regularize.values() - r = _regularize_values(x_arr, y_arr, ties, na_rm) - x_reg = r["x"] - y_reg = r["y"] - not_na_mask = r["not_na"] r: RegularizedArray = _regularize_values(x_arr, y_arr, na_rm=na_rm, ties=ties) + x_reg: npt.NDArray[float] = r.x + y_reg: npt.NDArray[float] = r.y + not_na_mask: npt.NDArray[bool] = r.not_na # no_na is True if we don't have to worry about NAs in y_reg - no_na = na_rm or (not r["kept_na"]) + no_na: bool = na_rm or (not r.kept_na) # nx is the number of *valid* (non-NA) points for interpolation - nx = x_reg.size if no_na else int(np.count_nonzero(not_na_mask)) + nx: int = x_reg.size if no_na else int(np.count_nonzero(not_na_mask)) if np.isnan(nx): raise ValueError("invalid length(x)") @@ -208,98 +218,98 @@ def approx( if nx == 0: raise ValueError("zero non-NA points") - # --- Set extrapolation values (yleft/yright) --- + # set extrapolation values (yleft/yright) # This logic matches R's C code (R_approxtest) if no_na: - y_first = y_reg[0] - y_last = y_reg[-1] + y_first: float = y_reg[0] + y_last: float = y_reg[-1] else: # Find first and last non-NA y values - y_valid = y_reg[not_na_mask] - y_first = y_valid[0] - y_last = y_valid[-1] + y_valid: npt.NDArray[float] = y_reg[not_na_mask] + y_first: float = y_valid[0] + y_last: float = y_valid[-1] - yleft_val = (np.nan if int(rlist[0]) == 1 else y_first) if yleft is None else float(yleft) - yright_val = (np.nan if int(rlist[1]) == 1 else y_last) if yright is None else float(yright) + yleft_val: float = (np.nan if int(rule_list[0]) == 1 else y_first) if yleft is None else float(yleft) + yright_val: float = (np.nan if int(rule_list[1]) == 1 else y_last) if yright is None else float(yright) - # --- Fabricate xout if missing --- + # fabricate xout if it is missing if xout is None: if n <= 0: raise ValueError("'approx' requires n >= 1") if no_na: - x_first = x_reg[0] - x_last = x_reg[nx - 1] + x_first: float = x_reg[0] + x_last: float = x_reg[nx - 1] else: - x_valid = x_reg[not_na_mask] - x_first = x_valid[0] - x_last = x_valid[-1] - xout_arr = np.linspace(x_first, x_last, num=int(n), dtype=float) + x_valid: npt.NDArray[float] = x_reg[not_na_mask] + x_first: float = x_valid[0] + x_last: float = x_valid[-1] + xout_arr: npt.NDArray[float] = np.linspace(x_first, x_last, num=int(n), dtype=float) else: - xout_arr = _coerce_to_float_array(xout) + xout_arr: npt.NDArray[float] = _coerce_to_float_array(xout) - # --- C_ApproxTest checks --- + # replicate R's C_ApproxTest checks if method_code == 2 and (not np.isfinite(f) or f < 0.0 or f > 1.0): - raise ValueError("approx(): invalid f value") + raise ValueError("approx(): invalid f value; with `method=2`, `f` must be in the range [0,1] (inclusive) or NA") if not no_na: # R re-filters x and y here if NAs remained - x_reg = 
x_reg[not_na_mask] - y_reg = y_reg[not_na_mask] + x_reg: npt.NDArray[float] = x_reg[not_na_mask] + y_reg: npt.NDArray[float] = y_reg[not_na_mask] - # --- Interpolation --- - # This section vectorized the logic from R's approx1 and R_approxfun - yout = np.empty_like(xout_arr, dtype=float) - mask_nan_xout = np.isnan(xout_arr) + # perform interpolation + # this section is a vectorized form of the logic from R's approx1 and R_approxfun + yout: npt.NDArray[float] = np.empty_like(xout_arr, dtype=float) + mask_nan_xout: npt.NDArray[bool] = np.isnan(xout_arr) yout[mask_nan_xout] = np.nan - mask_valid = ~mask_nan_xout + mask_valid: npt.NDArray[bool] = ~mask_nan_xout if x_reg.size == 0: # No valid points to interpolate against yout[mask_valid] = np.nan - return {"x": xout_arr, "y": yout} + return Approx(x=xout_arr, y=yout) - xv = xout_arr[mask_valid] - left_mask = xv < x_reg[0] - right_mask = xv > x_reg[-1] - mid_mask = ~(left_mask | right_mask) + xv: npt.NDArray[float] = xout_arr[mask_valid] + left_mask: npt.NDArray[bool] = xv < x_reg[0] + right_mask: npt.NDArray[bool] = xv > x_reg[-1] + mid_mask: npt.NDArray[bool] = ~(left_mask | right_mask) - res = np.empty_like(xv) + res: npt.NDArray[float] = np.empty_like(xv, dtype=float) res[left_mask] = yleft_val res[right_mask] = yright_val if np.any(mid_mask): - xm = xv[mid_mask] + xm: npt.NDArray[float] = xv[mid_mask] # Find indices # j_right[k] = index of first x_reg > xm[k] - j_right = np.searchsorted(x_reg, xm, side="right") + j_right: npt.NDArray[int] = np.searchsorted(x_reg, xm, side="right") # j_left[k] = index of first x_reg >= xm[k] - j_left = np.searchsorted(x_reg, xm, side="left") + j_left: npt.NDArray[int] = np.searchsorted(x_reg, xm, side="left") # Points that exactly match an x_reg value - eq_mask = j_left != j_right + eq_mask: npt.NDArray[bool] = j_left != j_right # Points that fall between x_reg values - in_interval_mask = ~eq_mask + in_interval_mask: npt.NDArray[bool] = ~eq_mask - res_mid = np.empty_like(xm) + res_mid: npt.NDArray[float] = np.empty_like(xm, dtype=float) if np.any(eq_mask): # For exact matches, use the corresponding y_reg value # R's C code uses the 'j_left' index here - res_mid[eq_mask] = y_reg[j_left[eq_mask]] + res_mid[eq_mask] = y_reg[j_left[eq_mask]] # type: ignore[non-subscriptable] # we know this is of type npt.NDArray[float], not sure why ty is upset if np.any(in_interval_mask): # R's C code sets i = j-1, where j is the 'right' index - j = j_right[in_interval_mask] - i = j - 1 + j: npt.NDArray[float] = j_right[in_interval_mask] # type: ignore[non-subscriptable] # we know this is of type npt.NDArray[float], not sure why ty is upset + i: npt.NDArray[float] = j - 1 # Get the surrounding x and y values - xi = x_reg[i] - xj = x_reg[j] - yi = y_reg[i] - yj = y_reg[j] + xi: npt.NDArray[float] = x_reg[i] + xj: npt.NDArray[float] = x_reg[j] + yi: npt.NDArray[float] = y_reg[i] + yj: npt.NDArray[float] = y_reg[j] if method_code == 1: # linear - t = (xm[in_interval_mask] - xi) / (xj - xi) + t: npt.NDArray[float] = (xm[in_interval_mask] - xi) / (xj - xi) res_mid[in_interval_mask] = yi + (yj - yi) * t else: # constant # This matches R_approxfun's constant logic @@ -308,14 +318,14 @@ def approx( elif f == 1.0: res_mid[in_interval_mask] = yj else: - # R computes (1-f)*yi + f*yj, but carefully - f1 = 1.0 - f - f2 = f - part = np.zeros_like(yi) + # computes R's (1-f)*yi + f*yj + f1: float = float(1.0 - f) + f2: float = float(f) + part: npt.NDArray[float] = np.zeros_like(yi, dtype=float) if f1 != 0.0: - part = part + yi * f1 + part: 
npt.NDArray[float] = part + yi * f1 if f2 != 0.0: - part = part + yj * f2 + part: npt.NDArray[float] = part + yj * f2 res_mid[in_interval_mask] = part res[mid_mask] = res_mid From 874fdf2dfb24542b96b67cce48d81a60196c5f3a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:18:08 -0600 Subject: [PATCH 50/52] refactor: do not create temporary variable for aggregation function Signed-off-by: Josh Loecker --- main/como/approx.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 14c69e3e..17805253 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -84,24 +84,23 @@ def _regularize_values( if callable(ties): agg_fn = ties else: - ties_str = "mean" if ties is None else str(ties).lower() - if ties_str in ("mean", "avg", "average"): + if ties in ("mean", "avg", "average"): agg_fn = np.mean - elif ties_str in ("first", "left"): + elif ties in ("first", "left"): def agg_fn(a): return a[0] - elif ties_str in ("last", "right"): + elif ties in ("last", "right"): def agg_fn(a): return a[-1] - elif ties_str == "min": + elif ties == "min": agg_fn = np.min - elif ties_str == "max": + elif ties == "max": agg_fn = np.max - elif ties_str == "median": + elif ties == "median": agg_fn = np.median - elif ties_str == "sum": + elif ties == "sum": agg_fn = np.sum else: raise ValueError("Unsupported `ties`; use a callable or one of 'mean', 'first', 'last', 'min', 'max', 'median', 'sum'.") From c6a0015ed459a3df3242bd38867c516102d6b48d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:20:27 -0600 Subject: [PATCH 51/52] style: ruff formatting Signed-off-by: Josh Loecker --- main/como/approx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 17805253..441a489f 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -84,14 +84,13 @@ def _regularize_values( if callable(ties): agg_fn = ties else: + # fmt: off if ties in ("mean", "avg", "average"): agg_fn = np.mean elif ties in ("first", "left"): - def agg_fn(a): return a[0] elif ties in ("last", "right"): - def agg_fn(a): return a[-1] elif ties == "min": @@ -104,6 +103,7 @@ def agg_fn(a): agg_fn = np.sum else: raise ValueError("Unsupported `ties`; use a callable or one of 'mean', 'first', 'last', 'min', 'max', 'median', 'sum'.") + # fmt: on # Find unique x values and their indices/counts unique_x, start_idx, counts = np.unique(x_sorted, return_index=True, return_counts=True) From ae60218a0661eed4bc778b787ae98ef5a2c6fb19 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:20:52 -0600 Subject: [PATCH 52/52] feat(dev): use named tuples for return types Signed-off-by: Josh Loecker --- main/como/density.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/main/como/density.py b/main/como/density.py index 0f75623a..810ced26 100644 --- a/main/como/density.py +++ b/main/como/density.py @@ -6,9 +6,8 @@ import numpy as np import numpy.typing as npt -from scipy.interpolate import interp1d -from como.approx import approx +from como.approx import Approx, approx class DensityResult(NamedTuple): @@ -116,7 +115,7 @@ def dnorm(x: float, mean: NUMBER = 0.0, sd: NUMBER = 1.0, log: bool = False, fas # underflow boundary boundary = np.sqrt(-2.0 * m_ln2 * (dbl_min_exp + 1 - dbl_mant_dig)) if a > boundary: - return float(0.0) + return 0.0 # Now, to get full accuracy, split x into two parts, # x = x1+x2, such that |x2| <= 2^-16. 
@@ -228,12 +227,15 @@ def density(
         elif kernel == "optcosine":
             return np.sqrt(1 - 8 / np.pi**2) * np.pi**2 / 16
 
+    if kernel != "gaussian":
+        raise NotImplementedError(f"Only 'gaussian' kernel is implemented; got '{kernel}'")
+
     x: npt.NDArray[float] = np.asarray(x, dtype=float)
 
     has_weights = weights is not None
     weights: npt.NDArray[float] | None = np.asarray(weights, float) if weights is not None else None
-    if has_weights and (weights is not None and weights.size != n):
-        raise ValueError(f"The length of provided weights does not match the length of x: {weights.size} != {n}")
+    if has_weights and (weights is not None and weights.size != x.size):
+        raise ValueError(f"The length of provided weights does not match the length of x: {weights.size} != {x.size}")
 
     x_na: npt.NDArray[np.bool_] = np.isnan(x)
     if np.any(x_na):
@@ -286,8 +288,9 @@ def density(
     if bw_calc <= 0:
         raise ValueError("Bandwidth 'bw' must be positive.")
 
-    from_ = float(from_ or min(x) - cut * bw_calc)
-    to_ = float(to_ or max(x) + cut * bw_calc)
+    # use `... if ... else` rather than `or`: `0` is falsy, so `from_ or <default>` would discard a user-provided 0 and fall back to the default
+    from_ = float(from_ if from_ is not None else min(x) - cut * bw_calc)
+    to_ = float(to_ if to_ is not None else max(x) + cut * bw_calc)
 
     if not np.isfinite(from_):
         raise ValueError("'from_' is not finite.")
@@ -313,11 +316,11 @@ def density(
     # xp=known x-coords, fp=known y-cords, x=unknown x-coords; returns interpolated (e.g., unknown) y-coords
     interp_x: npt.NDArray[float] = np.linspace(from_, to_, num=n_user)
-    interp_y: npt.NDArray[float] = approx(xords, kords, interp_x)
+    interp_y: Approx = approx(xords, kords, interp_x)
 
     return DensityResult(
         x=interp_x,
-        y=interp_y["y"],
+        y=interp_y.y,
         x_grid=xords,
         y_grid=kords,
         bw=bw_calc,