From ebd5a391663630b7114bdda8cf8236b3b116654e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Thu, 9 Oct 2025 14:30:13 -0500 Subject: [PATCH 01/52] feat: added log statement and hint to user that an empty dataframe indicates an incorrect taxon id Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 4f797658..38bd314a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -168,6 +168,11 @@ async def _build_matrix_results( """ matrix.dropna(subset="ensembl_gene_id", inplace=True) conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) + + # If any one column was returned entirely as "-", the conversion produced no usable IDs + if any(conversion[col].eq("-").all() for col in conversion.columns): + logger.critical(f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - is '{taxon}' the correct taxon ID for this data?") + conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") conversion = conversion.explode("ensembl_gene_id") conversion.reset_index(inplace=True, drop=True) From 7cab8f3a1793a696aec633702e23ac5e2099d770 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Thu, 23 Oct 2025 15:48:19 -0500 Subject: [PATCH 02/52] fix: do not drop na values; keep as much data as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index b83678d1..ada40a5e 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -367,7 +367,6 @@ async def _write_counts_matrix( final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) - final_matrix.dropna(inplace=True) final_matrix.to_csv(output_counts_matrix_filepath, index=False) logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") return final_matrix @@ -682,7 +681,7 @@ async def read_counts(file: Path) -> list[str]: ) # Remove NA values from entrez_gene_id dataframe column - return conversion["entrez_gene_id"].dropna().tolist() + return conversion["entrez_gene_id"].tolist() logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection") genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) @@ -709,7 +708,6 @@ async def read_counts(file: Path) -> list[str]: gene_info = gene_info[((~gene_info["entrez_gene_id"].isna()) & (~gene_info["ensembl_gene_id"].isna()) & (~gene_info["gene_symbol"].isna()))] gene_info.sort_values(by="ensembl_gene_id", inplace=True) - gene_info.dropna(inplace=True) output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) From 7282a342c99394f3db38127c19a27718ac523afa Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Thu, 23 Oct 2025 15:52:38 -0500 Subject: [PATCH 03/52] fix: allow empty genomic values Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index ada40a5e..3271025d 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -459,6 +459,7 @@ async def _create_config_df( # noqa: C901 fragment_label = f"{context_name}_{label}_fragment_size.txt"
frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label] + print(f"HERE: {prep}") if not frag_paths and prep.lower() != RNAType.TRNA.value.lower(): logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).") mean_frag = 100.0 @@ -705,8 +706,6 @@ async def read_counts(file: Path) -> list[str]: gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", pd.NA) gene_info.at[i, "ensembl_gene_id"] = ensembl_ids gene_info.at[i, "size"] = end_pos - start_pos - - gene_info = gene_info[((~gene_info["entrez_gene_id"].isna()) & (~gene_info["ensembl_gene_id"].isna()) & (~gene_info["gene_symbol"].isna()))] gene_info.sort_values(by="ensembl_gene_id", inplace=True) output_filepath.parent.mkdir(parents=True, exist_ok=True) @@ -855,10 +854,10 @@ async def rnaseq_preprocess( async def _main(): context_name = "notreatment" taxon = 9606 - como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/notreatment") - output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/notreatment/gene_info.csv") + como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/naiveB") + output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info_fixed.csv") output_trna_metadata_filepath = Path("/Users/joshl/Projects/COMO/main/data/config_sheets/trna_config.xlsx") - output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/notreatment/total-rna/totalrna_notreatment.csv") + output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv") await rnaseq_preprocess( context_name=context_name, From c903cc8697159dcdc12cd1b89c76c68d06992c55 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:21:47 -0600 Subject: [PATCH 04/52] chore: move from `np.float(ing,64,32)` to python's `float` Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 33 +++++++++---------- .../pipelines/build_condition_heatmaps.py | 2 +- main/como/plot/heatmap.py | 2 +- main/como/rnaseq_gen.py | 14 ++++---- main/como/rnaseq_preprocess.py | 2 +- main/como/sanity_checks/cobra_sanity_check.py | 2 +- main/como/sanity_checks/fast_leak_test.py | 2 +- main/como/stats/_two_sample.py | 12 +++---- main/como/stats/ks_test.py | 26 +++++++-------- main/como/stats/mann_whitney_test.py | 12 +++---- tests/unit/test_fisher_stats.py | 4 +-- tests/unit/test_ks_stats.py | 6 ++-- tests/unit/test_mwu_stats.py | 4 +-- 13 files changed, 59 insertions(+), 62 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 52a035ce..8985603a 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -168,13 +168,13 @@ def _feasibility_test(model_cobra: cobra.Model, step: str): def _build_with_gimme( reference_model: cobra.Model, - lower_bounds: Sequence[float | np.floating], - upper_bounds: Sequence[float | np.floating], + lower_bounds: Sequence[float] | npt.NDArray[float], + upper_bounds: Sequence[float] | npt.NDArray[float], idx_objective: int, - expr_vector: npt.NDArray[np.floating], + expr_vector: npt.NDArray[float], ): model_reconstruction = reference_model.copy() - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) + s_matrix: list[float] = list(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)) # `Becker and Palsson (2008). 
Context-specific metabolic networks are # consistent with experiments. PLoS Comput. Biol. 4, e1000082.` properties = GIMMEProperties( @@ -184,7 +184,7 @@ def _build_with_gimme( preprocess=True, flux_threshold=0.9, ) - algorithm = GIMME(s_matrix, lower_bounds, upper_bounds, properties) + algorithm = GIMME(s_matrix, list(lower_bounds), list(upper_bounds), properties) gene_activity = algorithm.run() reaction_ids = [r.id for r in model_reconstruction.reactions] to_remove_ids = [reaction_ids[r] for r in np.where(gene_activity == 0)[0]] @@ -219,26 +219,27 @@ def _build_with_fastcore(cobra_model, s_matrix, lower_bounds, upper_bounds, exp_ def _build_with_imat( reference_model: cobra.Model, - lower_bounds: Sequence[float], - upper_bounds: Sequence[float], + lower_bounds: Sequence[float] | npt.NDArray[float], + upper_bounds: Sequence[float] | npt.NDArray[float], expr_vector: npt.NDArray, expr_thresh: tuple[float, float], - force_gene_indices: Sequence[int], + force_reaction_indices: Sequence[int], solver: str, ) -> cobra.Model: properties: IMATProperties = IMATProperties( exp_vector=expr_vector, exp_thresholds=expr_thresh, - core=force_gene_indices, + # core=np.array(force_gene_indices).tolist(), + core=list(force_reaction_indices), # `list()` is required because imat uses `if core:`; this returns an error on numpy arrays epsilon=0.01, solver=solver.upper(), ) # Creating a copy of the model ensures we don't make any in-place modifications by accident # Using cobra to create the stoichiometry matrix means we have less work to do - force_gene_indices = np.array(force_gene_indices, dtype=np.uint16) + force_reaction_indices = np.array(force_reaction_indices, dtype=np.uint16) model_reconstruction: cobra.Model = reference_model.copy() - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) + s_matrix: npt.NDArray[float] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction) algorithm: IMAT = IMAT(S=s_matrix, lb=np.array(lower_bounds), ub=np.array(upper_bounds), properties=properties) rxns_from_imat: npt.NDArray[np.uint16] = algorithm.run().astype(np.uint16) @@ -246,11 +247,7 @@ def _build_with_imat( all_rxn_ids: npt.NDArray[str] = np.array([r.id for r in model_reconstruction.reactions], dtype=object) all_rxn_indices: npt.NDArray[np.uint16] = np.array(range(len(model_reconstruction.reactions)), dtype=np.uint16) - # Collect reactions to keep by creating a unique set of reactions from the iMAT algorithm and force-include reactions - # dtype is set to uint16 because indices will not be below 0 or be greater than 65,535 (max size of uint16), - # because only ~10,000 reactions exist in Recon3D - # Unsafe casting is OK because of these facts. 
- rxn_indices_to_keep: npt.NDArray[np.uint16] = np.unique(np.concatenate([rxns_from_imat, force_gene_indices], dtype=np.uint16)) + rxn_indices_to_keep: npt.NDArray[int] = np.unique(np.concatenate([rxns_from_imat, force_reaction_indices], dtype=int)) # Reaction indices to exclude from the model are thus reactions that are not marked to be included in the model # Assume unique is false because every value that is in `rxn_indices_to_keep` is included in `all_rxn_indices` @@ -277,7 +274,7 @@ def _build_with_tinit( no_reverse_loops=True, ) model_reconstruction = reference_model.copy() - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction).astype(np.float64) + s_matrix: npt.NDArray[float] = np.asarray(cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction), dtype=float) algorithm = tINIT(s_matrix, lower_bounds, upper_bounds, properties) algorithm.preprocessing() algorithm.build_problem() @@ -445,7 +442,7 @@ async def _build_model( # noqa: C901 high_thresh=high_thresh, low_thresh=low_thresh, ) - expression_vector: npt.NDArray[np.int32] = np.array(list(reaction_expression.values()), dtype=np.int32) + expression_vector: npt.NDArray[float] = np.array(list(reaction_expression.values()), dtype=float) for rxn in force_reactions: if rxn not in reaction_ids: diff --git a/main/como/pipelines/build_condition_heatmaps.py b/main/como/pipelines/build_condition_heatmaps.py index 16b970b8..03420489 100644 --- a/main/como/pipelines/build_condition_heatmaps.py +++ b/main/como/pipelines/build_condition_heatmaps.py @@ -140,7 +140,7 @@ def group_reactions_by_pathway(models: cobra.Model | list[cobra.Model], flux_df: # for condition in flux_df.index: # for pathway, reactions in pathways_by_reaction.items(): # pathway_flux.loc[condition, pathway] = flux_df.loc[condition, list(reactions)].sum() - pathway_fluxes: dict[str, pd.Series[npt.NDArray[np.floating]]] = {} + pathway_fluxes: dict[str, pd.Series[npt.NDArray[float]]] = {} for pathway, reactions in pathways_by_reaction.items(): reactions_in_df = list(reactions.intersection(flux_df.columns)) if reactions_in_df: diff --git a/main/como/plot/heatmap.py b/main/como/plot/heatmap.py index 0d6e4e55..6709a0a3 100644 --- a/main/como/plot/heatmap.py +++ b/main/como/plot/heatmap.py @@ -30,7 +30,7 @@ def condition_vs_pathway( The resulting `matpotlib.pyplt.Figure` object """ plot_df: pd.DataFrame = data.copy() if copy_df else data - plot_df = plot_df.astype(np.float32) + plot_df = plot_df.astype(float) fig = plt.figure(figsize=(100, 40), dpi=175) if exclude_zero_flux_pathways: diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 38bd314a..99121f8f 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -139,7 +139,7 @@ def genefilter(data: pd.DataFrame | npt.NDArray, filter_func: Callable[[npt.NDAr Returns: A NumPy array of the filtered data. """ - if not isinstance(data, pd.DataFrame | npt.NDArray): + if not isinstance(data, pd.DataFrame | np.ndarray): _log_and_raise_error( f"Unsupported data type. 
Must be a Pandas DataFrame or a NumPy array, got '{type(data)}'", error=TypeError, @@ -272,11 +272,11 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: int = 1e6) -> NamedMetrics: for sample in range(metrics[study].num_samples): layout = metrics[study].layout[sample] - count_matrix: npt.NDArray[np.float32] = metrics[study].count_matrix.iloc[:, sample].values + count_matrix: npt.NDArray[float] = metrics[study].count_matrix.iloc[:, sample].values gene_lengths = ( - metrics[study].fragment_lengths[sample].astype(np.float32) + metrics[study].fragment_lengths[sample].astype(float) if layout == LayoutMethod.paired_end - else metrics[study].gene_sizes.astype(np.float32) + else metrics[study].gene_sizes.astype(float) ) gene_lengths_kb = gene_lengths / 1000.0 @@ -595,11 +595,11 @@ def cpm_filter( top_samples = round(n_top * len(counts.columns)) # noqa: F841 test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) for i in range(len(counts_per_million.columns)): - median_sum = np.float64(np.median(np.sum(counts[:, i]))) + median_sum = float(np.median(np.sum(counts[:, i]))) if cut_off == "default": # noqa: SIM108 - cutoff = np.float64(10e6) / median_sum + cutoff = float(10e6 / median_sum) else: - cutoff = np.float64(1e6 * cut_off) / median_sum + cutoff = float(1e6 * cut_off) / median_sum test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) return metrics diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 3271025d..142be019 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -262,7 +262,7 @@ async def _process_first_multirun_sample(strand_file: Path, all_counts_files: li # Set na values to 0 sample_count = sample_count.fillna(value="0") - sample_count["counts"] = sample_count["counts"].astype(np.float64) + sample_count["counts"] = sample_count["counts"].astype(float) count_sums = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean() count_sums["counts"] = np.ceil(count_sums["counts"].astype(np.uint32)) diff --git a/main/como/sanity_checks/cobra_sanity_check.py b/main/como/sanity_checks/cobra_sanity_check.py index fdd9fc28..af648dcd 100644 --- a/main/como/sanity_checks/cobra_sanity_check.py +++ b/main/como/sanity_checks/cobra_sanity_check.py @@ -134,7 +134,7 @@ def preprocess(model_path: Path, solver: Literal["glpk", "cplex", "gurobi"] = "g modelexchanges_ids: list[str] = [rxn.id for rxn in model_closed.reactions if re.search(pattern=r"^(EX|DM|sink)_", string=rxn.id)] # Step 2: Matrix-based method to find selected_exchanges reactions (single metabolite, 1 entry) - s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model_closed) + s_matrix: npt.NDArray[float] = cobra.util.array.create_stoichiometric_matrix(model_closed) # Step 2: Convert it to a sparse CSC matrix to optimize memory usage and performance s_sparse = csc_matrix(s_matrix) diff --git a/main/como/sanity_checks/fast_leak_test.py b/main/como/sanity_checks/fast_leak_test.py index 0ce92c1a..55ec6b1b 100644 --- a/main/como/sanity_checks/fast_leak_test.py +++ b/main/como/sanity_checks/fast_leak_test.py @@ -27,7 +27,7 @@ def fast_leak_test( model: cobra.Model, test_reactions: Iterable[str], demand_test, - tolerance: np.float16 = 1e-6, + tolerance: float = 1e-6, ): """Run a Fast Leak Test. 
diff --git a/main/como/stats/_two_sample.py b/main/como/stats/_two_sample.py index d856ebeb..e97dffbf 100644 --- a/main/como/stats/_two_sample.py +++ b/main/como/stats/_two_sample.py @@ -9,8 +9,8 @@ T_BASE_SAMPLE = TypeVar("T_BASE_SAMPLE", bound="BaseTwoSample") T_ALTERNATIVE = Literal["greater", "less", "two-sided"] -KS_RESULT = tuple[np.floating, np.floating, np.floating, np.int8] -MW_RESULT = tuple[np.floating, np.floating] +KS_RESULT = tuple[float, float, float, int] +MW_RESULT = tuple[float, float] TEST_RESULT = TypeVar("TEST_RESULT", KS_RESULT, MW_RESULT) __all__ = ["BaseTwoSample"] @@ -21,7 +21,7 @@ class BaseTwoSample(ABC, Generic[TEST_RESULT]): @staticmethod @abstractmethod - def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> TEST_RESULT: ... + def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> TEST_RESULT: ... @property @abstractmethod @@ -40,10 +40,10 @@ def _run( df2: pd.DataFrame, cores: int = 1, worker_kwargs: dict | None = None, - ) -> tuple[list[str], Mapping[str, npt.NDArray[np.float64 | np.uint8]]]: + ) -> tuple[list[str], Mapping[str, npt.NDArray[float | np.uint8]]]: all_reactions = list(set(df1.columns) & set(df2.columns)) - array_a = df1[all_reactions].to_numpy(dtype=np.float64, copy=False) - array_b = df2[all_reactions].to_numpy(dtype=np.float64, copy=False) + array_a = df1[all_reactions].to_numpy(dtype=float, copy=False) + array_b = df2[all_reactions].to_numpy(dtype=float, copy=False) n = len(all_reactions) results = {field: np.empty(n, dtype=np.dtype(dtype)) for field, dtype in cls._fields.items()} diff --git a/main/como/stats/ks_test.py b/main/como/stats/ks_test.py index 9f3c1778..68fb1c2f 100644 --- a/main/como/stats/ks_test.py +++ b/main/como/stats/ks_test.py @@ -14,20 +14,20 @@ @dataclass(frozen=True, kw_only=True, slots=True) class KSTest(BaseTwoSample[KS_RESULT]): _fields: ClassVar[dict[str, type]] = { - "statistic": np.float64, - "pvalue": np.float64, - "statistic_location": np.float64, - "statistic_sign": np.uint8, + "statistic": float, + "pvalue": float, + "statistic_location": float, + "statistic_sign": int, } reaction_ids: list[str] - statistic: npt.NDArray[np.float64] - pvalue: npt.NDArray[np.float64] - statistic_location: npt.NDArray[np.float64] - statistic_sign: npt.NDArray[np.int8] + statistic: npt.NDArray[float] + pvalue: npt.NDArray[float] + statistic_location: npt.NDArray[float] + statistic_sign: npt.NDArray[int] @staticmethod - def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> KS_RESULT: + def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> KS_RESULT: """Calculate the KS statistic. 
Args: @@ -78,10 +78,10 @@ def run( ) return cls( reaction_ids=all_reactions, - statistic=results["statistic"].astype(np.float64), - pvalue=results["pvalue"].astype(np.float64), - statistic_location=results["statistic_location"].astype(np.float64), - statistic_sign=results["statistic_sign"].astype(np.int8), + statistic=results["statistic"].astype(float), + pvalue=results["pvalue"].astype(float), + statistic_location=results["statistic_location"].astype(float), + statistic_sign=results["statistic_sign"].astype(int), ) @property diff --git a/main/como/stats/mann_whitney_test.py b/main/como/stats/mann_whitney_test.py index 98c8131d..8e63f93c 100644 --- a/main/como/stats/mann_whitney_test.py +++ b/main/como/stats/mann_whitney_test.py @@ -13,14 +13,14 @@ @dataclass(frozen=True, kw_only=True, slots=True) class MannWhitneyUTest(BaseTwoSample[MW_RESULT]): - _fields: ClassVar[dict[str, type]] = {"statistic": np.float64, "pvalue": np.float64} + _fields: ClassVar[dict[str, type]] = {"statistic": float, "pvalue": float} reaction_ids: list[str] - statistic: npt.NDArray[np.float64] - pvalue: npt.NDArray[np.float64] + statistic: npt.NDArray[float] + pvalue: npt.NDArray[float] @staticmethod - def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> MW_RESULT: + def _worker(a: npt.NDArray[float], b: npt.NDArray[float], **kwargs) -> MW_RESULT: """Calculate the MWU statistic. Args: @@ -32,7 +32,7 @@ def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) A tuple of (statistic, pvalue) """ res = mannwhitneyu(x=a, y=b, **kwargs) - return np.float64(res.statistic), np.float64(res.pvalue) + return float(res.statistic), float(res.pvalue) @classmethod def run( @@ -67,7 +67,7 @@ def run( cores=cores, worker_kwargs={"alternative": alternative, "use_continuity": use_continuity, "axis": axis, "method": method}, ) - return cls(reaction_ids=all_reactions, statistic=results["statistic"].astype(np.float64), pvalue=results["pvalue"].astype(np.float64)) + return cls(reaction_ids=all_reactions, statistic=results["statistic"].astype(float), pvalue=results["pvalue"].astype(float)) @property def df(self) -> pd.DataFrame: diff --git a/tests/unit/test_fisher_stats.py b/tests/unit/test_fisher_stats.py index d7dad75c..dc212590 100644 --- a/tests/unit/test_fisher_stats.py +++ b/tests/unit/test_fisher_stats.py @@ -10,8 +10,8 @@ def test_fisher_stats(): assert real == FisherExactTest( pathway="Glycolysis/gluconeogenesis", - statistic=np.float64(4.321708185053381), - pvalue=np.float64(1.2883495211648955e-05), + statistic=4.321708185053381, + pvalue=1.2883495211648955e-05, a=32, b=10, c=4496, diff --git a/tests/unit/test_ks_stats.py b/tests/unit/test_ks_stats.py index 08403d46..32856809 100644 --- a/tests/unit/test_ks_stats.py +++ b/tests/unit/test_ks_stats.py @@ -7,7 +7,7 @@ @pytest.fixture def expected_result() -> pd.DataFrame: df = pd.read_csv("tests/inputs/expected_ks_results.csv", index_col=0) - df["statistic_sign"] = df["statistic_sign"].astype(np.int8) + df["statistic_sign"] = df["statistic_sign"].astype(int) return df @@ -21,12 +21,12 @@ def test_ks_stats(expected_result: pd.DataFrame, cores: int): df1 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(np.float32), + data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(float), ) df2 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(np.float32), 
+ data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(float), ) real_result = KSTest.run(df1, df2, cores=cores) diff --git a/tests/unit/test_mwu_stats.py b/tests/unit/test_mwu_stats.py index 9b1823e4..1889de2a 100644 --- a/tests/unit/test_mwu_stats.py +++ b/tests/unit/test_mwu_stats.py @@ -20,12 +20,12 @@ def test_mannwhitney_stats(expected_result: pd.DataFrame, cores: int): df1 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(np.float32), + data=gen.normal(loc=0, size=size * cols).reshape(size, cols).astype(float), ) df2 = pd.DataFrame( index=list(range(size)), columns=list(range(cols)), - data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(np.float32), + data=gen.normal(loc=1, size=size * cols).reshape(size, cols).astype(float), ) real_result = MannWhitneyUTest.run(df1, df2, cores=cores) From 5b39a1cddb5c74d4f9e371962d7048f98930e3be Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:22:46 -0600 Subject: [PATCH 05/52] chore: ruff formatting Signed-off-by: Josh Loecker --- tests/unit/test_fisher_stats.py | 1 + tests/unit/test_ks_stats.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/unit/test_fisher_stats.py b/tests/unit/test_fisher_stats.py index dc212590..0f11d185 100644 --- a/tests/unit/test_fisher_stats.py +++ b/tests/unit/test_fisher_stats.py @@ -1,5 +1,6 @@ import cobra import numpy as np + from como.stats.fisher_exact_test import FisherExactTest diff --git a/tests/unit/test_ks_stats.py b/tests/unit/test_ks_stats.py index 32856809..fa6e742b 100644 --- a/tests/unit/test_ks_stats.py +++ b/tests/unit/test_ks_stats.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest + from como.stats.ks_test import KSTest From 34451a9ece56836fe4d139f16282ef62ce276a28 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:24:11 -0600 Subject: [PATCH 06/52] chore(dev): ignore, but provide warning for, unused imports Signed-off-by: Josh Loecker --- ruff.toml | 73 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/ruff.toml b/ruff.toml index 4929a8e3..a556c25f 100644 --- a/ruff.toml +++ b/ruff.toml @@ -7,38 +7,41 @@ docstring-code-format = true [lint] # Linting rules: https://docs.astral.sh/ruff/rules/ +unfixable = [ + "F401", # warn about, but do not remove, unused imports +] + select = [ - "A", # do not use python builtins for variables or parameters; https://pypi.org/project/flake8-builtins/ - "ASYNC", # identify asynchronous-related problems; https://pypi.org/project/flake8-async/ - "B", # find likely bugs and design problems; https://pypi.org/project/flake8-bugbear/ - "C4", # create better list, set, & dict comprehensions; https://pypi.org/project/flake8-comprehensions/ - "C90", # check for complexity; https://pypi.org/project/mccabe/ - "D", # docstring style checker; https://pypi.org/project/pydocstyle/ - "DOC", # docstring linter; https://pypi.org/project/pydoclint/ - "E", # style guide checking; https://pypi.org/project/pycodestyle/ - "F", # check for errors; https://pypi.org/project/pyflakes/ - "RUF", # ruff-specific linting rules; https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf - "FA", # use from __future__ import annotations if needed; https://pypi.org/project/flake8-future-annotations/ - "FURB", # refurbish and modernize Python codebases; https://pypi.org/project/refurb/ - "I", # sorting imports rules; https://pypi.org/project/isort/ - 
"N", # check naming conventions; https://pypi.org/project/pep8-naming/ - "PERF", # check performance anti-patterns; https://pypi.org/project/perflint/ - "PT", # check common style issues or inconsistencies in pytest; https://pypi.org/project/flake8-pytest-style/ - "PTH", # use pathlib where possible; https://pypi.org/project/flake8-use-pathlib/ - "S", # security testing; https://pypi.org/project/flake8-bandit/ - "SIM", # check for code that can be simplified; https://pypi.org/project/flake8_simplify/ - "T20", # do not use prints in production; https://pypi.org/project/flake8-print/ - "TRY", # prevent Exception handling anti-patterns; https://pypi.org/project/tryceratops/ + "A", # do not use python builtins for variables or parameters; https://pypi.org/project/flake8-builtins/ + "ASYNC", # identify asynchronous-related problems; https://pypi.org/project/flake8-async/ + "B", # find likely bugs and design problems; https://pypi.org/project/flake8-bugbear/ + "C4", # create better list, set, & dict comprehensions; https://pypi.org/project/flake8-comprehensions/ + "C90", # check for complexity; https://pypi.org/project/mccabe/ + "D", # docstring style checker; https://pypi.org/project/pydocstyle/ + "DOC", # docstring linter; https://pypi.org/project/pydoclint/ + "E", # style guide checking; https://pypi.org/project/pycodestyle/ + "F", # check for errors; https://pypi.org/project/pyflakes/ + "RUF", # ruff-specific linting rules; https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf + "FA", # use from __future__ import annotations if needed; https://pypi.org/project/flake8-future-annotations/ + "FURB", # refurbish and modernize Python codebases; https://pypi.org/project/refurb/ + "I", # sorting imports rules; https://pypi.org/project/isort/ + "N", # check naming conventions; https://pypi.org/project/pep8-naming/ + "PERF", # check performance anti-patterns; https://pypi.org/project/perflint/ + "PT", # check common style issues or inconsistencies in pytest; https://pypi.org/project/flake8-pytest-style/ + "PTH", # use pathlib where possible; https://pypi.org/project/flake8-use-pathlib/ + "S", # security testing; https://pypi.org/project/flake8-bandit/ + "SIM", # check for code that can be simplified; https://pypi.org/project/flake8_simplify/ + "T20", # do not use prints in production; https://pypi.org/project/flake8-print/ + "TRY", # prevent Exception handling anti-patterns; https://pypi.org/project/tryceratops/ "UP" # upgrade syntax for newer versions; https://pypi.org/project/pyupgrade ] ignore = [ - "D100", # allow undocumented public module definitions - "D101", # allow undocumented public class - "D104", # allow undocumented public package - "D203", # do not require one blank line before class docstring - "D213", # first docstring line should be on the second line - "TRY003", # allow exception messages outside the `Exception` class - "F401", # allow unused imports + "D100", # allow undocumented public module definitions + "D101", # allow undocumented public class + "D104", # allow undocumented public package + "D203", # do not require one blank line before class docstring + "D213", # first docstring line should be on the second line + "TRY003", # allow exception messages outside the `Exception` class ] [lint.pydocstyle] @@ -46,13 +49,13 @@ convention = "google" [lint.per-file-ignores] "tests/*" = [ - "D101", # allow undocumented public class - "D102", # allow undocumented class method - "D103", # allow undocumented public method definitions - "D104", # allow undocumented public package - 
"F811", # allow redefinition of variables, required for pytest fixtures - "S101", # allow use of `assert` in test files + "D101", # allow undocumented public class + "D102", # allow undocumented class method + "D103", # allow undocumented public method definitions + "D104", # allow undocumented public package + "F811", # allow redefinition of variables, required for pytest fixtures + "S101", # allow use of `assert` in test files ] "main/COMO.ipynb" = [ - "E501", # allow long lines + "E501", # allow long lines ] From c0de5285b86b78ee62aedc6a5df64ecc0af27e93 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:24:44 -0600 Subject: [PATCH 07/52] fix: clipped values should use floats, not integers Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 38 ++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 7549d1f6..73c1d3e1 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -61,28 +61,26 @@ def _combine_z_distribution_for_batch( logger.trace(f"A single sample exists for batch '{batch.batch_num}'. Returning as-is because no additional combining can be done") return matrix - values = matrix.iloc[:, 1:].values - weighted_matrix = np.sum(values, axis=1) / np.sqrt(values.shape[1]) - weighted_matrix = np.clip(weighted_matrix, weighted_z_floor, weighted_z_ceiling).astype(np.int8) + values = matrix.values + weighted_matrix = np.clip( + a=np.sum(values, axis=1) / np.sqrt(values.shape[1]), # calculate a weighted matrix + a_min=weighted_z_floor, + a_max=weighted_z_ceiling, + ).astype(float) - merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1) - weighted_matrix = pd.DataFrame( - { - "ensembl_gene_id": matrix["ensembl_gene_id"], - "combine_z": weighted_matrix, - }, - ) - stack_df = pd.melt( - merge_df, - id_vars=["ensembl_gene_id"], - # Get all columns except ensembl_gene_id - value_vars=[col for col in merge_df.columns if col not in GeneIdentifier._member_map_], - var_name="source", - value_name="zscore", - ) - if len(stack_df["source"].unique()) > 10: - stack_df = stack_df[stack_df["source"] == "combined"] + weighted_matrix = pd.DataFrame({"combine_z": weighted_matrix}, index=matrix.index) + # merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1) + # stack_df = pd.melt( + # merge_df, + # id_vars=["ensembl_gene_id"], + # # Get all 'data' columns + # value_vars=[col for col in merge_df.columns if col not in GeneIdentifier._member_map_], + # var_name="source", + # value_name="zscore", + # ) + # if len(stack_df["source"].unique()) > 10: + # stack_df = stack_df[stack_df["source"] == "combined"] # graph_zscore_distribution( # stack_df, # title=f"Combined Z-score Distribution for {context_name} - batch #{batch.batch_num}", From 526da0d163c901a20086f6d483043ffdcbd49fc4 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:24:58 -0600 Subject: [PATCH 08/52] chore: import required modules Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 73c1d3e1..fa04a386 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -1,7 +1,8 @@ from __future__ import annotations -import asyncio +from collections.abc import Sequence from pathlib import Path +from typing 
import cast import numpy as np import pandas as pd @@ -17,10 +18,7 @@ _OutputCombinedSourceFilepath, _SourceWeights, ) -from como.graph import z_score_distribution as graph_zscore_distribution -from como.utils import ( - _num_columns, -) +from como.utils import LogLevel, _log_and_raise_error, _num_columns def _combine_z_distribution_for_batch( From 512fa4b03eaaeceda8e5d060c8d9f563092a4dc8 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:25:18 -0600 Subject: [PATCH 09/52] fix: set index name + single column name Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index fa04a386..042fcbaf 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -86,8 +86,8 @@ def _combine_z_distribution_for_batch( # / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_zscore_distribution.pdf", # ) - weighted_matrix.columns = ["ensembl_gene_id", batch.batch_num] - weighted_matrix.to_csv(output_combined_matrix_filepath, index=False) + weighted_matrix.columns = [batch.batch_num] + weighted_matrix.to_csv(output_combined_matrix_filepath, index=True) return weighted_matrix From c319e8e3b8793afaab3f50e8cb7e5e3085994a23 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:32:15 -0600 Subject: [PATCH 10/52] chore: more explicit variable name Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 042fcbaf..840521a7 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -94,7 +94,7 @@ def _combine_z_distribution_for_batch( def _combine_z_distribution_for_source( merged_source_data: pd.DataFrame, context_name: str, - num_replicates: int, + replicates_per_batch: Sequence[int], output_combined_matrix_filepath: Path, output_figure_filepath: Path, weighted_z_floor: int = -6, @@ -267,7 +267,7 @@ async def _begin_combining_distributions( merged_source_results: pd.DataFrame = _combine_z_distribution_for_source( merged_source_data=merged_batch_results, context_name=context_name, - num_replicates=sum(batch.num_samples for batch in batch_names[source.value]), + replicates_per_batch=replicates_per_batch, output_combined_matrix_filepath=( output_filepaths[source.value].parent / f"{context_name}_{source.value}_combined_zscore_distribution.csv" ), From 86ea37650dd28e9c6d2419272c0aa8fcda5d1b0b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:32:37 -0600 Subject: [PATCH 11/52] chore: update docstring Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 840521a7..9e5ac3e7 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -103,13 +103,14 @@ def _combine_z_distribution_for_source( """Combine z-score distributions across batches for a single source. Args: - merged_source_data: DataFrame with 'ensembl_gene_id' and batch columns. - context_name: Name of the context (e.g., tissue or condition). - num_replicates: Number of replicates (samples) for weighting. - output_combined_matrix_filepath: Path to save the combined z-score matrix. 
- output_figure_filepath: Path to save the z-score distribution figure. - weighted_z_floor: Minimum z-score value after combining. - weighted_z_ceiling: Maximum z-score value after combining. + merged_source_data: DataFrame with columns: ["ensembl_gene_id", <batch 1>, <batch 2>, ...]. Each batch column contains the within-batch combined z for that gene. + context_name: tissue or condition name + replicates_per_batch: vector of replicate counts per batch, aligned exactly to the order of + batch columns in `merged_source_data.iloc[:, 1:]`. + output_combined_matrix_filepath: filepath to write the combined z-score matrix + output_figure_filepath: filepath to write the z-score distribution figure + weighted_z_floor: lower bound to clip z-scores + weighted_z_ceiling: upper bound to clip z-scores Returns: A pandas dataframe of the weighted z-distributions From d8179bb520779a642a199ceed9182ec70773c215 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:32:47 -0600 Subject: [PATCH 12/52] chore: do not modify input dataframe inplace Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index fa04a386..042fcbaf 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -115,10 +115,12 @@ def _combine_z_distribution_for_source( Returns: A pandas dataframe of the weighted z-distributions """ - if _num_columns(merged_source_data) <= 2: - logger.warning("A single source exists, returning matrix as-is because no additional combining can be done") - merged_source_data.columns = ["ensembl_gene_id", "combine_z"] - return merged_source_data + # If only one batch column exists, return as-is (rename like R pass-through).
+ if _num_columns(merged_source_data) <= 1: + logger.warning("A single batch exists for this source; returning matrix as-is.") + out_df = merged_source_data.copy() + out_df.columns = ["combine_z"] + return out_df output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) From 838947f1e6508b760d7255de9c1ad30b1d7081e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:33:30 -0600 Subject: [PATCH 13/52] fix: use a per-replicate count for weighting instead of a single weight across the study Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 133 +++++++++++++++++------------ 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 91221ca8..c152d69f 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -125,39 +125,50 @@ def _combine_z_distribution_for_source( output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) - logger.trace(f"Found {_num_columns(merged_source_data) - 1} samples for context '{context_name}' to combine") - values = merged_source_data.iloc[:, 1:].values - mask = ~np.isnan(values) - masked_values = np.where(mask, values, 0) # Replace NaN with 0 - masked_num_replicates = np.where(mask, num_replicates, 0) - - weights = masked_num_replicates / np.sum(masked_num_replicates, axis=1, keepdims=True) - numerator = np.sum(weights * masked_values, axis=1) - denominator = np.sqrt(np.sum(weights**2, axis=1)) - weighted_matrix = numerator / denominator - weighted_matrix = np.clip(weighted_matrix, weighted_z_floor, weighted_z_ceiling) - logger.trace("Finished combining z-distribution") - # merge_df = pd.concat([merged_source_data, pd.Series(weighted_matrix, name="combined")], axis=1) - weighted_matrix = pd.DataFrame( - { - "ensembl_gene_id": merged_source_data["ensembl_gene_id"], - "combine_z": weighted_matrix, - } + # validate alignment between columns and replicate vector + batch_column_names: list[str] = list(merged_source_data.columns) + expected_num_batches = len(batch_column_names) + if expected_num_batches != len(replicates_per_batch): + raise ValueError( + f"[{context_name}] Mismatch between number of batch columns " + f"({expected_num_batches}: {batch_column_names}) and length of " + f"replicates_per_batch ({len(replicates_per_batch)}: {list(replicates_per_batch)})."
+ ) + + logger.trace( + f"[{context_name}] Combining {expected_num_batches} batches with replicate counts: {dict(zip(batch_column_names, replicates_per_batch))}" ) - # stack_df = pd.melt( - # merge_df, - # id_vars=["ensembl_gene_id"], - # value_vars=merge_df.columns[1:], # all other columns are values - # var_name="source", - # value_name="zscore", - # ) - # graph_zscore_distribution( - # df=stack_df, - # title=f"Combined Z-score Distribution for {context_name}", - # output_filepath=output_figure_filepath, - # ) - return weighted_matrix + # extract values and build masks (shape is in gene x batch) + gene_by_batch_values: np.ndarray = merged_source_data.to_numpy(dtype=float) + present_mask: np.ndarray = ~np.isnan(gene_by_batch_values) + + # per-batch weights from replicate counts, normalized over all batches + replicates_per_batch_array: np.ndarray = np.asarray(replicates_per_batch, dtype=float) + total_replicates: float = replicates_per_batch_array.sum() + if total_replicates <= 0.0: + raise ValueError(f"[{context_name}] Sum of replicates_per_batch must be > 0; got {total_replicates}.") + base_weights_per_batch: np.ndarray = replicates_per_batch_array / total_replicates # shape: (B,) + + # drop NA columns per gene by zeroing their weights + masked_weights_per_batch: np.ndarray = np.where(present_mask, base_weights_per_batch, 0.0) # (G,B) + masked_values: np.ndarray = np.where(present_mask, gene_by_batch_values, 0.0) # (G,B) + + # weighted z combine per gene: sum(w*x)/sqrt(sum(w^2)), with NA rows → NaN + numerator_per_gene: np.ndarray = np.sum(masked_weights_per_batch * masked_values, axis=1) # (G,) + denominator_per_gene: np.ndarray = np.sqrt(np.sum(masked_weights_per_batch**2, axis=1)) # (G,) + combined_z_per_gene: np.ndarray = np.divide( + numerator_per_gene, + denominator_per_gene, + out=np.full_like(numerator_per_gene, np.nan, dtype=float), + where=denominator_per_gene != 0.0, + ) + + combined_z_per_gene = np.clip(combined_z_per_gene, weighted_z_floor, weighted_z_ceiling) + + logger.trace(f"[{context_name}] Finished combining batch z-distributions for source.") + + return pd.DataFrame({"combine_z": combined_z_per_gene}, index=merged_source_data.index) def _combine_z_distribution_for_context( @@ -171,14 +182,30 @@ def _combine_z_distribution_for_context( logger.warning("No zscore results exist, returning empty dataframe") return pd.DataFrame({"ensembl_gene_id": [], "combine_z": []}) - z_matrices = [ - res.z_score_matrix.set_index("ensembl_gene_id").rename(columns=dict.fromkeys(res.z_score_matrix.columns[1:], res.type.value)) - for res in zscore_results - ] - z_matrix = pd.concat(z_matrices, axis=1, join="outer").reset_index() + z_matrices: list[pd.DataFrame] = [] + for res in zscore_results: + matrix = res.z_score_matrix.copy() + if len(matrix.columns) > 1: + _log_and_raise_error( + f"Expected a single column for combined z-score dataframe for data '{res.type.value.lower()}'. Got '{len(matrix.columns)}' columns", + error=ValueError, + level=LogLevel.ERROR, + ) + + matrix.columns = [res.type.value.lower()] + matrix = cast(pd.DataFrame, matrix.loc[matrix.index != "-"]) + + # if the matrix has duplicate ids (i.e., multiple entrez ids map to a single ensembl id, etc.) 
+ # take the mean value of them to remove the duplicates + if not matrix.index.is_unique: + # matrix = cast(pd.DataFrame, matrix.groupby(level=0).mean()) + matrix = cast(pd.DataFrame, matrix.groupby(level=0).max()) + z_matrices.append(matrix) + + z_matrix = pd.concat(z_matrices, axis="columns", join="outer", ignore_index=False) if _num_columns(z_matrix) <= 1: logger.trace(f"Only 1 source exists for '{context}', returning dataframe as-is becuase no data exists to combine") - z_matrix.columns = ["ensembl_gene_id", "combine_z"] + z_matrix.columns = ["combine_z"] return z_matrix values = z_matrix.iloc[:, 1:].values @@ -191,7 +218,7 @@ def _combine_z_distribution_for_context( numerator = np.sum(normalized_weights * masked_values, axis=1) denominator = np.sqrt(np.sum(normalized_weights**2, axis=1)) combined_z_matrix = numerator / denominator - combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling) + combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling, dtype=float) combined_z_matrix_df = pd.DataFrame( { "ensembl_gene_id": z_matrix["ensembl_gene_id"], @@ -199,21 +226,21 @@ def _combine_z_distribution_for_context( } ) - stack_df = pd.melt( - z_matrix, - id_vars=["ensembl_gene_id"], - value_vars=z_matrix.columns[1:], - var_name="source", - value_name="zscore", - ) - combined_df = pd.DataFrame( - { - "ensembl_gene_id": z_matrix["ensembl_gene_id"], - "zscore": combined_z_matrix, - "source": "combined", - } - ) - stack_df = pd.concat([stack_df, combined_df]) + # combined_df = pd.DataFrame( + # { + # "ensembl_gene_id": z_matrix.index, + # "zscore": combined_z_matrix, + # "source": "combined", + # } + # ) + # stack_df = pd.melt( + # z_matrix, + # id_vars=["ensembl_gene_id"], + # value_vars=z_matrix.columns[1:], + # var_name="source", + # value_name="zscore", + # ) + # stack_df = pd.concat([stack_df, combined_df]) # graph_zscore_distribution( # df=stack_df, # title=f"Combined Z-score Distribution for {context}", From ae0797f8ff8e8273ac11243bc0b6f62ffce1a854 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:33:49 -0600 Subject: [PATCH 14/52] fix: use index value for ensembl ids Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index c152d69f..42c34354 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -221,7 +221,7 @@ def _combine_z_distribution_for_context( combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling, dtype=float) combined_z_matrix_df = pd.DataFrame( { - "ensembl_gene_id": z_matrix["ensembl_gene_id"], + "ensembl_gene_id": z_matrix.index, "combine_z": combined_z_matrix, } ) From f423d1bc9baafa6ca5eba4a99c998777ed39c180 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:34:28 -0600 Subject: [PATCH 15/52] refactor: do not use async Signed-off-by: Josh Loecker --- main/como/combine_distributions.py | 57 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py index 42c34354..9b87ea47 100644 --- a/main/como/combine_distributions.py +++ b/main/como/combine_distributions.py @@ -272,27 +272,54 @@ async def _begin_combining_distributions( logger.critical(f"Invalid source; got '{source.value}', expected 'trna', 'mrna', 'scrna', or 'proteomics'.") raise ValueError("Invalid 
source") - batch_results = await asyncio.gather( - *[ + batch_results: list[pd.DataFrame] = [] + for batch in batch_names[source.value]: + batch: _BatchEntry + matrix_subset = cast(pd.DataFrame, matrix[[GeneIdentifier.ensembl_gene_id.value, *batch.sample_names]]) + matrix_subset = matrix_subset.set_index(keys=[GeneIdentifier.ensembl_gene_id.value], drop=True) + + output_fp = output_filepaths[source.value].parent / f"{source.value}_batch{batch.batch_num}_combined_z_distrobution_{context_name}.csv" + batch_results.append( _combine_z_distribution_for_batch( context_name=context_name, batch=batch, - matrix=matrix[[GeneIdentifier.ENSEMBL_GENE_ID.value, *batch.sample_names]], + matrix=matrix_subset, source=source, - output_combined_matrix_filepath=( - output_filepaths[source.value].parent / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_z_distribution.csv" - ), + output_combined_matrix_filepath=output_fp, output_figure_dirpath=output_figure_dirpath, weighted_z_floor=weighted_z_floor, weighted_z_ceiling=weighted_z_ceiling, ) - for batch in batch_names[source.value] - ] - ) + ) - merged_batch_results = pd.DataFrame() - for df in batch_results: - merged_batch_results = df if merged_batch_results.empty else merged_batch_results.merge(df, on="ensembl_gene_id", how="outer") + index_name: str = ( + "ensembl_gene_id" + if all(df.index.name == "ensembl_gene_id" for df in batch_results) + else "entrez_gene_id" + if all(df.index.name == "entrez_gene_id" for df in batch_results) + else "gene_symbol" + if all(df.index.name == "gene_symbol" for df in batch_results) + else "" + ) + if not index_name: + _log_and_raise_error( + f"Unable to find common gene identifier across batches for source '{source.value}' in context '{context_name}'", + error=ValueError, + level=LogLevel.ERROR, + ) + merged_batch_results = pd.concat(batch_results, axis="columns") + merged_batch_results.index.name = index_name + + replicates_by_batch_num_map: dict[str, int] = {str(batch.batch_num): batch.num_samples for batch in batch_names[source.value]} + batch_column_names: list[str] = list(merged_batch_results.columns) + replicates_per_batch: list[int] = [] + for col in batch_column_names: + key = str(col) + if key not in replicates_by_batch_num_map: + raise KeyError( + f"Missing replicate count for batch column '{col}' in source '{source.value}'. 
Known: {list(replicates_by_batch_num_map.keys())}" + ) + replicates_per_batch.append(replicates_by_batch_num_map[key]) merged_source_results: pd.DataFrame = _combine_z_distribution_for_source( merged_source_data=merged_batch_results, @@ -312,7 +339,7 @@ async def _begin_combining_distributions( weight=source_weights[source.value], ) ) - merged_source_results.to_csv(output_filepaths[source.value], index=False) + merged_source_results.to_csv(output_filepaths[source.value], index=True) logger.success(f"Wrote z-scores for source '{source.value}' in context '{context_name}' to '{output_filepaths[source.value]}'") logger.trace(f"Combining z-score distributions for all sources in context '{context_name}'") @@ -321,5 +348,5 @@ async def _begin_combining_distributions( zscore_results=z_score_results, output_graph_filepath=output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf", ) - merged_context_results.to_csv(output_final_model_scores, index=False) - logger.success(f"Finished combining z-scores for context '{context_name}'") + merged_context_results.to_csv(output_final_model_scores, index=True) + logger.success(f"Wrote combined z-scores for context '{context_name}' to {output_final_model_scores}") From 7f99af70be47703884869eced5a5dd05b1ec89d8 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:34:37 -0600 Subject: [PATCH 16/52] refactor: do not use async Signed-off-by: Josh Loecker --- main/como/create_context_specific_model.py | 225 ++++++++++++--------- 1 file changed, 124 insertions(+), 101 deletions(-) diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py index 8985603a..cd771c4b 100644 --- a/main/como/create_context_specific_model.py +++ b/main/como/create_context_specific_model.py @@ -1,12 +1,14 @@ from __future__ import annotations +import asyncio import collections +import os import re import sys -from collections.abc import Sequence +from collections.abc import Coroutine, Sequence from io import TextIOWrapper from pathlib import Path -from typing import Literal, TextIO, cast +from typing import Any, Literal, TextIO, cast import cobra import cobra.util.array @@ -15,14 +17,16 @@ import pandas as pd from cobra import Model from cobra.flux_analysis import pfba +from fast_bioservices.common import Taxon +from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol from loguru import logger from troppo.methods.reconstruction.fastcore import FASTcore, FastcoreProperties from troppo.methods.reconstruction.gimme import GIMME, GIMMEProperties from troppo.methods.reconstruction.imat import IMAT, IMATProperties from troppo.methods.reconstruction.tINIT import tINIT, tINITProperties -from como.data_types import Algorithm, BoundaryReactions, BuildResults, CobraCompartments, LogLevel, Solver -from como.utils import _log_and_raise_error, read_file, set_up_logging, split_gene_expression_data +from como.data_types import Algorithm, CobraCompartments, LogLevel, Solver, _BoundaryReactions, _BuildResults +from como.utils import _log_and_raise_error, _read_file, _set_up_logging, split_gene_expression_data def _correct_bracket(rule: str, name: str) -> str: @@ -287,6 +291,7 @@ async def _map_expression_to_reaction( recon_algorithm: Algorithm, low_thresh: float, high_thresh: float, + taxon: int | str | Taxon, ) -> collections.OrderedDict[str, int]: """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules. 
@@ -298,6 +303,7 @@ async def _map_expression_to_reaction( recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT) low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) + taxon: Taxon ID or Taxon object for gene ID conversion. Returns: An ordered dictionary mapping reaction IDs to their corresponding expression values. @@ -305,7 +311,7 @@ async def _map_expression_to_reaction( Raises: ValueError: If neither 'entrez_gene_id' nor 'ensembl_gene_id' columns are found in the gene expression file. """ - expression_data = await read_file(gene_expression_file) + expression_data = await _read_file(gene_expression_file) identifier_column = next((col for col in ("entrez_gene_id", "ensembl_gene_id") if col in expression_data.columns), "") if not identifier_column: @@ -317,6 +323,14 @@ async def _map_expression_to_reaction( identifier_column=cast(Literal["ensembl_gene_id", "entrez_gene_id"], identifier_column), recon_algorithm=recon_algorithm, ) + + # convert ensembl IDs to entrez ids to map expression data to reactions in the reference model + if identifier_column == "ensembl_gene_id": + conversion = await ensembl_to_gene_id_and_symbol(ids=gene_activity.index.tolist(), taxon=taxon) + gene_activity = gene_activity.merge(conversion, how="left", left_index=True, right_on="ensembl_gene_id") + gene_activity = gene_activity.set_index(keys=["entrez_gene_id"]) + gene_activity = gene_activity.drop(columns=["ensembl_gene_id", "gene_symbol"]) + reaction_expression = collections.OrderedDict() # fmt: off @@ -343,7 +357,7 @@ async def _map_expression_to_reaction( activity = gene_activity.at[gene_id, "active"] if gene_id in gene_activity.index else f"{default_expression!s}" # replace gene_id with activity, using optional whitespace before and after the gene id # Do not replace the whitespace (if it exists) before and after the gene ID - gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=activity, string=gene_reaction_rule) + gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=str(activity), string=str(gene_reaction_rule)) try: # We are using eval here because ast.literal_eval is unable to process an evaluable such as `max(-4, 0, 1)` @@ -359,7 +373,24 @@ async def _map_expression_to_reaction( return reaction_expression -async def _build_model( # noqa: C901 +def _read_reference_model(filepath: Path) -> cobra.Model: + match filepath.suffix: + case ".mat": + reference_model = cobra.io.load_matlab_model(filepath) + case ".xml" | ".sbml": + reference_model = cobra.io.read_sbml_model(filepath) + case ".json": + reference_model = cobra.io.load_json_model(filepath) + case _: + _log_and_raise_error( + f"Reference model format must be .xml, .mat, or .json; found '{filepath.suffix}'", + error=ValueError, + level=LogLevel.ERROR, + ) + return reference_model + + +async def _build_model( general_model_file: Path, gene_expression_file: Path, recon_algorithm: Algorithm, @@ -373,9 +404,10 @@ async def _build_model( # noqa: C901 low_thresh: float, high_thresh: float, output_flux_result_filepath: Path, + taxon: int | str | Taxon, *, force_boundary_rxn_inclusion: bool, -) -> BuildResults: +) -> _BuildResults: """Seed a context specific reference_model. Core reactions are determined from GPR associations with gene expression logicals. 
@@ -397,25 +429,13 @@ async def _build_model( # noqa: C901 low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) output_flux_result_filepath: Path to save flux results (for iMAT only) + taxon: Taxon ID or Taxon object for gene ID conversion. force_boundary_rxn_inclusion: If True, ensure that all boundary reactions are included in the final model. Returns: A _BuildResults object containing the context-specific model, list of expression indices used, and a DataFrame of infeasible reactions. """ - reference_model: cobra.Model - match general_model_file.suffix: - case ".mat": - reference_model = cobra.io.load_matlab_model(general_model_file) - case (".xml", ".sbml"): - reference_model = cobra.io.read_sbml_model(general_model_file) - case ".json": - reference_model = cobra.io.load_json_model(general_model_file) - case _: - _log_and_raise_error( - f"Reference model format must be .xml, .mat, or .json; found '{general_model_file.suffix}'", - error=ValueError, - level=LogLevel.ERROR, - ) + reference_model: cobra.Model = _read_reference_model(general_model_file) if objective not in force_reactions: force_reactions.append(objective) @@ -426,21 +446,19 @@ async def _build_model( # noqa: C901 # inconsistent_reactions, cobra_model = _feasibility_test(cobra_model, "before_seeding") inconsistent_reactions = [] s_matrix = cobra.util.array.create_stoichiometric_matrix(reference_model, array_type="dense") - lower_bounds = [] - upper_bounds = [] - reaction_ids = [] - for reaction in reference_model.reactions: + lower_bounds: list[int] = [] + upper_bounds: list[int] = [] + reaction_ids: list[str] = [] + for i, reaction in enumerate(reference_model.reactions): + # if reaction.id in boundary_reactions: + # lower_bounds.append() lower_bounds.append(reaction.lower_bound) upper_bounds.append(reaction.upper_bound) reaction_ids.append(reaction.id) # get expressed reactions reaction_expression: collections.OrderedDict[str, int] = await _map_expression_to_reaction( - reference_model, - gene_expression_file, - recon_algorithm, - high_thresh=high_thresh, - low_thresh=low_thresh, + reference_model, gene_expression_file, recon_algorithm, high_thresh=high_thresh, low_thresh=low_thresh, taxon=taxon ) expression_vector: npt.NDArray[float] = np.array(list(reaction_expression.values()), dtype=float) @@ -483,6 +501,16 @@ async def _build_model( # noqa: C901 expression_threshold = (low_thresh, high_thresh) match recon_algorithm: + case Algorithm.IMAT: + context_model_cobra: cobra.Model = _build_with_imat( + reference_model=reference_model, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + expr_vector=expression_vector, + expr_thresh=expression_threshold, + force_reaction_indices=force_reaction_indices, + solver=solver, + ) case Algorithm.GIMME: context_model_cobra: cobra.Model = _build_with_gimme( reference_model=reference_model, @@ -500,22 +528,12 @@ async def _build_model( # noqa: C901 exp_idx_list=expression_vector_indices, solver=solver, ) - case Algorithm.IMAT: - context_model_cobra: cobra.Model = _build_with_imat( - reference_model=reference_model, - lower_bounds=lower_bounds, - upper_bounds=upper_bounds, - expr_vector=expression_vector, - expr_thresh=expression_threshold, - force_gene_indices=force_reaction_indices, - solver=solver, - ) context_model_cobra.objective = objective flux_sol: cobra.Solution = context_model_cobra.optimize() fluxes: pd.Series = flux_sol.fluxes model_reactions: list[str] = 
[reaction.id for reaction in context_model_cobra.reactions] reaction_intersections: set[str] = set(fluxes.index).intersection(model_reactions) - flux_df: pd.DataFrame = fluxes[~fluxes.index.isin(reaction_intersections)] + flux_df: pd.DataFrame = cast(pd.DataFrame, fluxes[~fluxes.index.isin(reaction_intersections)]) flux_df.dropna(inplace=True) flux_df.to_csv(output_flux_result_filepath) case Algorithm.TINIT: @@ -549,7 +567,7 @@ async def _build_model( # noqa: C901 axis=0, ) - return BuildResults( + return _BuildResults( model=context_model_cobra, expression_index_list=expression_vector_indices, infeasible_reactions=inconsistent_and_infeasible_reactions, @@ -559,7 +577,7 @@ async def _build_model( # noqa: C901 async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: if path.suffix not in {".csv", ".tsv"}: raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'") - df: pd.DataFrame = await read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) + df: pd.DataFrame = await _read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) if not isinstance(df, pd.DataFrame): _log_and_raise_error( @@ -573,7 +591,7 @@ async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.Dat return df -async def _collect_boundary_reactions(path: Path) -> BoundaryReactions: +async def _collect_boundary_reactions(path: Path) -> _BoundaryReactions: df: pd.DataFrame = await _create_df(path, lowercase_col_names=True) for column in df.columns: if column not in [ @@ -609,39 +627,48 @@ async def _collect_boundary_reactions(path: Path) -> BoundaryReactions: shorthand_compartment = CobraCompartments.get_shorthand(reaction_compartment[i]) reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" - return BoundaryReactions( + return _BoundaryReactions( reactions=reactions, lower_bounds=df["minimum reaction rate"].tolist(), upper_bounds=df["maximum reaction rate"].tolist(), ) -def _write_model_to_disk(context_name: str, model: cobra.Model, output_filepaths: list[Path]) -> None: +async def _write_model_to_disk( + context_name: str, + model: cobra.Model, + output_filepaths: list[Path], + mat_suffix: set[str], + json_suffix: set[str], + xml_suffix: set[str], +) -> None: + tasks: set[Coroutine[Any, Any, None]] = set() for path in output_filepaths: path.parent.mkdir(parents=True, exist_ok=True) - if path.suffix == ".mat": - cobra.io.save_matlab_model(model=model, file_name=path) - elif path.suffix == ".json": - cobra.io.save_json_model(model=model, filename=path, pretty=True) - elif path.suffix in {".sbml", ".xml"}: - cobra.io.write_sbml_model(cobra_model=model, filename=path) + if path.suffix in mat_suffix: + tasks.add(asyncio.to_thread(cobra.io.save_matlab_model, model=model, file_name=path)) + elif path.suffix in json_suffix: + tasks.add(asyncio.to_thread(cobra.io.save_json_model, model=model, filename=path, pretty=True)) + elif path.suffix in xml_suffix: + tasks.add(asyncio.to_thread(cobra.io.write_sbml_model, model=model, filename=path)) else: _log_and_raise_error( f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. 
Got '{path.suffix}'", error=ValueError, level=LogLevel.ERROR, ) - logger.success(f"Saved metabolic model for context '{context_name}' to '{path}'") + logger.success(f"Will save metabolic model for context '{context_name}' to: '{path}'") + await asyncio.gather(*tasks) async def create_context_specific_model( # noqa: C901 context_name: str, + taxon: int | str | Taxon, reference_model: Path, active_genes_filepath: Path, output_infeasible_reactions_filepath: Path, output_flux_result_filepath: Path, output_model_filepaths: Path | list[Path], - output_filetypes: list[str] | None = None, output_fastcore_expression_index_filepath: Path | None = None, objective: str = "biomass_reaction", boundary_rxns_filepath: str | Path | None = None, @@ -660,12 +687,12 @@ async def create_context_specific_model( # noqa: C901 Args: context_name: Name of the context-specific model. + taxon: NCBI taxonomy ID or name for the organism of interest. reference_model: Path to the general genome-scale metabolic model file (.xml, .mat, or .json). active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel). output_infeasible_reactions_filepath: Path to save infeasible reactions (csv). output_flux_result_filepath: Path to save flux results (csv). output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json). - output_filetypes: List of file types to save the model as ('xml', 'mat', 'json'). output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). Required if using Fastcore. objective: Objective function reaction ID. boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel). @@ -682,22 +709,9 @@ async def create_context_specific_model( # noqa: C901 Raises: ImportError: If Gurobi solver is selected but gurobipy is not installed. """ + _set_up_logging(level=log_level, location=log_location) boundary_rxns_filepath: Path | None = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None - set_up_logging(level=log_level, location=log_location) output_model_filepaths = [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths - for path in output_model_filepaths: - if path.suffix not in {".mat", ".xml", ".sbml", ".json"}: - _log_and_raise_error( - f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. Got '{path.suffix}'", - error=ValueError, - level=LogLevel.ERROR, - ) - if len(output_model_filepaths) != len(output_model_filepaths): - _log_and_raise_error( - "The number of output model filepaths must be the same as the number of output flux result filepaths", - error=ValueError, - level=LogLevel.ERROR, - ) if not reference_model.exists(): _log_and_raise_error( @@ -724,15 +738,6 @@ async def create_context_specific_model( # noqa: C901 level=LogLevel.ERROR, ) - output_filetypes = ["mat"] if output_filetypes is None else output_filetypes - for output_type in output_filetypes: - if output_type not in {"xml", "mat", "json"}: - _log_and_raise_error( - f"Output file type {output_type} not recognized. Must be one of: 'xml', 'mat', 'json'", - error=ValueError, - level=LogLevel.ERROR, - ) - if algorithm not in Algorithm: _log_and_raise_error( f"Algorithm {algorithm} not supported. 
Use one of {', '.join(a.value for a in Algorithm)}",
             error=ValueError,
             level=LogLevel.ERROR,
         )
@@ -747,6 +752,16 @@ async def create_context_specific_model(  # noqa: C901
             level=LogLevel.ERROR,
         )
 
+    mat_suffix, json_suffix, xml_suffix = {".mat"}, {".json"}, {".sbml", ".xml"}
+    if any(path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix} for path in output_model_filepaths):
+        invalid_suffix = "\n".join(str(path) for path in output_model_filepaths if path.suffix not in {*mat_suffix, *json_suffix, *xml_suffix})
+        _log_and_raise_error(
+            f"Invalid output filetype. Should be 'xml', 'sbml', 'mat', or 'json'. Got:\n{invalid_suffix}",
+            error=ValueError,
+            level=LogLevel.ERROR,
+        )
+
+
     boundary_reactions = None
     if boundary_rxns_filepath:
         boundary_reactions = await _collect_boundary_reactions(boundary_rxns_filepath)
@@ -776,42 +791,43 @@ async def create_context_specific_model(  # noqa: C901
 
     # Test that gurobi is using a valid license file
     if solver == Solver.GUROBI:
-        # test if gurobi is available
-        try:
-            import gurobipy as gp
-        except ImportError as e:
-            logger.error(
-                "The gurobi solver requires the gurobipy package to be installed. "
-                "Please install gurobipy and try again. "
-                "This can be done by installing the 'gurobi' optional dependency."
+        from importlib.util import find_spec
+
+        gurobi_present = find_spec("gurobipy")
+        if not gurobi_present:
+            _log_and_raise_error(
+                message=(
+                    "The gurobi solver requires the gurobipy package to be installed. "
+                    "Please install gurobipy and try again. "
+                    "This can be done by installing the 'gurobi' optional dependency."
+                ),
+                error=ImportError,
+                level=LogLevel.ERROR,
             )
-            raise ImportError from e
 
-        env = gp.Env()
-        if env.getParam("WLSACCESSID") == "" or env.getParam("WLSSECRET") == "":
+        if not Path(f"{os.environ['HOME']}/gurobi.lic").exists():
             logger.critical(
                 "Gurobi solver requested, but license information cannot be found. "
                 "COMO will continue, but it is HIGHLY unlikely the resulting model will be valid."
) - # remove gurobi-related information, it is no longer required - del env, gp - logger.info(f"Creating '{context_name}' model using '{algorithm.value}' reconstruction and '{solver.value}' solver") - build_results: BuildResults = await _build_model( + logger.info(f"context='{context_name}', reconstruction method='{algorithm.value}', solver='{solver.value}'") + build_results: _BuildResults = await _build_model( general_model_file=reference_model, gene_expression_file=active_genes_filepath, recon_algorithm=algorithm, objective=objective, - boundary_reactions=boundary_reactions.reactions, + boundary_reactions=boundary_reactions.reactions if boundary_reactions else [], exclude_reactions=exclude_rxns, force_reactions=force_rxns, - lower_bounds=boundary_reactions.lower_bounds, - upper_bounds=boundary_reactions.upper_bounds, + lower_bounds=boundary_reactions.lower_bounds if boundary_reactions else [], + upper_bounds=boundary_reactions.upper_bounds if boundary_reactions else [], solver=solver.value.lower(), low_thresh=low_threshold, high_thresh=high_threshold, output_flux_result_filepath=output_flux_result_filepath, force_boundary_rxn_inclusion=force_boundary_rxn_inclusion, + taxon=taxon, ) build_results.infeasible_reactions.dropna(inplace=True) @@ -822,7 +838,14 @@ async def create_context_specific_model( # noqa: C901 fastcore_df.dropna(inplace=True) fastcore_df.to_csv(output_fastcore_expression_index_filepath, index=False) - _write_model_to_disk(context_name=context_name, model=build_results.model, output_filepaths=output_model_filepaths) - logger.debug(f"Number of Genes: {len(build_results.model.genes):,}") - logger.debug(f"Number of Metabolites: {len(build_results.model.metabolites):,}") - logger.debug(f"Number of Reactions: {len(build_results.model.reactions):,}") + await _write_model_to_disk( + context_name=context_name, + model=build_results.model, + output_filepaths=output_model_filepaths, + mat_suffix=mat_suffix, + json_suffix=json_suffix, + xml_suffix=xml_suffix, + ) + logger.info( + f"context={context_name}, reactions={len(build_results.model.reactions)}, genes={len(build_results.model.genes)}, metabolites={len(build_results.model.metabolites)}" + ) From 926c1393a01da47e748710bc59f463a323169368 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:34:57 -0600 Subject: [PATCH 17/52] refactor: use lowercase identifier names; provide default values Signed-off-by: Josh Loecker --- main/como/data_types.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/main/como/data_types.py b/main/como/data_types.py index 8b439d47..a004f862 100644 --- a/main/como/data_types.py +++ b/main/como/data_types.py @@ -40,9 +40,9 @@ class FilteringTechnique(str, Enum): class GeneIdentifier(str, Enum): - ENSEMBL_GENE_ID = "ENSEMBL_GENE_ID" - ENTREZ_GENE_ID = "ENTREZ_GENE_ID" - GENE_SYMBOL = "GENE_SYMBOL" + ensembl_gene_id = "ensembl_gene_id" + entrez_gene_id = "entrez_gene_id" + gene_symbol = "gene_symbol" class LogLevel(int, Enum): @@ -72,15 +72,15 @@ class Solver(str, Enum): class SourceTypes(str, Enum): - TRNA = "TRNA" - MRNA = "MRNA" - SCRNA = "SCRNA" - PROTEOMICS = "PROTEOMICS" + trna = "trna" + mrna = "mrna" + scrna = "scrna" + proteomics = "proteomics" class PeakIdentificationParameters(NamedTuple): - height: float - distance: float + height: float = 0.02 + distance: float = 1.0 class CobraCompartments: From 4149eb95dc862913e756f83d3030c4d7f77f8306 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:35:17 -0600 Subject: [PATCH 18/52] 
refactor: import required modules Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index a89219ef..3c3d4f43 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import re import sys from pathlib import Path from typing import TextIO, cast @@ -10,9 +11,7 @@ from fast_bioservices.biothings.mygene import MyGene from loguru import logger -from como.combine_distributions import ( - _begin_combining_distributions, -) +from como.combine_distributions import _begin_combining_distributions from como.data_types import ( AdjustmentMethod, LogLevel, From f56b85312fbbe52233f737218b6e478de92dfe85 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:35:42 -0600 Subject: [PATCH 19/52] fix: use `.loc[]` to prevent copy-on-write warning Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 3c3d4f43..f1bac265 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -106,7 +106,7 @@ def _merge_logical_table(df: pd.DataFrame): """ # step 1: get all plural ENTREZ_GENE_IDs in the input table, extract unique IDs df.dropna(subset=["entrez_gene_id"], inplace=True) - df["entrez_gene_id"] = df["entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str) + df.loc[:, "entrez_gene_id"] = df.loc[:, "entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str) id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist() # Collect "single" ids, like "123" multiple_entrez_ids: list[str] = df.loc[ @@ -287,6 +287,8 @@ async def _merge_xomics( expression_list.append(expressed_sourcetype) high_confidence_list.append(high_expressed_sourcetype) matrix.rename(columns={"expressed": expressed_sourcetype, "high": high_expressed_sourcetype}, inplace=True) + matrix = cast(pd.DataFrame, matrix[matrix["entrez_gene_id"] != "-"]) + matrix.loc[:, "entrez_gene_id"] = matrix.loc[:, "entrez_gene_id"].astype(int) merge_data = matrix if merge_data.empty else merge_data.merge(matrix, on="entrez_gene_id", how="outer") logger.trace(f"Shape of merged data before merging logical tables: {merge_data.shape}") From 6105b98c16cdda6c4289f168eaabe70973c28584 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:36:01 -0600 Subject: [PATCH 20/52] fix: explicitly check column names Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index f1bac265..91bd83c4 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -367,16 +367,25 @@ async def _update_missing_data(input_matrices: _InputMatrices, taxon_id: int) -> if matrix is not None: # fmt: off existing_data = ( - "gene_symbol" if "gene_symbol" in matrix - else "entrez_gene_id" if "entrez_gene_id" in matrix + "gene_symbol" if "gene_symbol" in matrix.columns + else "entrez_gene_id" if "entrez_gene_id" in matrix.columns else "ensembl_gene_id" ) # fmt: on logger.trace(f"Merging conversion data for {matrix_name}, existing id column is: {existing_data}") + # input_matrices[matrix_name][existing_data] = input_matrices[matrix_name][existing_data].astype(str) + if existing_data == "entrez_gene_id": + 
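+                # Entrez IDs may be read in as strings; the following lines cast both merge keys
+                # (the matrix column and the conversion index) to int so the left_on/right_index join aligns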
input_matrices[matrix_name]["entrez_gene_id"] = input_matrices[matrix_name]["entrez_gene_id"].astype(int) + conversion.index = conversion.index.astype(int) + input_matrices[matrix_name] = ( - input_matrices[matrix_name].merge(conversion, how="left", on=[existing_data]).dropna().reset_index(drop=True) + input_matrices[matrix_name].merge(conversion, how="left", left_on=existing_data, right_index=True).reset_index(drop=True) ) + # input_matrices[matrix_name] = ( + # input_matrices[matrix_name].merge(conversion, how="left", left_index=True, right_index=True).dropna().reset_index(drop=True) + # ) + logger.debug("Updated missing genomic data") return input_matrices From 1219cc40b23f128731c62f3baced4e93dccd362f Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:36:18 -0600 Subject: [PATCH 21/52] refactor: raise error if adjustment method not found Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 91bd83c4..4580b0eb 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -450,6 +450,12 @@ async def _process( adjusted_expression_requirement = expression_requirement - (4 - num_sources) elif adjust_method == AdjustmentMethod.FLAT: adjusted_expression_requirement = expression_requirement + else: + _log_and_raise_error( + message=f"Unknown `adjust_method`: {adjust_method}.", + error=ValueError, + level=LogLevel.ERROR, + ) logger.debug(f"Adjusted expression requirement: {adjusted_expression_requirement}") if adjusted_expression_requirement != expression_requirement: From 9c7175473e870b38f9f896f2d16f9a8027ebfe3d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:37:02 -0600 Subject: [PATCH 22/52] refactor: validate batch in loop Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 4580b0eb..381f66d6 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -513,7 +513,16 @@ def _build_batches( continue metadata: pd.DataFrame # Re-assign type to assist in type hinting - for batch_num, study in enumerate(sorted(metadata["study"].unique()), start=1): + for study in sorted(metadata["study"].unique()): + batch_search = re.search(r"\d+", study) + if not batch_search: + _log_and_raise_error( + message=f"Unable to find batch number in study name. 
Expected a digit in the study value: {study}", + error=ValueError, + level=LogLevel.ERROR, + ) + + batch_num = int(batch_search.group(0)) # ty: ignore[possibly-missing-attribute] study_sample_names = metadata[metadata["study"] == study]["sample_name"].tolist() batch_names[source.value].append(_BatchEntry(batch_num=batch_num, sample_names=study_sample_names)) logger.debug(f"Found {len(study_sample_names)} sample names for study '{study}', batch number {batch_num}") From 4142156e8a11ea0b457ce611bf54e3f8bc561002 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:37:11 -0600 Subject: [PATCH 23/52] refactor: use lowercase names Signed-off-by: Josh Loecker --- main/como/merge_xomics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py index 381f66d6..87b5ba62 100644 --- a/main/como/merge_xomics.py +++ b/main/como/merge_xomics.py @@ -596,10 +596,10 @@ async def merge_xomics( # noqa: C901 # fmt: off source_data = { - SourceTypes.TRNA: (trna_matrix_or_filepath, trna_boolean_matrix_or_filepath, trna_metadata_filepath_or_df, output_trna_activity_filepath), - SourceTypes.MRNA: (mrna_matrix_or_filepath, mrna_boolean_matrix_or_filepath, mrna_metadata_filepath_or_df, output_mrna_activity_filepath), - SourceTypes.SCRNA: (scrna_matrix_or_filepath, scrna_boolean_matrix_or_filepath, scrna_metadata_filepath_or_df, output_scrna_activity_filepath), # noqa: E501 - SourceTypes.PROTEOMICS: (proteomic_matrix_or_filepath, proteomic_boolean_matrix_or_filepath, proteomic_metadata_filepath_or_df, output_proteomic_activity_filepath), # noqa: E501 + SourceTypes.trna: (trna_matrix_or_filepath, trna_boolean_matrix_or_filepath, trna_metadata_filepath_or_df, output_trna_activity_filepath), + SourceTypes.mrna: (mrna_matrix_or_filepath, mrna_boolean_matrix_or_filepath, mrna_metadata_filepath_or_df, output_mrna_activity_filepath), + SourceTypes.scrna: (scrna_matrix_or_filepath, scrna_boolean_matrix_or_filepath, scrna_metadata_filepath_or_df, output_scrna_activity_filepath), # noqa: E501 + SourceTypes.proteomics: (proteomic_matrix_or_filepath, proteomic_boolean_matrix_or_filepath, proteomic_metadata_filepath_or_df, output_proteomic_activity_filepath), # noqa: E501 } # fmt: on for source in source_data: From 89f05aa35d5e3253efa4050e82c4f66be8a6674e Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:37:42 -0600 Subject: [PATCH 24/52] refactor: import required modules Signed-off-by: Josh Loecker --- main/como/proteomics_gen.py | 8 +++++--- main/como/rnaseq_gen.py | 8 ++++---- main/como/rnaseq_preprocess.py | 3 ++- main/como/utils.py | 1 + 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py index 012486a4..e0d9b078 100644 --- a/main/como/proteomics_gen.py +++ b/main/como/proteomics_gen.py @@ -1,15 +1,17 @@ from __future__ import annotations +import itertools import sys -from io import TextIOWrapper from pathlib import Path +from typing import TextIO, cast import numpy as np import pandas as pd -from fast_bioservices.biodbnet import BioDBNet, Input, Output +from bioservices import BioDBNet +from fast_bioservices.biodbnet import Input, Output # BioDBNet from loguru import logger -from como.data_types import LOG_FORMAT, LogLevel +from como.data_types import LogLevel from como.project import Config from como.proteomics_preprocessing import protein_transform_main from como.utils import _log_and_raise_error, _set_up_logging, return_placeholder_data diff --git 
a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 99121f8f..bad4f70e 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -1,5 +1,6 @@ from __future__ import annotations +import itertools import multiprocessing import sys import time @@ -8,9 +9,8 @@ from concurrent.futures import Future, ProcessPoolExecutor, as_completed from dataclasses import dataclass, field from enum import Enum -from io import TextIOWrapper from pathlib import Path -from typing import NamedTuple +from typing import NamedTuple, TextIO, cast import matplotlib.pyplot as plt import numpy as np @@ -22,11 +22,11 @@ from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id from loguru import logger from pandas import DataFrame -from scipy.signal import find_peaks -from sklearn.neighbors import KernelDensity from como.data_types import FilteringTechnique, LogLevel, PeakIdentificationParameters, RNAType +from como.density import density from como.migrations import gene_info_migrations +from como.peak_finder import find_peaks from como.project import Config from como.utils import _log_and_raise_error, _num_columns, _read_file, _set_up_logging diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 142be019..5ef58fb9 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import functools import json import re import sys @@ -14,7 +15,7 @@ import numpy as np import pandas as pd from fast_bioservices.biothings.mygene import MyGene -from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id +from fast_bioservices.pipeline import gene_symbol_to_ensembl_and_gene_id from loguru import logger from como.data_types import PATH_TYPE, LogLevel, RNAType diff --git a/main/como/utils.py b/main/como/utils.py index ececc902..05629032 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -2,6 +2,7 @@ import contextlib import io +import itertools import sys from collections.abc import Iterator from pathlib import Path From 60a883a81a1ede81b768799c47064d0d91c91b94 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:39:17 -0600 Subject: [PATCH 25/52] feat: replicate R's zFPKM module Signed-off-by: Josh Loecker --- main/como/approx.py | 310 +++++++++++++++++++++++++++++++++++++ main/como/density.py | 325 +++++++++++++++++++++++++++++++++++++++ main/como/peak_finder.py | 191 +++++++++++++++++++++++ main/como/rnaseq_gen.py | 116 ++++---------- 4 files changed, 859 insertions(+), 83 deletions(-) create mode 100644 main/como/approx.py create mode 100644 main/como/density.py create mode 100644 main/como/peak_finder.py diff --git a/main/como/approx.py b/main/como/approx.py new file mode 100644 index 00000000..3ef69a74 --- /dev/null +++ b/main/como/approx.py @@ -0,0 +1,310 @@ +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import Any + +import numpy as np + + +def _coerce_to_float_array(a): + """Helper to ensure input is a 1D float array.""" + arr = np.asarray(a, dtype=float) + if arr.ndim != 1: + arr = arr.ravel() + return arr + + +def _regularize_values(x: np.ndarray, y: np.ndarray, ties, na_rm: bool) -> dict[str, Any]: + """Minimal reimplementation of R's regularize.values() used by approx(). + + - Removes NA pairs if na_rm is True (else requires x to have no NA). + - Sorts by x (stable). + - Collapses duplicate x via `ties` aggregator. 
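+    - For example (illustrative values): with ``ties="mean"``, x = [2, 1, 2, 3] and y = [20, 10, 40, 30]
+      are regularized to x = [1, 2, 3] and y = [10, 30, 30].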
+ Returns dict with: + x: sorted unique x + y: aggregated y aligned with x + not_na: boolean mask of y that are not NaN + kept_na: True iff any NaN remained in y after regularization + """ + if na_rm: + # Remove pairs where x or y is NA + keep = ~np.isnan(x) & ~np.isnan(y) + x = x[keep] + y = y[keep] + kept_na = False + else: + # R's C code errors if na_rm=False and any x is NA + if np.isnan(x).any(): + raise ValueError("approx(x,y, .., na.rm=False): NA values in x are not allowed") + kept_na = np.isnan(y).any() + + if x.size == 0: + # THIS IS THE CORRECTED LINE: + return {"x": x, "y": y, "not_na": np.array([], dtype=bool), "kept_na": kept_na} + + # Use a stable sort (mergesort) to match R's order() + order = np.argsort(x, kind="mergesort") + x_sorted = x[order] + y_sorted = y[order] + + # Handle the 'ties' argument + if callable(ties): + agg_fn = ties + else: + ties_str = "mean" if ties is None else str(ties).lower() + if ties_str in ("mean", "avg", "average"): + agg_fn = np.mean + elif ties_str in ("first", "left"): + + def agg_fn(a): + return a[0] + elif ties_str in ("last", "right"): + + def agg_fn(a): + return a[-1] + elif ties_str == "min": + agg_fn = np.min + elif ties_str == "max": + agg_fn = np.max + elif ties_str == "median": + agg_fn = np.median + elif ties_str == "sum": + agg_fn = np.sum + else: + raise ValueError("Unsupported `ties`; use a callable or one of 'mean', 'first', 'last', 'min', 'max', 'median', 'sum'.") + + # Find unique x values and their indices/counts + unique_x, start_idx, counts = np.unique(x_sorted, return_index=True, return_counts=True) + + # Aggregate y values for tied x values + y_agg = np.empty(unique_x.shape, dtype=float) + for k, (start, count) in enumerate(zip(start_idx, counts, strict=True)): + seg = y_sorted[start : start + count] + # Note: aggregators like np.mean will return NaN if `seg` contains NaN, + # matching R's default (mean(..., na.rm = FALSE)). + y_agg[k] = agg_fn(seg) + + not_na = ~np.isnan(y_agg) + # Check if any NaNs remain in the *aggregated* y values + kept_na_final = np.any(~not_na) if np.any(np.isnan(y_agg)) else False + return {"x": unique_x, "y": y_agg, "not_na": not_na, "kept_na": kept_na_final} + + +def approx( + x: Sequence[float], + y: Sequence[float] | None = None, + xout: Sequence[float] | None = None, + method: str | int = "linear", + n: int = 50, + yleft: float | None = None, + yright: float | None = None, + rule: int | Sequence[int] = 1, + f: float = 0.0, + ties: str | Callable[[np.ndarray], float] = "mean", + na_rm: bool = True, +) -> dict[str, np.ndarray]: + """Faithful Python port of R's `approx` function. + + This implementation aims to replicate the behavior of R's `approx` + function, including its handling of `ties`, `rule`, `na_rm`, and + `method`. + + Args: + x: Coordinates, or y-values if `y` is None. + y: y-coordinates. If None, `x` is treated as `y` and `x` becomes `1..n`. + xout: Points at which to interpolate. + method: Interpolation method. "linear" (1) or "constant" (2). + n: If `xout` is not provided, interpolation happens at `n` + equally spaced points spanning the range of `x`. + yleft: Value to use for extrapolation to the left. + Defaults to `NA` (np.nan) if `rule` is 1. + yright: Value to use for extrapolation to the right. + Defaults to `NA` (np.nan) if `rule` is 1. + rule: Extrapolation rule. + - 1: Return `np.nan` for points outside the `x` range. + - 2: Use `yleft` and `yright` for points outside the range. + f: For `method="constant"`, determines the split point. 
+ `f=0` is left-step, `f=1` is right-step, `f=0.5` is midpoint. + ties: How to handle duplicate `x` values. + Can be 'mean', 'first', 'last', 'min', 'max', 'median', 'sum', + or a callable function. + na_rm: If True, `NA` pairs are removed before interpolation. + If False, `NA`s in `x` cause an error, `NA`s in `y` + are propagated. + + Returns: + dict with: + - 'x': numpy array of xout used + - 'y': numpy array of interpolated values + """ + # One-argument form: approx(y) -> x := 1..n, y := y + if y is None: + y_arr = _coerce_to_float_array(x) + x_arr = np.arange(1.0, y_arr.size + 1.0, dtype=float) + else: + x_arr = _coerce_to_float_array(x) + y_arr = _coerce_to_float_array(y) + if x_arr.size != y_arr.size: + raise ValueError("x and y must have same length") + + # --- Method normalization --- + # (matches R's pmatch() result: 1=linear, 2=constant) + if isinstance(method, str): + m = method.strip().lower() + if m.startswith("lin"): + method_code = 1 + elif m.startswith("const"): + method_code = 2 + else: + raise ValueError("invalid interpolation method") + else: + if method in (1, 2): + method_code = int(method) + else: + raise ValueError("invalid interpolation method") + + # --- Rule normalization --- + if isinstance(rule, list | tuple | np.ndarray): + rlist = list(rule) + if not (1 <= len(rlist) <= 2): + raise ValueError("`rule` must have length 1 or 2") + if len(rlist) == 1: + rlist = [rlist[0], rlist[0]] + else: + rlist = [rule, rule] + + # --- Regularize values --- + # Sort by x, collapse ties, and handle NAs like R's regularize.values() + r = _regularize_values(x_arr, y_arr, ties, na_rm) + x_reg = r["x"] + y_reg = r["y"] + not_na_mask = r["not_na"] + # no_na is True if we don't have to worry about NAs in y_reg + no_na = na_rm or (not r["kept_na"]) + # nx is the number of *valid* (non-NA) points for interpolation + nx = x_reg.size if no_na else int(np.count_nonzero(not_na_mask)) + + if np.isnan(nx): + raise ValueError("invalid length(x)") + + # R's C_ApproxTest logic + if nx <= 1: + if method_code == 1: + raise ValueError("need at least two non-NA values to interpolate") + if nx == 0: + raise ValueError("zero non-NA points") + + # --- Set extrapolation values (yleft/yright) --- + # This logic matches R's C code (R_approxtest) + if no_na: + y_first = y_reg[0] + y_last = y_reg[-1] + else: + # Find first and last non-NA y values + y_valid = y_reg[not_na_mask] + y_first = y_valid[0] + y_last = y_valid[-1] + + yleft_val = (np.nan if int(rlist[0]) == 1 else y_first) if yleft is None else float(yleft) + yright_val = (np.nan if int(rlist[1]) == 1 else y_last) if yright is None else float(yright) + + # --- Fabricate xout if missing --- + if xout is None: + if n <= 0: + raise ValueError("'approx' requires n >= 1") + if no_na: + x_first = x_reg[0] + x_last = x_reg[nx - 1] + else: + x_valid = x_reg[not_na_mask] + x_first = x_valid[0] + x_last = x_valid[-1] + xout_arr = np.linspace(x_first, x_last, num=int(n), dtype=float) + else: + xout_arr = _coerce_to_float_array(xout) + + # --- C_ApproxTest checks --- + if method_code == 2 and (not np.isfinite(f) or f < 0.0 or f > 1.0): + raise ValueError("approx(): invalid f value") + if not no_na: + # R re-filters x and y here if NAs remained + x_reg = x_reg[not_na_mask] + y_reg = y_reg[not_na_mask] + + # --- Interpolation --- + # This section vectorized the logic from R's approx1 and R_approxfun + yout = np.empty_like(xout_arr, dtype=float) + mask_nan_xout = np.isnan(xout_arr) + yout[mask_nan_xout] = np.nan + mask_valid = ~mask_nan_xout + + if x_reg.size == 
0: + # No valid points to interpolate against + yout[mask_valid] = np.nan + return {"x": xout_arr, "y": yout} + + xv = xout_arr[mask_valid] + left_mask = xv < x_reg[0] + right_mask = xv > x_reg[-1] + mid_mask = ~(left_mask | right_mask) + + res = np.empty_like(xv) + res[left_mask] = yleft_val + res[right_mask] = yright_val + + if np.any(mid_mask): + xm = xv[mid_mask] + + # Find indices + # j_right[k] = index of first x_reg > xm[k] + j_right = np.searchsorted(x_reg, xm, side="right") + # j_left[k] = index of first x_reg >= xm[k] + j_left = np.searchsorted(x_reg, xm, side="left") + + # Points that exactly match an x_reg value + eq_mask = j_left != j_right + # Points that fall between x_reg values + in_interval_mask = ~eq_mask + + res_mid = np.empty_like(xm) + + if np.any(eq_mask): + # For exact matches, use the corresponding y_reg value + # R's C code uses the 'j_left' index here + res_mid[eq_mask] = y_reg[j_left[eq_mask]] + + if np.any(in_interval_mask): + # R's C code sets i = j-1, where j is the 'right' index + j = j_right[in_interval_mask] + i = j - 1 + + # Get the surrounding x and y values + xi = x_reg[i] + xj = x_reg[j] + yi = y_reg[i] + yj = y_reg[j] + + if method_code == 1: # linear + t = (xm[in_interval_mask] - xi) / (xj - xi) + res_mid[in_interval_mask] = yi + (yj - yi) * t + else: # constant + # This matches R_approxfun's constant logic + if f == 0.0: + res_mid[in_interval_mask] = yi + elif f == 1.0: + res_mid[in_interval_mask] = yj + else: + # R computes (1-f)*yi + f*yj, but carefully + f1 = 1.0 - f + f2 = f + part = np.zeros_like(yi) + if f1 != 0.0: + part = part + yi * f1 + if f2 != 0.0: + part = part + yj * f2 + res_mid[in_interval_mask] = part + + res[mid_mask] = res_mid + + yout[mask_valid] = res + return {"x": xout_arr, "y": yout} diff --git a/main/como/density.py b/main/como/density.py new file mode 100644 index 00000000..0f75623a --- /dev/null +++ b/main/como/density.py @@ -0,0 +1,325 @@ +from __future__ import annotations + +import sys +from collections.abc import Sequence +from typing import Literal, NamedTuple, cast + +import numpy as np +import numpy.typing as npt +from scipy.interpolate import interp1d + +from como.approx import approx + + +class DensityResult(NamedTuple): + x: npt.NDArray[float] + y: npt.NDArray[float] + x_grid: npt.NDArray[float] + y_grid: npt.NDArray[float] + bw: float + n: int + + +NUMBER = int | float | np.number + + +def bin_distance(x: npt.NDArray[float], weights: npt.NDArray[float], lo: NUMBER, up: NUMBER, n: int) -> npt.NDArray[float]: + """Bin weighted distances.""" + ixmin: int = 0 + ixmax: int = n - 2 + delta: float = (up - lo) / (n - 1) + + y: npt.NDArray[float] = np.zeros((2 * n,), dtype=float) + for i in range(x.size): + i: int + xpos: float = (x[i] - lo) / delta + if xpos > sys.maxsize or xpos < -sys.maxsize: # avoid integer overflows (taken from R's massdist.c) + continue + ix: int = np.floor(xpos).astype(int) + fx: float = xpos - ix + wi: float = weights[i] + if ixmin <= ix <= ixmax: + y[ix] += (1 - fx) * wi + y[ix + 1] += fx * wi + elif ix == -1: + y[0] += fx * wi + elif ix == ixmax + 1: + y[ix] += (1 - fx) * wi + return y + + +def dnorm(x: float, mean: NUMBER = 0.0, sd: NUMBER = 1.0, log: bool = False, fast_dnorm: bool = False) -> float: + """Density function for the normal distribution. + + This is a reproduction of R's `density` function. + Neither SciPy nor NumPy are capable of producing KDE values that align with R, and as a result, + a manual translation of R's KDE implementation was necessary. 
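+
+    In effect this helper mirrors R's ``dnorm`` (the normal probability density) evaluated at a single
+    point; for example, ``dnorm(0.0)`` returns ``1 / sqrt(2 * pi)``, roughly 0.3989.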
+ + References: + 1) Documentation: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/Normal + 2) Source code (2025-OCT-30): https://github.com/wch/r-source/blob/3f7e2528990351bc4b0d1f1b237714668ab4c0c5/src/nmath/dnorm.c + + Args: + x: Value at which to evaluate the density. + mean: Mean of the normal distribution. + sd: Standard deviation of the normal distribution. + log: If True, return the log density. + fast_dnorm: If True, use a faster but less accurate calculation for small `x`. + + + """ + # Constants + m_ln2: float = 0.693147180559945309417232121458 # ln(2) + m_1_sqrt_2pi: float = 0.398942280401432677939946059934 # 1/sqrt(2pi) + m_ln_sqrt_2pi: float = 0.918938533204672741780329736406 # log(sqrt(2*pi)) + r_d__0 = float(-np.inf) if log else 0.0 # R's R_D__0: (log_p ? ML_NEGINF : 0.) + + # 11 bits exponent, where one bit is used to indicate special numbers (e.g. NaN and Infinity), + # so the maximum exponent is 10 bits wide (2^10 == 1024). + # dbl_min_exp = -1022 + dbl_min_exp = -1074 + + # The mantissa is 52 bits wide, but because numbers are normalized the initial binary 1 is represented + # implicitly (the so-called "hidden bit"), which leaves us with the ability to represent 53 bits wide mantissa. + dbl_mant_dig = 53 + + mean: float = float(mean) + sd: float = float(sd) + + if np.isnan(x) or np.isnan(mean) or np.isnan(sd): + return x + mean + sd + if sd < 0.0: + return float(np.nan) + if not np.isfinite(sd): + return r_d__0 + if not np.isfinite(x) and x == mean: + return float(np.nan) + if sd == 0.0: + return float(np.inf) if x == mean else r_d__0 + + # From this point on, dividing by `sd` is safe because we know it is not 0 + z = (x - mean) / sd + if (not np.isfinite(z)) and (x == mean): + return float(np.nan) + + if not np.isfinite(z): + return r_d__0 + + a = np.fabs(z) + if a >= 2 * np.sqrt(np.finfo(float).max): + return r_d__0 + if log: + return -(m_ln_sqrt_2pi + 0.5 * a * a + np.log(sd)) + if a < 5 or fast_dnorm: # for `x < 5`, this is more accurate but less fast + return m_1_sqrt_2pi * np.exp(-0.5 * a * a) / sd + + # underflow boundary + boundary = np.sqrt(-2.0 * m_ln2 * (dbl_min_exp + 1 - dbl_mant_dig)) + if a > boundary: + return float(0.0) + + # Now, to get full accuracy, split x into two parts, + # x = x1+x2, such that |x2| <= 2^-16. + # Assuming that we are using IEEE doubles, that means that + # x1*x1 is error free for x<1024 (but we have x < 38.6 anyway). + # If we do not have IEEE this is still an improvement over the naive formula. + a1 = np.ldexp(np.rint(np.ldexp(a, 16)), -16) + a2 = a - a1 + return (m_1_sqrt_2pi / sd) * (np.exp(-0.5 * a1 * a1) * np.exp((-a1 * a2) - (0.5 * a2 * a2))) + + +def nrd0(x: npt.NDArray[float]) -> float: + """Calculate nrd0 from R source. + + This bandwidth calculation matches R's + + References: + 1) Documentation: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/bandwidth + 2) Source code (as of 2025-OCT-30, copied below): https://github.com/wch/r-source/blob/trunk/src/library/stats/R/bandwidths.R + ```R + bw.nrd0 <- function (x) + { + if(length(x) < 2L) stop("need at least 2 data points") + hi <- sd(x) + if(!(lo <- min(hi, IQR(x)/1.34)))# qnorm(.75) - qnorm(.25) = 1.34898 + (lo <- hi) || (lo <- abs(x[1L])) || (lo <- 1.) 
+ 0.9 * lo * length(x)^(-0.2) + } + ``` + """ + if x.size < 2: + raise ValueError("need at least 2 data points") + + hi = np.std(x, ddof=1) + q75, q25 = cast(tuple[float, float], cast(object, np.percentile(x, [75, 25]))) + iqr_over_134 = (q75 - q25) / 1.34 + + # We are using a cascading series of checks on `lo` to make sure it is a non-zero value + lo = min(hi, iqr_over_134) + if lo == 0: + if hi != 0: + lo = hi + elif abs(x[0]) != 0: + lo = abs(x[0]) + else: + lo = 1.0 + + return float(0.9 * lo * x.size ** (-1 / 5)) + + +def density( + x: Sequence[NUMBER] | npt.NDArray[NUMBER], + bw: NUMBER | Literal["nrd0"] = "nrd0", + adjust: NUMBER = 1, + kernel: Literal["gaussian", "epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"] = "gaussian", + weights: Sequence[NUMBER] | None = None, + n: int = 512, + from_: NUMBER | None = None, + to_: NUMBER | None = None, + cut: int = 3, + ext: int = 4, + remove_na: bool = False, + kernel_only: bool = False, +) -> DensityResult: + """Compute kernel density estimates (KDE) using FFT method. + + This is a reproduction of R's `density` function. + Neither SciPy nor NumPy are capable of producing KDE values that align with R, and as a result, + a manual translation of R's KDE implementation was necessary. + + Args: + x: Input data points. + bw: Bandwidth for the kernel. If "nrd0", uses R's nrd0 method. + adjust: Adjustment factor for the bandwidth. + kernel: Kernel type to use. + weights: Optional weights for each data point. + n: Number of points in the output grid. + from_: Start of the grid (calculated automatically if not provided). + to_: End of the grid (calculated automatically if not provided). + cut: Number of bandwidths to extend the grid on each side. + ext: Number of bandwidths to extend the grid for FFT calculation. + remove_na: Whether to remove NA values from `x`. + kernel_only: If True, returns only the integral of the kernel function. + + Returns: + DensityResult: A named tuple containing: + x: The x-coordinates of the density estimate. + y: The estimated density values at the x-coordinates. + x_grid: The grid of x-coordinates used for FFT calculation. + y_grid: The density values on the FFT grid. + bw: The bandwidth used. + n: The number of points in the output grid. 
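+
+    Example:
+        An illustrative sketch (not part of the original module; values are made up)::
+
+            import numpy as np
+            rng = np.random.default_rng(0)
+            sample = rng.normal(size=1_000)
+            result = density(sample)  # bandwidth chosen via nrd0, evaluated on a 512-point grid
+            # result.x and result.y are analogous to R's density(sample)$x and $y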
+ """ + if not isinstance(x, Sequence | np.ndarray) or (len(x) < 2 and bw == "nrd0"): + raise ValueError("Need at at least two points to select a bandwidth automatically using 'nrd0'") + if kernel_only: + if kernel == "gaussian": + return 1 / (2 * np.sqrt(np.pi)) + elif kernel == "epanechnikov": + return 3 / (5 * np.sqrt(5)) + elif kernel == "rectangular": + return np.sqrt(3) / 6 + elif kernel == "triangular": + return np.sqrt(6) / 9 + elif kernel == "biweight": + return 5 * np.sqrt(7) / 49 + elif kernel == "cosine": + return 3 / 4 * np.sqrt(1 / 3 - 2 / np.pi**2) + elif kernel == "optcosine": + return np.sqrt(1 - 8 / np.pi**2) * np.pi**2 / 16 + + x: npt.NDArray[float] = np.asarray(x, dtype=float) + + has_weights = weights is not None + weights: npt.NDArray[float] | None = np.asarray(weights, float) if weights is not None else None + if has_weights and (weights is not None and weights.size != n): + raise ValueError(f"The length of provided weights does not match the length of x: {weights.size} != {n}") + + x_na: npt.NDArray[np.bool_] = np.isnan(x) + if np.any(x_na): + if remove_na: + x: npt.NDArray[float] = x[~x_na] + if has_weights and weights is not None: + true_d = weights.sum().astype(float) == 1 + weights = weights[~x_na] + if true_d: + weights = weights / weights.sum() + else: + raise ValueError("NA values found in 'x'. Set 'remove_na=True' to ignore them.") + + nx = n + x_finite = np.isfinite(x) + if np.any(~x_finite): + x = x[x_finite] + nx = x.size + if not has_weights: + weights: npt.NDArray[float] = np.full(shape=nx, fill_value=1 / nx, dtype=float) + total_mass = nx / n + else: + weights = np.asarray(weights) + if not np.all(np.isfinite(weights)): + raise ValueError("Non-finite values found in 'weights'.") + if np.any(weights < 0): + raise ValueError("Negative values found in 'weights'.") + wsum: float = weights.sum() + if np.any(~x_finite): + weights = weights[x_finite] + total_mass = float(weights.sum() / wsum) + else: + total_mass = float(1) + + n_user: int = n + n = max(n, 512) + if n > 512: # round n up to the next power of 2 (i.e., 2^8=512, 2^9=1024) + n: int = int(2 ** np.ceil(np.log2(n))) + + if isinstance(bw, str) and bw != "nrd0": + raise TypeError("Bandwidth 'bw' must be a number or 'nrd0'.") + elif isinstance(bw, str) and bw == "nrd0": + bw_calc = nrd0(x) + else: + bw_calc = float(bw) + if not np.isfinite(bw_calc): + raise ValueError("Calculated bandwidth 'bw' is not finite.") + bw_calc *= adjust + + if bw_calc <= 0: + raise ValueError("Bandwidth 'bw' must be positive.") + + from_ = float(from_ or min(x) - cut * bw_calc) + to_ = float(to_ or max(x) + cut * bw_calc) + + if not np.isfinite(from_): + raise ValueError("'from_' is not finite.") + if not np.isfinite(to_): + raise ValueError("'to_' is not finite.") + + lo = float(from_ - ext * bw_calc) + up = float(to_ + ext * bw_calc) + + y: npt.NDArray[float] = bin_distance(x, weights, lo, up, n) * total_mass + kords: npt.NDArray[float] = np.linspace(start=0, stop=((2 * n - 1) / (n - 1) * (up - lo)), num=2 * n, dtype=float) + kords[n + 1 : 2 * n] = -kords[n:1:-1] # mirror/negate tail: R's kords[n:2] will index from the reverse if `n`>2 + + # Initial diverge here (inside dnorm calculation) + kords: npt.NDArray[float] = np.asarray([dnorm(i, sd=bw_calc) for i in kords], dtype=float) + + fft_y: npt.NDArray[np.complex128] = np.fft.fft(y) + conj_fft_kords: npt.NDArray[np.complex128] = np.conjugate(np.fft.fft(kords)) + # Must multiply by `kords.size` because R does not produce a normalize inverse FFT, but NumPy normalizes by 
`1/size` + kords: npt.NDArray[np.complex128] = np.fft.ifft(fft_y * conj_fft_kords) * kords.size + kords: npt.NDArray[float] = (np.maximum(0, kords.real)[0:n]) / y.size # for values of kords, get 0 or kords[i], whichever is larger + xords: npt.NDArray[float] = np.linspace(lo, up, num=n, dtype=float) + + # xp=known x-coords, fp=known y-cords, x=unknown x-coords; returns interpolated (e.g., unknown) y-coords + interp_x: npt.NDArray[float] = np.linspace(from_, to_, num=n_user) + interp_y: npt.NDArray[float] = approx(xords, kords, interp_x) + + return DensityResult( + x=interp_x, + y=interp_y["y"], + x_grid=xords, + y_grid=kords, + bw=bw_calc, + n=n, + ) diff --git a/main/como/peak_finder.py b/main/como/peak_finder.py new file mode 100644 index 00000000..230db014 --- /dev/null +++ b/main/como/peak_finder.py @@ -0,0 +1,191 @@ +import re +from collections.abc import Sequence +from typing import Literal, cast, overload + +import numpy as np +import numpy.typing as npt +import pandas as pd + +from como.utils import _num_rows + + +def _validate_args( + x: npt.NDArray[np.number], + nups: int, + ndowns: int, + zero: Literal["0", "+", "-"], + min_peak_height: float, + min_peak_distance: int, + threshold: float, +): + """Validate input arguments to `find_peaks` function. + + Function created to reduce the complexity of `find_peaks` (ruff linting rule C901) + """ + if x.ndim != 1: + raise ValueError(f"Expected a 1D array, got {x.ndim}D array instead.") + if np.any(np.isnan(x)): + raise ValueError("Input x contains NaNs") + if nups < 0: + raise ValueError("Argument 'nups' must be non-negative") + if ndowns < 0: + raise ValueError("Argument 'ndowns' must be non-negative") + if zero not in {"0", "+", "-"}: + raise ValueError("Argument 'zero' must be '0', '+', or '-'") + if min_peak_height < 0: + raise ValueError("Argument 'min_peak_height' must be non-negative") + if min_peak_distance < 0: + raise ValueError("Argument 'minpeakdistance' must be non-negative") + if threshold < 0: + raise ValueError("Argument 'threshold' must be non-negative") + + +def _encode_signs(x: npt.NDArray[np.number], zero: str) -> str: + """Encode the signs of the differences in `x` into a string. + + Function created to reduce the complexity of `find_peaks` (ruff linting rule C901) + """ + diff_signs = np.sign(np.diff(x)) + chars = ["+" if d > 0 else "-" if d < 0 else "0" for d in diff_signs] + x_chars = "".join(chars) + if zero != "0": + x_chars = x_chars.replace("0", zero) + return x_chars + + +@overload +def _enforce_minimum_peak_distance(df: pd.DataFrame, min_peak_distance: int, inplace: Literal[True] = True) -> None: ... + + +@overload +def _enforce_minimum_peak_distance(df: pd.DataFrame, min_peak_distance: int, inplace: Literal[False] = False) -> pd.DataFrame: ... + + +def _enforce_minimum_peak_distance(df: pd.DataFrame, min_peak_distance: int, inplace: bool = True) -> None | pd.DataFrame: + """Enforces a minimum peak distance between identified peaks. 
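+
+    As used by ``find_peaks``, the dataframe is already sorted by peak height in descending order,
+    so suppression is greedy: each retained peak removes any smaller peak whose index lies within
+    ``min_peak_distance`` of its own.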
+ + Function created to reduce the complexity of `find_peaks` (ruff linting rule C901) + + Modifies `df` inplace + """ + good_peaks = np.ones(_num_rows(df), dtype=bool) + for i in range(_num_rows(df)): + if not good_peaks[i]: + continue + dpos = np.abs(df.at[i, "peak_idx"] - df["peak_idx"].values) + close = (dpos > 0) & (dpos < min_peak_distance) + good_peaks[close] = False + + if inplace: + df.drop(df.index[~good_peaks], inplace=True) + df.reset_index(drop=True, inplace=True) + return None + return cast(pd.DataFrame, df.iloc[good_peaks, :].reset_index(drop=True)) + + +def find_peaks( + x: Sequence[float] | Sequence[int] | npt.NDArray[np.number], + nups: int = 1, + ndowns: int | None = None, + zero: Literal["0", "+", "-"] = "0", + peak_pattern: str | None = None, + min_peak_height: float = 0.02, # default for R's zFPKM `peak_parameters` + min_peak_distance: int = 1, # default for R's zFPKM `peak_parameters` + threshold: float = 0.0, + npeaks: int = 0, + sortstr: bool = False, +) -> pd.DataFrame: + """Identify peaks in a given time series. + + This function is modelled after R's `pracma::findpeaks` function. + SciPy's `scipy.signal.find_peaks` provides different results than `pracma::findpeaks`, + resulting in the requirement for this translation. + + References: + 1) pracma::findpeaks: https://rdrr.io/cran/pracma/man/findpeaks.html + + Args: + x: numerical vector taken as a time series (no NA values allowed) + nups: minimum number of increasing steps before a peak is reached + ndowns: minimum number of decreasing steps after the peak (defaults to the same value as `nups`) + zero: can be '+', '-', or '0'; how to interprete succeeding steps of the same value: increasing, decreasing, or special + peak_pattern: define a peak as a regular pattern, such as the default pattern `[+]{1,}[-]{1,}` + If a pattern is provided, parameters `nups` and `ndowns` are not taken into account + min_peak_height: the minimum (absolute) height a peak has to have before being recognized + min_peak_distance: the minimum distance (in indices) between peaks before they are counted + threshold: the minimum difference in height between a peak and its surrounding values + npeaks: the number of peaks to return (<=0 returns all) + sortstr: should the peaks be returned in decreasing order of their peak height? 
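+
+    Example:
+        An illustrative call (input values invented for demonstration)::
+
+            find_peaks([0, 1, 3, 1, 0, 2, 5, 2, 0])
+            # two peaks are reported: height 3.0 at peak_idx 2 and height 5.0 at peak_idx 6,
+            # each spanning its rise and fall region via start_idx and end_idx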
+ + Returns: + A dataframe with the columns: + height: the height of the peak + peak_idx: the index (from `x`) of the identified peak + start_idx: the starting index (from `x`) of the identified peak + end_idx: the ending index (from `x`) of the identified peak + If the dataframe is empty, no peaks could be identified + """ + x = np.asarray(x, dtype=float) if not isinstance(x, np.ndarray) else x + npeaks: int = max(npeaks, 0) + ndowns: int = ndowns or nups + peak_pattern: str = peak_pattern or rf"[+]{{{nups},}}[-]{{{ndowns},}}" + _validate_args( + x=x, + nups=nups, + ndowns=ndowns, + zero=zero, + min_peak_height=min_peak_height, + min_peak_distance=min_peak_distance, + threshold=threshold, + ) + + # find peaks by regex matching the sign pattern + derivative_chars = _encode_signs(x=x, zero=zero) + matches: list[re.Match[str]] = list(re.finditer(peak_pattern, derivative_chars)) + if not matches: + return pd.DataFrame(columns=["height", "peak_idx", "start_idx", "end_idx"]) + + pattern_start_index: npt.NDArray[int] = np.asarray([m.start() for m in matches], dtype=int) + pattern_end_index: npt.NDArray[int] = np.asarray([m.end() for m in matches], dtype=int) + + num_matches: int = len(pattern_start_index) + peak_index: npt.NDArray[int] = np.zeros(num_matches, dtype=int) + peak_height: npt.NDArray[float] = np.zeros(num_matches, dtype=float) + + # for each match region, find the local max + for i in range(num_matches): + segment: npt.NDArray[np.uint64] = x[pattern_start_index[i] : pattern_end_index[i] + 1] + segment_max_idx: np.uint64 = np.argmax(segment) + peak_index[i] = pattern_start_index[i] + segment_max_idx + peak_height[i] = segment[segment_max_idx] + + # filter for values that are too low or below the threshold difference + x_left: float = x[pattern_start_index] + x_right: float = x[np.minimum(pattern_end_index, x.size - 1)] + valid_peaks: list[int] = list(np.where((peak_height >= min_peak_height) & ((peak_height - np.maximum(x_left, x_right)) >= threshold))[0].astype(int)) # noqa: E501 # fmt: skip + + if len(valid_peaks) == 0: + return pd.DataFrame(columns=["height", "peak_idx", "start_idx", "end_idx"]) + + out_df: pd.DataFrame = pd.DataFrame( + data={ + "height": peak_height[valid_peaks], + "peak_idx": peak_index[valid_peaks], + "start_idx": pattern_start_index[valid_peaks], + "end_idx": pattern_end_index[valid_peaks], + } + ) + + # sort by peak height in descending order (largest to smallest) + if sortstr or min_peak_distance > 1: + out_df.sort_values(by=["height"], ascending=False, ignore_index=True, inplace=True) + + # if provided, enforce a minimum distance between the peaks (removes smaller peaks next to larger ones) + if min_peak_distance > 1 and len(out_df) > 1: + _enforce_minimum_peak_distance(df=out_df, min_peak_distance=min_peak_distance) + + # if provided, limit the number of peaks returned + if 0 < npeaks < _num_rows(out_df): + out_df = cast(pd.DataFrame, out_df.iloc[:npeaks, :].reset_index(drop=True)) + + return out_df diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index bad4f70e..308cd9d9 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -99,7 +99,7 @@ class _ZFPKMResult(NamedTuple): density: Density mu: float std_dev: float - max_fpkm: float + fpkm_at_mu: float class _ReadMatrixResults(NamedTuple): @@ -257,7 +257,7 @@ def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: return metrics -def _calculate_fpkm(metrics: NamedMetrics, scale: int = 1e6) -> NamedMetrics: +def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> 
NamedMetrics: """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary. Args: @@ -318,92 +318,42 @@ def _calculate_fpkm(metrics: NamedMetrics, scale: int = 1e6) -> NamedMetrics: return metrics -def _zfpkm_calculation( - column: pd.Series, - peak_parameters: PeakIdentificationParameters, - bandwidth: float = 1.0, - epsilon: float = 1e-10, -) -> _ZFPKMResult: - """Log2 Transformations. - - Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting - - Kernel Density Estimation (kde) - - SciKit Learn: https://scikit-learn.org/stable/modules/density.html - - Non-parametric method to estimate the probability density function (PDF) of a random variable - - Estimates the distribution of log2-transformed FPKM values - - Bandwidth parameter controls the smoothness of the density estimate - - KDE Explanation - - A way to smooth a histogram to get a better idea of the underlying distribution of the data - - Given a set of data points, we want to understand how they are distributed. - Histograms can be useful, but are sensitive to bin size and number - - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point - - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel, - and less weight to points farther away - - Kernel functions are summed along each point on the x-axis - - A smooth curve is created that represents the estimated density of the data - - Peak Finding - - Identifies that are above a certain height and separated by a minimum distance - - Represent potential local maxima in the distribution - - Peak Selection - - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu) - of the "inactive" gene distribution - - The peak representing unexpressed or inactive genes should be at a lower FPKM - value compared to the peak representing expressed genes - - Standard Deviation Estimation - - The mean of log2-FPKM values are greater than the calculated mu - - Standard deviation is estimated based on the assumption that the right tail of the distribution - This represents expressed genes) can be approximated by a half-normal distribution - - zFPKM Transformation - - Centers disbribution around 0 and scales it by the standard deviation. - This makes it easier to compare gene expression across different samples - - Represents the number of standard deviations away from the mean of the "inactive" gene distribution - - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak - - A zFPKM value of 0 represents the mean of the "inactive" distribution - - Research shows that a zFPKM value of -3 or greater can be used as - a threshold for calling a gene as "active" and/or "expressed" - : https://doi.org/10.1186/1471-2164-14-778 +def _zfpkm_calculation(col_fpkm: pd.Series, min_peak_height: float, min_peak_distance: int): + """ZFPKM Transformations. + + This function reproduces R's `zFPKM::zFPKM` function. + + References: + 1) zFPKM implementation in R: https://github.com/ronammar/zFPKM + 2) zFPKM publication: https://doi.org/10.1186/1471-2164-14-778 Args: - column: A pandas Series representing a single sample's FPKM values. - peak_parameters: Parameters for peak identification. - bandwidth: The bandwidth for the Kernel Density Estimation (KDE). 
- epsilon: A small value to add to FPKM values to prevent log2-divide-by-0 errors + col_fpkm: The raw FPKM values to perform zFPKM on + min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function + min_peak_height: Minimum height of peaks; passed on to `find_peaks` function Returns: - A named tuple containing the zFPKM values, density estimate, mean (mu), standard deviation, and maximum FPKM value. - + A named tuple containing the zFPKM values, density estimate, mean (mu), standard deviation, and maximum FPKM value. """ - log2values: npt.NDArray[float] = np.log2(column.values + epsilon) - - # 1D KDE *always* has one feature and many samples. scikit-learn expects data in the format of (n_samples, n_features) - # Thus, we use `.reshape(-1, 1)` because we know there is a single feature - # Even though the data is FPKM of many genes for a single sample, it is still one feature over many samples - # `-1` indicates the unknown dimension (the number of samples) - # `1` indicates the known dimension (the number of genes [also known as features]) - # https://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html#sphx-glr-auto-examples-neighbors-plot-kde-1d-py - kde: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(log2values.reshape(-1, 1)) - - x_range: npt.NDArray[float] = np.linspace(log2values.min(), log2values.max(), 2000).reshape(-1, 1) - density: npt.NDArray[float] = np.exp(kde.score_samples(x_range)) - peaks, _ = find_peaks(density, height=peak_parameters.height, distance=peak_parameters.distance) - peak_positions = x_range[peaks] - - mu = 0 - max_fpkm = 0 - stddev = 1 - if len(peaks) != 0: - mu = peak_positions.max() - max_fpkm = density[peaks[np.argmax(peak_positions)]] - u = log2values[log2values > mu].mean() - stddev = (u - mu) * np.sqrt(np.pi / 2) - zfpkm = pd.Series((log2values - mu) / stddev, dtype=float, name=column.name) - - return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) + # Ignore np.log2(0) errors; we know this will happen, and are removing non-finite values in the density calculation + # This is required in order to match R's zFPKM calculations, as R's `density` function removes NA values. 
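(Illustrative aside, not part of the patch: the body below follows the referenced zFPKM publication. `mu` is the log2-FPKM position of the right-most identified density peak, `fpkm_at_mu` is the density value at that peak, the standard deviation is estimated from the right tail under a half-normal assumption, and zFPKM is the centred and scaled log2-FPKM. A toy numeric sketch with invented values:)

    import numpy as np

    log2fpkm = np.array([-6.0, -4.5, -4.0, 1.5, 3.0, 4.2])
    mu = -4.5                                  # right-most density peak (assumed for this sketch)
    u = log2fpkm[log2fpkm > mu].mean()         # mean of the values to the right of the peak
    sd = (u - mu) * np.sqrt(np.pi / 2)         # half-normal estimate of the spread
    zfpkm = (log2fpkm - mu) / sd               # standard deviations above the "inactive" peak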
+ with np.errstate(divide="ignore", invalid="ignore"): + log2fpkm: npt.NDArray[float] = np.log2(col_fpkm.values).astype(float) + d = density(log2fpkm) + + peaks: pd.DataFrame = find_peaks(d.y_grid, min_peak_height=min_peak_height, min_peak_distance=min_peak_distance) + peak_positions = d.x_grid[peaks["peak_idx"].astype(int).tolist()] + + sd = 1.0 + mu = 0.0 + fpkm_at_mu = 0.0 + if peak_positions.size > 0: + mu = float(peak_positions.max()) + u = float(log2fpkm[log2fpkm > mu].mean()) + fpkm_at_mu = float(d.y_grid[int(peaks.loc[np.argmax(peak_positions), "peak_idx"])]) + sd = float((u - mu) * np.sqrt(np.pi / 2)) + zfpkm = pd.Series((log2fpkm - mu) / sd, dtype=float, name=col_fpkm.name, index=col_fpkm.index) + return _ZFPKMResult(zfpkm=zfpkm, density=Density(d.x_grid, d.y_grid), mu=mu, std_dev=sd, fpkm_at_mu=fpkm_at_mu) def zfpkm_transform( From 95dc2db16a69feaeeee43bdc0580eee5a1e0c771 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:40:24 -0600 Subject: [PATCH 26/52] refactor: do not drop na values, keep as much data for as long as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 308cd9d9..dec2e78d 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -166,18 +166,18 @@ async def _build_matrix_results( Returns: A dataclass `ReadMatrixResults` """ - matrix.dropna(subset="ensembl_gene_id", inplace=True) conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon) - # If any one column was - if any(conversion[col].eq("-").all() for col in conversion.columns): + # If all columns are empty, it is indicative that the incorrect taxon id was provided + if all(conversion[col].eq("-").all() for col in conversion.columns): logger.critical(f"Conversion of Ensembl Gene IDs to Entrez IDs and Gene Symbols was empty - is '{taxon}' the correct taxon ID for this data?") - conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") - conversion = conversion.explode("ensembl_gene_id") - conversion.reset_index(inplace=True, drop=True) - conversion = conversion[conversion["entrez_gene_id"] != "-"] # drop missing entrez IDs - conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int) # float32 is needed because np.nan is a float + # 2025-NOV-3: commented out `conversion` types to evaluate if it can be skipped + # conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",") + # conversion = conversion.explode("ensembl_gene_id") + # conversion.reset_index(inplace=True, drop=True) + # conversion = conversion[conversion["entrez_gene_id"] != "-"] # drop missing entrez IDs + # conversion["entrez_gene_id"] = conversion["entrez_gene_id"].astype(int) # float32 is needed because np.nan is a float # merge_on should contain at least one of "ensembl_gene_id", "entrez_gene_id", or "gene_symbol" merge_on: list[str] = list(set(matrix.columns).intersection(conversion.columns)) @@ -195,11 +195,11 @@ async def _build_matrix_results( matrix = matrix.merge(conversion, on=merge_on, how="left") # drop rows that have `0` in `entrez_gene_id` column - matrix = matrix[matrix["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) - gene_info = gene_info[gene_info["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) + # matrix = matrix[matrix["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) + # gene_info = 
gene_info[gene_info["entrez_gene_id"] != 0].reset_index(drop=True, inplace=False) gene_info = gene_info_migrations(gene_info) - gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) + # gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int) counts_matrix = matrix.merge( gene_info[["entrez_gene_id", "ensembl_gene_id"]], From bb1449a69e98addeef68003d06f78198c8b1a747 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:41:15 -0600 Subject: [PATCH 27/52] refactor: explicit type cast Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index dec2e78d..2c3e07d2 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -219,7 +219,7 @@ async def _build_matrix_results( study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist() layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist() metrics[study] = _StudyMetrics( - count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], + count_matrix=cast(pd.DataFrame, counts_matrix[counts_matrix.columns.intersection(study_sample_names)]), fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values.astype(float), sample_names=study_sample_names, layout=[LayoutMethod(layout) for layout in layouts], @@ -393,6 +393,7 @@ def zfpkm_transform( zfpkm_series: list[pd.Series] = [] results: dict[str, _ZFPKMResult] = {} + slim_fpkm_df: pd.DataFrame = cast(pd.DataFrame, fpkm_df[fpkm_df.index != "-"] if remove_na else fpkm_df) with ProcessPoolExecutor(max_workers=cores) as pool: futures: list[Future[_ZFPKMResult]] = [ pool.submit( @@ -523,7 +524,7 @@ def cpm_filter( for sample, metric in metrics.items(): counts: pd.DataFrame = metric.count_matrix entrez_ids: list[str] = metric.entrez_gene_ids - library_size: pd.DataFrame = counts.sum(axis=1) + library_size: pd.DataFrame = cast(pd.DataFrame, counts.sum(axis=1)) # For library_sizes equal to 0, add 1 to prevent divide by 0 # This will not impact the final counts per million calculation because the original counts are still 0 @@ -595,8 +596,8 @@ def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringO # Only keep `entrez_gene_ids` that pass `min_genes` metric.entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, min_genes, strict=True) if keep] metric.gene_sizes = np.array(gene for gene, keep in zip(gene_size, min_genes, strict=True) if keep) - metric.count_matrix = metric.count_matrix.iloc[min_genes, :] - metric.normalization_matrix = metrics[sample].normalization_matrix.iloc[min_genes, :] + metric.count_matrix = cast(pd.DataFrame, metric.count_matrix.iloc[min_genes, :]) + metric.normalization_matrix = cast(pd.DataFrame, metrics[sample].normalization_matrix.iloc[min_genes, :]) keep_top_genes = [gene for gene, keep in zip(entrez_ids, top_genes, strict=True) if keep] metric.high_confidence_entrez_gene_ids = [gene for gene, keep in zip(entrez_ids, keep_top_genes, strict=True) if keep] From 54802460f7b10798d73b89865747dc8ff6f3c418 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:42:02 -0600 Subject: [PATCH 28/52] refactor!: provide default min peak height and distance BREAKING-CHANGE: instead of providing a type `PeakIdentificationParameters`, simply provide the min peak height and distance directly Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 17 +++++++++-------- 1 file changed, 9 
insertions(+), 8 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 2c3e07d2..e3d43ab8 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -358,16 +358,16 @@ def _zfpkm_calculation(col_fpkm: pd.Series, min_peak_height: float, min_peak_dis def zfpkm_transform( fpkm_df: pd.DataFrame, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, + min_peak_height: float = 0.02, + min_peak_distance: int = 1, update_every_percent: float = 0.1, ) -> tuple[dict[str, _ZFPKMResult], DataFrame]: """Perform zFPKM calculation/transformation. Args: fpkm_df: A DataFrame containing FPKM values with genes as rows and samples as columns. - peak_parameters: Parameters for peak identification in zFPKM calculation. - bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + min_peak_height: Minimum height of peaks; passed on to `find_peaks` function. + min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function. update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). Returns: @@ -398,12 +398,13 @@ def zfpkm_transform( futures: list[Future[_ZFPKMResult]] = [ pool.submit( _zfpkm_calculation, - column=fpkm_df[column], - peak_parameters=peak_parameters, - bandwidth=bandwidth, + col_fpkm=fpkm_df[column], + min_peak_height=min_peak_height, + min_peak_distance=min_peak_distance, ) - for column in fpkm_df + for column in slim_fpkm_df ] + for i, future in enumerate(as_completed(futures)): result = future.result() key = str(result.zfpkm.name) From 038a5feeca61aba05f5812725e6d4fc0128bfeca Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:42:32 -0600 Subject: [PATCH 29/52] feat: remove NA values by default (replicates R functionality) Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index e3d43ab8..87c82a6a 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -361,6 +361,7 @@ def zfpkm_transform( min_peak_height: float = 0.02, min_peak_distance: int = 1, update_every_percent: float = 0.1, + remove_na: bool = True, ) -> tuple[dict[str, _ZFPKMResult], DataFrame]: """Perform zFPKM calculation/transformation. @@ -369,6 +370,7 @@ def zfpkm_transform( min_peak_height: Minimum height of peaks; passed on to `find_peaks` function. min_peak_distance: Minimum distance between peaks; passed on to `find_peaks` function. update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). + remove_na: Whether to remove NaN & blank values from the input DataFrame before processing. Returns: A tuple containing: From 31cb4d4f590a035f8bcd0c6cb71e0e18df4952fd Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:42:47 -0600 Subject: [PATCH 30/52] fix: do not build a list of `None` Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 87c82a6a..26803c4e 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -440,7 +440,7 @@ def zfpkm_plot(results: dict[str, _ZFPKMResult], *, output_png_dirpath: Path, pl subplot_titles: Whether to display facet titles (sample names). 
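(Illustrative aside, not part of the patch: a sketch of calling `zfpkm_transform` with the keyword arguments introduced by this patch; the tiny input frame is invented and only shows the shape of the call.)

    import pandas as pd
    from como.rnaseq_gen import zfpkm_transform

    fpkm_df = pd.DataFrame({"sample_a": [0.0, 1.2, 8.5, 30.1], "sample_b": [0.1, 0.9, 7.7, 25.4]})
    results, zfpkm_df = zfpkm_transform(fpkm_df, min_peak_height=0.02, min_peak_distance=1)
    # `results` maps each sample name to its _ZFPKMResult; `zfpkm_df` holds the per-sample zFPKM columns.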
""" - to_concat: list[pd.DataFrame] = [None] * len(results) # type: ignore # ignoring because None is not of type pd.DataFrame + to_concat: list[pd.DataFrame] = [] for name, result in results.items(): stddev: float = float(result.std_dev) x: npt.NDArray[float] = result.density.x.flatten() From 2f1727908158dec6a0538eae5d6efabbbdb25f5b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:43:14 -0600 Subject: [PATCH 31/52] fix: plot gaussian distribution with a peak of the fpkm value at `mu`, not at the maximum fpkm distribution Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 26803c4e..250e429f 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -447,13 +447,13 @@ def zfpkm_plot(results: dict[str, _ZFPKMResult], *, output_png_dirpath: Path, pl y: npt.NDArray[float] = result.density.y.flatten() fitted: npt.NDArray[float] = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) - max_fpkm: float = float(y.max()) + fpkm_at_mu: float = result.fpkm_at_mu max_fitted: float = float(fitted.max()) - scale_fitted: float = fitted * max_fpkm / max_fitted - to_concat.append(pd.DataFrame({"sample_name": name, "log2fpkm": x, "fpkm_density": y, "fitted_density_scaled": scale_fitted})) + scale_fitted: float = fitted * fpkm_at_mu / max_fitted + to_concat.append(pd.DataFrame({"sample_name": name, "log2fpkm": x, "fpkm_density": y, "zfpkm_density": scale_fitted})) mega_df = pd.concat(to_concat, ignore_index=True) - mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) + mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "zfpkm_density"]) mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") fig: plt.Figure From 2d6b20f396ccb49bdf7600503d6dec078d069f59 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:44:10 -0600 Subject: [PATCH 32/52] refactor!: provide default min peak height and distance Signed-off-by: Josh Loecker --- main/como/proteomics_preprocessing.py | 18 +++---- main/como/rnaseq_gen.py | 46 ++++++++++-------- .../proteomics_data_inputs_paper.xlsx | Bin 6136 -> 10147 bytes 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/main/como/proteomics_preprocessing.py b/main/como/proteomics_preprocessing.py index 39d4c85c..44c21f6d 100644 --- a/main/como/proteomics_preprocessing.py +++ b/main/como/proteomics_preprocessing.py @@ -21,11 +21,11 @@ class ZResult: """Dataclass to store results of Z-score calculation.""" zfpkm: pd.DataFrame - x_range: pd.DataFrame # npt.NDArray[np.float64] - density: pd.DataFrame # npt.NDArray[np.float64] - mu: npt.NDArray[np.float64] - std_dev: npt.NDArray[np.float64] - max_fpkm_peak: npt.NDArray[np.float64] + x_range: pd.DataFrame # npt.NDArray[float] + density: pd.DataFrame # npt.NDArray[float] + mu: npt.NDArray[float] + std_dev: npt.NDArray[float] + max_fpkm_peak: npt.NDArray[float] def z_score_calc(abundance: pd.DataFrame, min_thresh: int) -> ZResult: @@ -42,11 +42,11 @@ def z_score_calc(abundance: pd.DataFrame, min_thresh: int) -> ZResult: log_abundance_filt = np.log2(values[values > min_thresh]).reshape((len(abundance), len(abundance.columns))) log_abundance = np.log2(values) - # np.zeros((1000, len(abundance.columns)), dtype=np.float64), + # np.zeros((1000, len(abundance.columns)), dtype=float), z_result = ZResult( - 
zfpkm=pd.DataFrame(data=np.nan * np.ones_like(values), index=abundance.index, columns=abundance.columns, dtype=np.float64), - x_range=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=np.float64), - density=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=np.float64), + zfpkm=pd.DataFrame(data=np.nan * np.ones_like(values), index=abundance.index, columns=abundance.columns, dtype=float), + x_range=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=float), + density=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=float), mu=np.zeros(len(abundance.columns)), std_dev=np.zeros(len(abundance.columns)), max_fpkm_peak=np.zeros(len(abundance.columns)), diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index 250e429f..b760041c 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -616,9 +616,9 @@ def zfpkm_filter( filtering_options: _FilteringOptions, calculate_fpkm: bool, force_zfpkm_plot: bool, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, - output_png_filepath: Path | None, + min_peak_height: float, + min_peak_distance: int, + output_png_dirpath: Path | None, ) -> NamedMetrics: """Apply zFPKM filtering to the FPKM matrix for a given sample. @@ -627,9 +627,9 @@ def zfpkm_filter( filtering_options: Options for filtering the count matrix. calculate_fpkm: Whether to calculate FPKM from counts. force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - peak_parameters: Parameters for peak identification in zFPKM calculation. - bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. - output_png_filepath: Optional filepath to save the zFPKM plot. + min_peak_height: Minimum peak height for zFPKM peak identification. + min_peak_distance: Minimum peak distance for zFPKM peak identification. + output_png_dirpath: Optional directory path to save zFPKM plots. Returns: A dictionary of filtered study metrics. @@ -643,9 +643,15 @@ def zfpkm_filter( metric: _StudyMetrics # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix - matrix = matrix[matrix.sum(axis=1) > 0] # remove rows (genes) that have no counts across all samples - results, zfpkm_df = zfpkm_transform(matrix, peak_parameters=peak_parameters, bandwidth=bandwidth) + # TODO: 2025-OCT-31: Re-evaluate whether to remove rows with all 0 counts + # matrix = matrix[matrix.sum(axis=1) > 0] # remove rows (genes) that have no counts across all samples + + results, zfpkm_df = zfpkm_transform( + fpkm_df=matrix, + min_peak_height=min_peak_height, + min_peak_distance=min_peak_distance, + ) zfpkm_df[(matrix == 0) | (zfpkm_df.isna())] = -4 if len(results) > 10 and not force_zfpkm_plot: @@ -683,8 +689,8 @@ def filter_counts( filtering_options: _FilteringOptions, prep: RNAType, force_zfpkm_plot: bool, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, + zfpkm_min_peak_height: float, + zfpkm_min_peak_distance: int, output_zfpkm_plot_dirpath: Path | None = None, ) -> NamedMetrics: """Filter the count matrix based on the specified technique. @@ -696,8 +702,8 @@ def filter_counts( filtering_options: Options for filtering the count matrix. prep: The RNA preparation type. 
force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. - peak_parameters: Parameters for peak identification in zFPKM calculation. - bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + zfpkm_min_peak_height: Minimum peak height for zFPKM peak identification. + zfpkm_min_peak_distance: Minimum peak distance for zFPKM peak identification. output_zfpkm_plot_dirpath: Optional filepath to save the zFPKM plot. Returns: @@ -714,8 +720,8 @@ def filter_counts( filtering_options=filtering_options, calculate_fpkm=True, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=peak_parameters, - bandwidth=bandwidth, + min_peak_height=zfpkm_min_peak_height, + min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, ) case FilteringTechnique.UMI: @@ -725,8 +731,8 @@ def filter_counts( filtering_options=filtering_options, calculate_fpkm=False, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=peak_parameters, - bandwidth=bandwidth, + min_peak_height=zfpkm_min_peak_height, + min_peak_distance=zfpkm_min_peak_distance, output_png_dirpath=output_zfpkm_plot_dirpath, ) case _: @@ -751,8 +757,8 @@ async def _process( technique: FilteringTechnique, cut_off: int | float, force_zfpkm_plot: bool, - peak_parameters: PeakIdentificationParameters, - bandwidth: float, + zfpkm_min_peak_height: float, + zfpkm_min_peak_distance: int, output_boolean_activity_filepath: Path, output_zscore_normalization_filepath: Path, output_zfpkm_plot_dirpath: Path | None, @@ -792,8 +798,8 @@ async def _process( filtering_options=filtering_options, prep=prep, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=peak_parameters, - bandwidth=bandwidth, + zfpkm_min_peak_height=zfpkm_min_peak_height, + zfpkm_min_peak_distance=zfpkm_min_peak_distance, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, ) diff --git a/main/data/config_sheets/proteomics_data_inputs_paper.xlsx b/main/data/config_sheets/proteomics_data_inputs_paper.xlsx index ceedca03f451c0c4d4a10c92c9c5f7699d69f2c9..79c1f230d7c5772b9ba31e53cc6ca196ec00277e 100644 GIT binary patch literal 10147 zcmeHN^P`?ujhEq z_b<5j{bBFfv*%f#J@5NGYpuQ3t0)5thXX(WAOQdXG5{9-@{$P*0Dun<0AK@<9_fgJ zZJj{2PWq~Db|6PxW>*_)(hqQtXtMy1puhj$_%EJ;p15(_E>^U@!`So3YUL_KtE_U^ z0p0h+Jh_#K1B9X!QwpM_|U!C@zO{kk?ZjC}Fq;%e$O>^@VnQ zmN!vE4V4R#?;FGxwE5I0CruljjZm;lmWc_ZV0GU@6=I-dg0ax?xdewE&Wc`KmR>{t z!C}zlF}ztmeyLmsQ6I2sI(zAA!65NsC@(}plv^oUPqiE0!Dh1{Dh%Tguh%g8v9wTX zGD^+I1ayt84xordDNJFdxOSKXLH)MiYNoGJexA4C`*TmJOR>mKA$#Bo|NHN5UUV@d z(S8&*jH`Lx-dF-#uS<}wBgkF8^}={rlB}=Z@P`PJx;XexmsEgfxcM;{;iAzzd8}sh z9^P!w*}a*|PUj$W3aXcNv^j&y@8JO!p!gSY)~c~moIx2U3)LPfRGj(_AZte!<{!uZ ziSoZ#iGTX*r7>~}-K=PVhmyBJ0~a$(?}0C6Ttp86Z1f=PV7w!r`Z=b14f%n|81DQEbekJWj z`=&LDzNDcrTV`;DLSp(@tP(iRtU-v3oktw_L?GEmyH8GQ+32zyX6B{Jepz5eJx|t7 z?0Bm8Y(oBbbfI8A*}bV`+yO@;v$=B50c$eI6``7nIiE#^QHH~FayNYw>yA^g)OPGU zPbT@a0cCPdtV{OsSN&935O3`o&g0>9w;mqk?vjC{LEkWv!ewZj`nyPOD+h=1U;zLa zL;wH-Iy0`;EG}RNDPPg-Ra;S5UYeH<^|xeD_)`g3R4g=0*pm;I$aOEHJ9l+~cy5z*TzHOB@s(7H=i=!NDn2R^z`xP*vyggO?|GXErkhb`u_DD{^o znPpCxCB=uOXUQ@6Ql?85%*5j=o@n*%-TDHo&Rk^ya1425PmS1@hp(ox(mrZ3EukYd z`YG7sNVKw^@r&SNe{WD4IZjr8?9_0&o0DQyVHSW8ki&3a@FY;w(5J0K179nlBCpk? 
zT(~xxj#`M3SHa6lN7BLK44lCyV~V$p7f~oVMof{c&QRHAQ#=-fmjVicKOrdwC1b$* zXx4ZxK-%qL=BdBOxqV0Th0NU&nQW&liIxjvUX{mx@dQSvhz1rTeO++lw!>+xE{OR7$DKh!&@+Z-;N*gMzBp*~$gCWs~Y6rTCQRE5lQw@r)ZS!YWLT9M&f#ZzTQ zba1NisZU3i6$~>;ztRh$J|&=>IZM`=6HwOUPNafYmof0nggY~xF(Zqb@Dtp7GbZF+ zG|#&7%bjZxzD(N(1p13dE@V~M7ZvzIJTh5TJZ(na}(*U&q!pgJN1Ai+R2^hd1u zvy%QDcVM8&AXJI}-J>OD)Ut;a&Hpf<&2O^RKIUF==(z`7Kx^FsB5Ng9kesz~k&xR> zO56c=MQ-9m4khO)+lc86-`V<83g=}y+A3nWruo#dS`@tt1&s^R;98ERi^sU(k`-ml zEZpN?tmG9;nuW=Wc$E#3yZz!+b%;4Zf>|<&ef=VigvJbhmDHfsW zN){&|#r+tfohcdb@L7$-p@(bnc-3u!H(f z%xPR_o3nwk<`U|C@u8IXm-lrv2Z5X%S$>|_ez@VZ*oDXiQefaA1LP_2f=!T8eKxdu z=fHBNwW!z@V-_yT7>AYly-G=&2cj}fv;%e-$QW(pV!Y$<>Q0ZL3Vt{DyHyGm9n}Qt z^njPz^x;Yd4GFD%#s#`Yeo)G@j>((e8<*4cN~>8IS6~0-nVM6tLVU*$u%o(02;c1U zxzrt_GT-zr8u;+ANirl9SPXI|3J74)d!TmLl;Iqi^hk&vMgdVnJw{fX9qcIN!gTCX z79gfcNI8gNjCnEl!0b44rPfk+<11yur5o!Z_&J>xN6%0t>PUS$3%XMKdEo$U)RN&${(m@xj|3r*ojvO;@g?>tlt=InrR+OXuA0Z>{rc5aC63Ne=?uqHBuZz019(E8^=KAcbTNgV9^)xIeJG(K-C^G`O%l*k{GSf zj#1nSahf?oE9eOCb<&5(Fhh|bRF?W9B$PxUgS!I_)jsB-s@MaN_sAOG{B0PO%)#np zVP@5_E-)|GCjlJSjz1Dpm6i_@Jc2 zw!N&b?*qF|ldEQijb_u2kNFiR*4^w}PvYmE<1Hm3`Rs#}M8vwOmslyt)mK?aKzcrcRpCi0`dCb}Ur6 z)t=G%D;cHmwA^z1@yLx>Ra)*G{(j8M#L2m&T>11HBf<-B_N{i9(+{v^t?^m)y1>zM1E1Ho58>KAKb)jNb~@m{Up zX@=&WVanJzy+BixE~U@ZNI!M3s5oeChz{Ko(BoU~l`;+1r`0NJK`jp<_#x5Z6Eo|+ zTtcZth<*Sj&q%j=0j#I^CRj#F_LcfQ3d1r|^HW2MX3xe+XYq%V2OHlU$)}li(M{Fo zQOsMlec1A9m>6>6LZK&K(sM?Fm)iw_y1c2eLSu7aX~N4iG_#2z;<7Z^CMVnU1fd+> zSGE9H`ndo};<_UMLgLE!#CAI(RoVwMBImJfn$d#J&-yR4h5#62w^Pp6v9wfNoDRn8 zT8bvqY_~J}ffzSGL0{+~RDIKM9P%bun)l;37mO;OtSBQFol(-11YGvom9L^CFt{uo2&}r*78i zS;Jtl1^3fEfmWz5>`9os4sJu=J$MH)1fikwCT3|@g-<8 zU{lFx>n5k=XBz*<6GkmTMTr)mPjv#XVZM)0C|VN_M6&PZMAswDv9+4~6urwHQb2?5 zE&2iPvP;R+L0)isnfuZpFWZat>1S=>rh(6MSDQ*@ok(2!Y59Aon#@ySYUjG0Rc2~$ zyy=+cQ9xS-DO%e-xg#{ouRcwtrkG6FG}ubiM?Gd`^l#OsIzT;Qq))(+CE5ZL`yYH( zfibf{*}@h7er1l;CDd_1E|To1w)AT1r02P}GdBN==?{ z@glY)DI1lXa#Ly+_2BXu`C$0CMb)0W0`*tv1ySWSlSl#u$ypuj?{6IrzeXLb!{SCK zk-k^hfH`oFR{kzhOK-*+zWFpG2F^>z4$~OE-bgDQ)f3s>SRMUsyH*67+>5wX{7-_~ zjNihuLe#%6Ezpv%HOR$%g5{tdtFbha!r!}nF@e;W082dqZ>o#2m3%sk+^GD_NR$3D zd-rN+UtTGPIp^J*Bx9MZy{YWxX%tKDw%qo*ZoY4vH-<}7VBzHZ>jRER{v5;ZE+WN1 z@ZPG2*O$Xjj4d#r%dN+4f;+Wc(E&VJ!IK-FcuUr`3!}uX0gSx-PF?(?`74B5M~~nq zDo=A_-B)RJW?RUI6`>w3ivVxsgafwRw~?YpSZZVZ9K-)8RY zZorI@7)Z=gR(c&`2WS?8l4EdkYO@{y?N+RfO`eq&*Lz6V_JiATFCI(A1M1UBP%gV# z!XlY`0;rMX{=Wa=4&w5y{o#HAL;rp-A`x_3;(dQVQ@?T-=dNiPK&N+qRidHq4LM=) zzPu;Ve_nF|myxiBO^6*LKS1R^V(cnxjFKvp9GWU6OD@ZVOzdMDv=bf5xH?G4fmg0z zjV*}#9`w57ro`;YY_()5j`PSZYbety+ji-Az%&2@@|p@&L49yapiwj4K8mf)Zd9Nn zWOH?rKHyARmmU$b(maDbjkbjnSxNGtZyeeF#gbW^_8WnGj!r6l+A{+2x3cNf$rDAm zk^vKB5uGencp`SSpwzDgh2zP)ML`=~5?xgtBO+celR$kpu6cHQ%6y${3`Dm`B?ICo z)=Z|LMM?Ujvl!M$2}klgnXTw|+f00SWPv?fPB&~i{oKm5?gvwD{(LMHLARd3HC|_j zd@0pFFZyvZF}CNML-0ad=O~jGtB7fP&KD}uj4thhm@jokz>Kf;n@^N!J&xu@a5)oTqLSj>F2A+<-di z1BFHtJm&CSK=86g#Frtu`U*UoH3Zu5Qg@pC!QIeIY(}{DTQ|#040T8YZo5otHOoh0 zy%LA>HLrl$QQ*fIl}8JMw(CtyHnuWEhYxKvbM*;e)RYC^t-37xR^PYpee(+qvs&o1 z1@e!v5BF~vX^qV+M+c%g%Sezva-FiFrDVie$lBh#vw%yyb~AW{T95N(fV6W4To^h= zp7zq4mQ{BoZ$yoSO1ja%E}DS*PSruK-a@c!-B@sZK0UvE!;Csk6ZfkZETO_2Z%wgk zpl#iW$S7EJsX{kJeK+u3LMw8}lnHl8il3Mj$0Z?FljFIUrV}yay!Kt68h<{(Qs-lb z2aj((qk?Z>P4V@#TuX&#DaAx`@d~gU8P}Zxf;KDiE8dZ?Vj9SZcR3qNS zONGji>VF(hcVq$w?sjuZxH%BQwc~*}7uXevE2J7#6QU57od+G&%9GDJz{(6I`%-!QmVIf#XWstWgkyNC)`8ND)`JuI~}_KGGy6sgC2 z5ps44?)IUg4l&tyUg6|jwDGca)yra|W4A7G&`89zc<*8=xX$SfDxILrlQI2>%LTt$ zN`0T^@efy2DOK^V;bc98Wx^gV@>IkmaLFaxIAh&)$)&;VR{Sa%cV(jGicQ0r1ZYiGUEh@tD-K)s+q^!0PGC6im#?(Ywh#xW9#~(my}80F zF~Y0W1U25ckL_E>JcTQEOm)3zZ5Jy`8u^eOYMI 
zZie_RLJDX2d-5wdeZY^$hRqf{uhC-qf%2FdjQsni$cxES>;kxb^(un;+ljKKA=Iah z%+l4W_eqSvbbaU62{pd0pnf;54lLo+6g>4W>JwV~bH47?!{XKO&A4MjU#uX-OfB(W zh^uV_4b$3HUaz_O9d5U(+B7ohI@ zrgh~yLhmh*8v(a{314lewn^WD$Guki<76Rxj>Ia)&DyrGg1G&}K$|Ciw*^pVdqpVe zI2}y7KI$`@P7qbH==oKVYnf+77;fzzOxh7G%s>nesla4gtRZR-!gaCHayKh3fs{b!`TTSEm8a(}D_6~q?$gpf=OdcsYJoW=K#F{6 zUHjU>quB!TBtw&@YPh)Cm1rbMy*yVpE=A6WozLpS`QU&+fAq#C;fx?Uzcz>*iaEpv$!fkdC`J9aK*cXvr9gat)R z1(w4r+Tq>UgmAyNu^Jyni-&)Qbl z%Joq!1=xMQI2?3Z^La>1btm0{lo|-psBl%kZl=FwA=z`;EV}bKR~4r0p;W+)FTDVx z7jy+#(l_#+b$&q{V&Zyob`W)ibc`Go7J&Ua*zx*{nXhn$uY4qd+B?6<^XY7u0Gt|C zjOqQG{)33@AX<3|M9D-}h1q%4i7wH#0>umzb$(LIOVeGA0;0?ET1QgH4D~eInh#xx zQ+{dav;`(FUtDuPsOuZlb1Cy{NnR*|@Dcj-y9WJVMw&*=@;~zq|7_e&@T3+(B}f`+ zuK!-J;FaDEn)kUQU6yzx(7g3)mpufr>z!6@H?Gs&`fa##qT|>j9;M2Z1AmX_4R2A3Ra>9-7tm+uJWu8Fo76yam=F{$G=** zrivLVz7A4iO`g(XP1U0ji55Gye6F=jBhJ2_(Q+byB!vTE38TqmBuLH>SdFr}8_L@Hv*HdT!=kvTP6Ec5lbL_KL;bN^MFiPz<;1HmOtV%921Zq|I9 zB57?~^(O{bkGCyN0lu->W8oQ!op|hfXPW^g^5XY0r3R98i^!e(@5YY$5Ob}U-#Srm zvYq>iicKHCOIoDescOL-)AaoS!KQ+Dj!Yl$lD)Kpn|V@WQUaQaus7^6`>Gh4n|GQ0 z>J;Y=C*gg#!0KIFkzR40rTWI9{adf`^Ohz(`ZonRW^K~RSM)|(yLauNz3*Z~cMsfR zg75F4!u|UjwX&a|WDs;dzCopp2`!hJfQ=O$z;=!-#$X4~4{1ZYbN{Q7LdoK(JPd6a z16wq9gv8nqX?&DT0!V9dgHUp%-)Td(=206c>p-cqE3@{bKdzSuqD;+rNS+q6&-+xq z93+$GM-CNc?+tj3K4~NviAY-&DJ?HUWf&Z3Fj?5pKAcP~OW?{O?PveUdLurUN)}^s zo==Xo&Ke!J_UZWhXRqm1l(uE6yHcvqbz#P0a-$gR;#6MQGvV*5W2RH`Sy4eg$BDe- zK%-J6P4Bg!>BTFDRhbhKg4+nyLU#G&l^?rgWiv)t0U{|i54pK(t3E^^TJ{bV%8%8; z=Xz2#779b}Fcyqm?oZyISl~UkKg+{9TeHvwI|6VX;uO|#edh;~FyH3+^o|F;Jt#(- z2g0C(vZ~O(t~Te)sno%3;BZUyQNzrDW+xu6wZmKn%%A(I0p{6Nqg3n^f!m>A`$&J` zKn!D8Y!L6$k!gPuffzKwT`lnaEFWjYj>(a?I7?!W7C7Uv6@Hs`+Mv?~_lkxC}P)8Hi?8l{;Ue%?S5TAiP8a)ONxK%#O@cMd zyKFJ&ZNS8m*CyZO3QJe;*0XQjkCjY%RvGL%jmaV<*R|W<=G3t{;m?tSq`Fkd2)Ar0 zsFD$MveXD~IEzHBeN>1p*gw^Bxe|@Ygk@|#AOA@C4F7Gwj5rT+sFMBksg-z%0fP6L z=CfuK%@VAIF_;|nlh+A8o$+-^4z6HXY3eSdj3GW3y@d<+x8%7-Qp9i)6c!aVn%Zw9 zS9Y1i?lZr}22CEC_xtT1pDwO!#LdIE*&WUDv}--cV(1ds28G>^IUyJ8!eW*<1=5`& zE4t8)eM5*&MYw%dB9!Kh*xi1_+xGaaTkX)w=Jy5;*}+r(iFWAf{K09MM@-O&^v@fa z{=J6(KL5kECPkUQ8u;sGi+=-uoYSE~`P0^m-+{k3pnpNTpwZ*EcJ%Myzjoe!K>+{+ zl%L@Lrw{kLo!|Q~zbpx({pTV6)|L6)%I~G(Usk?CTkHSO+xb~I{@u#&<*i>6Av02Jf!l z{r}~9@Au7lo;`b>b7tP>otgJHGd~r13``&Z7Z(>GoKda;xF!V1|AvlcwoW|UsNdq4 zemRW$q=CEekSLg2HH8$oqA_S7t{9dpR= z@Hdbt_e-g3W>PEF+w!=b!jy11#-6{Q4{}JW)u;$d`sgBB7wDKKLB8f+a^)mciz^d( z)exijt=n?+(;&O01VBX|2iLsNw`Lx>Z4q<;^7-!^LxTLm9l3^`qm7B3oej4;#5P>L z6FSXH+IY-~ISaXGU`>;N0g8wQ%#apzCdbI!v~84I-Bl386)`3f`8_^d zH(00NbIzPquqBTSw4J8u$A(ref%zF$Q2CqyS;wx5fjkk)rvUsl-o^}3R>CJhV8 zRDqLoqABJo%A!?i`-itRN+R#)P^6Xif!sXFgc2%0?HloJ*&Z`iv$osbP-0$XQf5eb z+-3_@(PIje)^5$R-&h7{l-8*}z0w3hEoR2mp_w7-Cy=M{wj@HDuELKU>HH}WEbZoO zUySL$zkS@$S3T-teHc07!#sRHQ`W3f*vE)Nd=MtR)sjz#(FRH*Z;c6habX8*{Ho0u zt?qrvh)bg*xPTM710%~CalU#Qq?U`V_f3d`&(2P&r|BaDXbH?0@p6!l_ON*mI#N$Z z2x|N>^HPytS*z3e(hvAe#p00V9z685S1E`?z(TDJp}97s`u$EJT*TaqJ-8=1{JRV-C3~Np7%*Or*cxv-KfXvXPF&Hy#1Lxa~9~O0-d$QZz|v1yz3T2s+G` z#;lrfPV%J&o?%QFqoTFRy-`PEw2iaS{MUN&I-WektCy_xBe}tB>=;j`Kh++~t(8r_ zCfH_*<5#;Ydjb(_!CH8|NPm`{zD%t^u0%l4{3br$rnnG;-K6-&fUeWm0fV~TPkU4= z7wy|$>rMF2Nu{*gj+9~{0Gwuz-46x`f@k^%<|U~?qw-T|8PjoO&YikqaShdin#zEx?g9n9mO9wO#&@ zf;iVm>W_>p1s6LEhj_68Pe{3mkP1%n=R$W?guQx!<+!kmxZzD}ToTTzS?t}8o@Jv| zB7TvKV!!17yYzcnJI)W~sX(w5wpq?cv2aJHtBhXEmd*N)JtRB(gC*l9n(wzg*Vh<| zKl+Kl2jU&~BsaQ++c1E21*?SKZl(6(Z@*r~ICSSm`a+uHZ}=7+>&` z6&B_2@3Lbvj0y||nD~ECtP5&yS&)q*=YAS#pJm{OX4OURiY1%OZzURJ;!BxZ%5Shv zy&c?yA9=UDL&HY=UWm3=-e(#`n7fcN(DGR?qfbNjSpRoX;h%TQztPFx7WUy@p-lRs?EY!FUHrbEQ5%f4jK@H*0>!1 z>Mb9WZ=wyf`krreN!IqlD;l9QIKqCH55IF_4L(fezbY1NT!_gUeOt~q+v0_{$dB)1 
zrv+Oud>t~1-Bv@z*IwNqz$(w6;6S z(A+E8LPaX29d&g)2uvhWL!>E5D|eDCDrkGBus=Q;ka!`zWuktcV5X@w)MJ(j6r(8$%*OCT+^k2F0ljSqB7$>eN|o|z#U;X5rKE;7B(diHqRgC zb-OG`RcrgI_>(ViDxnNrQ4Ydbnsj>GELDxokZ{vvbs0~0F<#znbk!&nymG|5v|z;N zfPebss)F@3mkHLnWSzcv8T(|b@ai1XLZdfxQ0SaINk#R-R()O?4s^_N=H6?E$_h-* zNM(4W>Jk~#=IgVQa6}}mwViGQ9@*JF!fvMkA7U2RR@sGJ_?{hxBa~sS=BRguDjALU zf9OU|$OgWg%aIqehL;B{4i({#fAJ#}!4a4NOmXk8r5@7W=kvh+Nxm+20}3Iu4xpyIw%BR{<-&^2yI$~&p&h)I`3 z{TiS)RtNuQHns72OWDl%>U`#R0ot2WRzQ{Xd>wWx+N)?ah=JEP&?B6)h0WB&5Bo<_bIV8=Y15z)r3=zF7(Sz+`*A@bs+q~e?*|N_)|2S2Sl^OPmp;2` zgryp(c}hl?nDjM-0P0F?n$+5NnugEEO{i|@W1`P@`@vIv*)iMBo|>U=C>Y691WADk8FXcTVZo3YBUBffm+7UUsprMC~l=h$H!7kB_UdlG8Ape z$9kE@Uz8wkMLkoQ0}>`g-n>eMji|cB!JuUb_{=Z)ULjaAL8isn0=`+zAa^h05f~?a zDQ|45)EH~eQhIE6K&vy)`1D@zrK2^_SW@MKF@5ifuG@AE_F2I@+>I$;-Hw^#e1Y+u zFGj^SY*7IWsc&c0cOvGub}1<_&0m{_{2It&fTH=8%@j*t|3>=orTgON-!?&a0|6 zvpJIyJ*l7%0pea3>r3=yu^8*{(9X>P_FY2w5QghZK%?D0!8&M{@bWFr))wI9+~Tc) z0_3|XiO#^lMmxtKu?^3Pyu~3dIV6bCqCf;*pu1b&lZ9EY4=Z@J*w08r=9EgDwDDC^ zaFH_ohyn#7!Fo>r1`#FYd>6kdo_%aL<{@X#p{u9~3QF+zhl)GY^Rth!jf6MfLdG?N z6|=$Cof*j=w}pxBEXmPKyF%ZYtW*a!RXutHZ?r8v;w^*T>T1g`izg9^B>)2miUe=W zx992V`0|C=+sbl29_k>F^GN`8G9m6$Ck{+d{?&t|`-*T$HB>lR)@~y)v19$IXXx`0eB@X5TLQ zXp3xRe7+!lMLcpNwdaRK_sAL;$}LQ3hgZ*@)nL|r$~8U5_fCdk+u-~c&f2e(eVan= zzx*HPR^Y#<4ohQ4GqAd|qZQQR*YFWP^jWEukraErUU{%*Ca<_XiKr}?ZL`;Jw1R27 zmh}!s_khft!3f(+zC&Eko}aOdB)ibUn_;$McPw2WYCLOX;7S8Fh%)flWi5g&d1Ki% z_IwC&;7rNt@ufF7Db?{|OtlPyRI{hA;*@7So!M5*MDd8pdOtCnIL2CTp2-NB;uX&J z#{nyaT-adg`&Sk@R+947dR~EBLP_ElYj?sN9uK5B7&n8KR6=s|PO^XQ1+K5Duyjsf z?HbgtBJONK`^;; zO~Wt6l!kJ#Ud~rn@TbFtQs}@GTc31P-_f$FhM}tztv-ntda2vnatFUCCuF!#D=LRs z=K&{_FCXt6k!XsZp98w>e0UtI!kXn$DjVhVYBX8MGDTPH=G3zd(Fd$!aqz(#_S79U zI=(rn!XY#*DkKr>;Lb@SL`}Xmkzk9859SJ&QZ{Br6>(UlHbZ4kfEvErh`r`itlybY z8{^mA<3Xv#nN7;Q)^O?y z(W}`584}DNgLWNQ$K6%1c#u*Hh-QR#<`&{W0~ZAl#*hd4ZUex*3>dcKd>|D>mf3>w?{=NElOD#dAQqlJwxtJRVgCm zlJg&s<2t;-4D_wlWOAs1A85qfX1jwym!WefI`9pQ}B! z4f`%2nrNfO0#w3VqB~s147NQbf`yAqN)aKxU?#qFlq95)ixquEf#|C4?pw~GNHY7X zE}^K`9Gh5y6gvuoA3C?kPwtom2vGWbqH2uu0~upO zeCAe;bjt!s!xGS)ZhIC;FgIM3Wa^J<9cxxhSv}%e>i9rYUtLR0c|c&u7+w8i!(mJ& zo+xLWSB3|%Z-?^JkOJY|4Wt)dB7ODuO%*xvi|0@6pw4DcXG0ASdow3}l&jh_M3Al` zojvEgUCQA)jm8vLjS)$9SJfQ6C~rb70j|%xJWf@ATX+MVf=nR+e%x3zY*rCo82eC= zCp;xEb#9VXyQAuewl^Hna>KF32mDN77HZ9s1mdwBDK7XA{D#NCCp zLe%j4jR|)_QH#1kH0Nheml=n*sPUT$pE_22UQ6R{mUXP!3{@j(SQ9&!wqK35YetjAg)!ja)of7&R4c&P|i?+?--kM7w`{GNXk5Z1rn^NJ*KG zSQ|^&QEGvrm3oEtQbK<_m(_7i&D-ScC?VX~Qs9%fHN>@h6p|_rAxed{BXjHLWyPP5 zi)W**7(FodiiBjuQYH>E)=e*z#a&I^1WCLf_%4DcbecS%AYvjBE8Nxe_da;j;QQU@vZ;Wf}8x1Nv7CWU64UnOk!$O4z4IK#h zO|M?(7g6-;f66to^-sm?v=WMb{4G?-rT!oz|5Uy{%cGd5-|`&EQX!Rp^H6`PUT@%0 zZTfFv#{U0x|7-OB)5`S%{-0K?@c*ZkU#)@@wA! 
From 8076c76b26514b6b93d8ec51ffee0599ad1614ed Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:44:48 -0600 Subject: [PATCH 33/52] refactor: pythonic method to collect merged gene z-scores Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index b760041c..fd247d0d 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -803,27 +803,14 @@ async def _process( output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, ) - merged_zscore_df = pd.DataFrame() - expressed_genes: list[str] = [] - top_genes: list[str] = [] - for metric in metrics.values(): - expressed_genes.extend(metric.entrez_gene_ids) - top_genes.extend(metric.high_confidence_entrez_gene_ids) - - merged_zscore_df = ( - metric.z_score_matrix - if merged_zscore_df.empty - else merged_zscore_df.merge( - metric.z_score_matrix, - how="outer", - left_index=True, - right_index=True, - ) - ) - merged_zscore_df[merged_zscore_df.isna()] = -4 + merged_zscore_df = pd.concat([m.z_score_matrix[m.z_score_matrix.index != "-"] for m in metrics.values()], axis="columns") + merged_zscore_df.fillna(-4, inplace=True) + expressed_genes: list[str] = list(itertools.chain.from_iterable(m.entrez_gene_ids for m in metrics.values())) + top_genes: list[str] = list(itertools.chain.from_iterable(m.high_confidence_entrez_gene_ids for m in metrics.values())) # If any of the normalization metrics are not empty, write the normalized metrics to disk if not all(metric.normalization_matrix.empty for metric in metrics.values()): + merged_zscore_df: pd.DataFrame = merged_zscore_df.reindex(columns=sorted(merged_zscore_df)) merged_zscore_df.to_csv(output_zscore_normalization_filepath, index=True) logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") else: From 9bcda5f901d61c78c790f4d14a80f7b8ac6a9869 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:44:58 -0600 Subject: [PATCH 34/52] refactor: do not drop na values, keep as much data for as long as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index fd247d0d..f0c35326 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -840,7 +840,8 @@ async def _process( expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) - boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) + # TODO: 2025-OCT-31: commented out dropping entrez ids, should this be kept? 
+ # boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) boolean_matrix.to_csv(output_boolean_activity_filepath, index=False) logger.info(f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed") logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}") From 6fde5bdd49091a98d23925365abe579a5d42f8ea Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:45:12 -0600 Subject: [PATCH 35/52] refactor: use new min zfpkm peak height/distance Signed-off-by: Josh Loecker --- main/como/rnaseq_gen.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py index f0c35326..a3e496fe 100644 --- a/main/como/rnaseq_gen.py +++ b/main/como/rnaseq_gen.py @@ -861,9 +861,8 @@ async def rnaseq_gen( # noqa: C901 batch_ratio: float = 0.5, high_batch_ratio: float = 0.75, technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, - zfpkm_peak_height: float = 0.02, - zfpkm_peak_distance: float = 1.0, - zfpkm_bandwidth: float = 1.0, + zfpkm_min_peak_height: float = 0.02, + zfpkm_min_peak_distance: int = 1, cutoff: int | float | None = None, force_zfpkm_plot: bool = False, log_level: LogLevel = LogLevel.INFO, @@ -892,9 +891,8 @@ async def rnaseq_gen( # noqa: C901 :param high_batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked "highly confident" in its expression :param technique: The filtering technique to use - :param zfpkm_peak_height: The height of the zFPKM peak - :param zfpkm_peak_distance: The distance of the zFPKM peak - :param zfpkm_bandwidth: The bandwidth of the zFPKM + :param zfpkm_min_peak_height: The height of the zFPKM peak + :param zfpkm_min_peak_distance: The distance of the zFPKM peak :param cutoff: The cutoff value to use for the provided filtering technique :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway? 
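(Illustrative aside, not part of the patch: how the renamed zFPKM keywords would appear in a call to `rnaseq_gen`; all other arguments are unchanged by this patch and are deliberately left out rather than guessed at, and the `FilteringTechnique` import location is assumed.)

    zfpkm_kwargs = dict(
        technique=FilteringTechnique.ZFPKM,
        zfpkm_min_peak_height=0.02,   # replaces zfpkm_peak_height
        zfpkm_min_peak_distance=1,    # replaces zfpkm_peak_distance; the zfpkm_bandwidth argument is removed
    )
    # await rnaseq_gen(<context and output-path arguments as before>, **zfpkm_kwargs)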
:param log_level: The level of logging to output @@ -988,8 +986,8 @@ async def rnaseq_gen( # noqa: C901 technique=technique, cut_off=cutoff, force_zfpkm_plot=force_zfpkm_plot, - peak_parameters=PeakIdentificationParameters(height=zfpkm_peak_height, distance=zfpkm_peak_distance), - bandwidth=zfpkm_bandwidth, + zfpkm_min_peak_height=zfpkm_min_peak_height, + zfpkm_min_peak_distance=zfpkm_min_peak_distance, output_boolean_activity_filepath=output_boolean_activity_filepath, output_zscore_normalization_filepath=output_zscore_normalization_filepath, output_zfpkm_plot_dirpath=output_zfpkm_plot_dirpath, From 74ca5de85d56a18d0deb7f09ee1fa9dfa47b8a88 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:45:53 -0600 Subject: [PATCH 36/52] refactor: do not drop na values, keep as much data for as long as possible Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 5ef58fb9..52385101 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -74,7 +74,6 @@ async def build_from_tab(cls, filepath: Path) -> _STARinformation: "second_read_transcription_strand", ], ) - df = df[~df["ensembl_gene_id"].isna()] return _STARinformation( num_unmapped=num_unmapped, num_multimapping=num_multimapping, From 666fb84b3197ec8eb070b7c8808d671786f36520 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:46:07 -0600 Subject: [PATCH 37/52] chore: remove `__main__` function Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 52385101..125e10ff 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -849,28 +849,3 @@ async def rnaseq_preprocess( cache=cache, create_gene_info_only=create_gene_info_only, ) - - -async def _main(): - context_name = "notreatment" - taxon = 9606 - como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/naiveB") - output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/gene_info_fixed.csv") - output_trna_metadata_filepath = Path("/Users/joshl/Projects/COMO/main/data/config_sheets/trna_config.xlsx") - output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/naiveB/total-rna/totalrna_naiveB.csv") - - await rnaseq_preprocess( - context_name=context_name, - taxon=taxon, - como_context_dir=como_context_dir, - input_matrix_filepath=None, - output_gene_info_filepath=output_gene_info_filepath, - output_trna_metadata_filepath=output_trna_metadata_filepath, - output_trna_count_matrix_filepath=output_trna_count_matrix_filepath, - cache=False, - log_level="INFO", - ) - - -if __name__ == "__main__": - asyncio.run(_main()) From 4de54212d15002109a1131b0a9c85857eace79cb Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:46:24 -0600 Subject: [PATCH 38/52] fix: convert identifiers to lowercase Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index 125e10ff..a5fea646 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -359,12 +359,12 @@ async def _write_counts_matrix( """ study_metrics = _organize_gene_counts_files(data_dir=como_context_dir) counts: list[pd.DataFrame] = await 
asyncio.gather(*[_create_sample_counts_matrix(metric) for metric in study_metrics]) - rna_specific_sample_names = set(config_df.loc[config_df["library_prep"] == rna.value, "sample_name"].tolist()) + rna_specific_sample_names = set(config_df.loc[config_df["library_prep"].str.lower() == rna.value.lower(), "sample_name"].tolist()) final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts) final_matrix.fillna(value=0, inplace=True) final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(np.uint64) - final_matrix = final_matrix[["ensembl_gene_id", *rna_specific_sample_names]] + final_matrix = cast(pd.DataFrame, final_matrix[["ensembl_gene_id", *rna_specific_sample_names]]) output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) final_matrix.to_csv(output_counts_matrix_filepath, index=False) @@ -459,7 +459,6 @@ async def _create_config_df( # noqa: C901 fragment_label = f"{context_name}_{label}_fragment_size.txt" frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label] - print(f"HERE: {prep}") if not frag_paths and prep.lower() != RNAType.TRNA.value.lower(): logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).") mean_frag = 100.0 From b917ac1ac433fe6a7e40578d9ab25ae5dca1300b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:47:37 -0600 Subject: [PATCH 39/52] refactor: more robust data handling when creating the gene info file Signed-off-by: Josh Loecker --- main/como/rnaseq_preprocess.py | 70 ++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py index a5fea646..60457fb9 100644 --- a/main/como/rnaseq_preprocess.py +++ b/main/como/rnaseq_preprocess.py @@ -664,15 +664,13 @@ async def _create_gene_info_file( The gene information file will be created by reading each matrix filepath in the provided list """ - async def read_counts(file: Path) -> list[str]: + async def read_ensembl_gene_ids(file: Path) -> list[str]: data = await _read_file(file, h5ad_as_df=False) - + if isinstance(data, pd.DataFrame): + data: pd.DataFrame + return data["ensembl_gene_id"].tolist() try: - conversion = await ( - ensembl_to_gene_id_and_symbol(ids=data["ensembl_gene_id"].tolist(), taxon=taxon) - if isinstance(data, pd.DataFrame) - else gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) - ) + conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) except json.JSONDecodeError: _log_and_raise_error( f"Got a JSON decode error for file '{counts_matrix_filepaths}'", @@ -681,32 +679,48 @@ async def read_counts(file: Path) -> list[str]: ) # Remove NA values from entrez_gene_id dataframe column - return conversion["entrez_gene_id"].tolist() + return conversion["ensembl_gene_id"].tolist() logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection") - genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) - gene_data = await MyGene(cache=cache).query(items=list(genes), taxon=taxon, scopes="entrezgene") + + ensembl_ids: set[str] = set(chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths]))) + gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query( + items=list(ensembl_ids), 
+ taxon=taxon, + scopes="ensemblgene", + ) gene_info: pd.DataFrame = pd.DataFrame( data=None, - columns=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"], - index=list(range(len(gene_data))), + columns=pd.Index(data=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"]), + index=pd.Index(data=list(range(len(ensembl_ids)))), ) - for i, data in enumerate(gene_data): - ensembl_ids = data.get("genomic_pos.ensemblgene", pd.NA) - if isinstance(ensembl_ids, list): - ensembl_ids = ensembl_ids[0] - - start_pos = data.get("genomic_pos.start", 0) - start_pos: int = int(sum(start_pos) / len(start_pos)) if isinstance(start_pos, list) else int(start_pos) - end_pos = data.get("genomic_pos.end", 0) - end_pos: int = int(sum(end_pos) / len(end_pos)) if isinstance(end_pos, list) else int(end_pos) - - gene_info.at[i, "gene_symbol"] = data.get("symbol", pd.NA) - gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", pd.NA) - gene_info.at[i, "ensembl_gene_id"] = ensembl_ids - gene_info.at[i, "size"] = end_pos - start_pos - gene_info.sort_values(by="ensembl_gene_id", inplace=True) + for i, data in enumerate(gene_data): + data: dict[str, str | int | list[str] | list[int] | None] + ensembl_genes: str | list[str] = cast(str | list[str], data.get("ensembl.gene", "-")) + start_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.start", 0)) + end_pos: int | list[int] = cast(int | list[int], data.get("genomic_pos.end", 0)) + + avg_start: int | float = sum(start_pos) / len(start_pos) if isinstance(start_pos, list) else start_pos + avg_end: int | float = sum(end_pos) / len(end_pos) if isinstance(end_pos, list) else end_pos + size: int = int(avg_end - avg_start) + + gene_info.at[i, "gene_symbol"] = data.get("symbol", "-") + gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", "-") + gene_info.at[i, "ensembl_gene_id"] = ",".join(ensembl_genes) if isinstance(ensembl_genes, list) else ensembl_genes + gene_info.at[i, "size"] = size if size > 0 else -1 + + gene_info["size"] = gene_info["size"].astype(str) # replace no-length values with "-" to match rows where every value is "-" + gene_info["size"] = gene_info["size"].replace("-1", "-") + gene_info = cast(pd.DataFrame, gene_info[~(gene_info == "-").all(axis=1)]) # remove rows where every value is "-" + + gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",") # extend lists into multiple rows + gene_info = gene_info.explode(column=["ensembl_gene_id"]) + gene_info["size"] = gene_info["size"].astype(int) + # we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids, + # and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer + + gene_info: pd.DataFrame = cast(pd.DataFrame, gene_info.sort_values(by="ensembl_gene_id")) output_filepath.parent.mkdir(parents=True, exist_ok=True) gene_info.to_csv(output_filepath, index=False) logger.success(f"Gene Info file written at '{output_filepath}'") @@ -728,7 +742,7 @@ async def _process_como_input( rna=rna, ) with pd.ExcelWriter(output_config_filepath) as writer: - subset_config = config_df[config_df["library_prep"] == rna.value] + subset_config = config_df[config_df["library_prep"].str.lower() == rna.value.lower()] subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) From e5127409905b2bc39e45dadc076f48196964832b Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:48:01 -0600 Subject: [PATCH 40/52] refactor: include more robust error handling for 
dataframes when collecting gene data Signed-off-by: Josh Loecker --- main/como/utils.py | 58 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/main/como/utils.py b/main/como/utils.py index 05629032..dfb9ac1e 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -135,7 +135,7 @@ async def _format_determination( async def get_missing_gene_data(values: list[str] | pd.DataFrame, taxon_id: int | str | Taxon) -> pd.DataFrame: - if isinstance(values, list): + if isinstance(values, list) and not isinstance(values, pd.DataFrame): # second isinstance required for static type check to be happy gene_type = await determine_gene_type(values) if all(v == "gene_symbol" for v in gene_type.values()): return await gene_symbol_to_ensembl_and_gene_id(values, taxon=taxon_id) @@ -144,19 +144,51 @@ async def get_missing_gene_data(values: list[str] | pd.DataFrame, taxon_id: int elif all(v == "entrez_gene_id" for v in gene_type.values()): return await gene_id_to_ensembl_and_gene_symbol(ids=values, taxon=taxon_id) else: - logger.critical("Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)") - raise ValueError("Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)") - else: - values: pd.DataFrame # Re-define type to assist in type hinting - if "gene_symbol" in values: - return await get_missing_gene_data(values["gene_symbol"].tolist(), taxon_id=taxon_id) - elif "entrez_gene_id" in values: - return await get_missing_gene_data(values["entrez_gene_id"].tolist(), taxon_id=taxon_id) - elif "ensembl_gene_id" in values: - return await get_missing_gene_data(values["ensembl_gene_id"].tolist(), taxon_id=taxon_id) + _log_and_raise_error( + message="Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)", + error=ValueError, + level=LogLevel.CRITICAL, + ) + elif isinstance(values, pd.DataFrame): + # raise error if duplicate column names exist + if any(values.columns.duplicated(keep=False)): + duplicate_cols = values.columns[values.columns.duplicated(keep=False)].unique().tolist() + _log_and_raise_error( + message=f"Duplicate column names exist! This will result in an error processing data. 
Duplicates: {','.join(duplicate_cols)}", + error=ValueError, + level=LogLevel.CRITICAL, + ) + if "gene_symbol" in itertools.chain(values.columns, [values.index.name]): + return await get_missing_gene_data( + values["gene_symbol"].tolist() if "gene_symbol" in values.columns else values.index.tolist(), + taxon_id=taxon_id, + ) + elif "entrez_gene_id" in itertools.chain(values.columns, [values.index.name]): + return await get_missing_gene_data( + values["entrez_gene_id"].tolist() if "entrez_gene_id" in values.columns else values.index.tolist(), + taxon_id=taxon_id, + ) + elif "ensembl_gene_id" in itertools.chain(values.columns, [values.index.name]): + return await get_missing_gene_data( + values["ensembl_gene_id"].tolist() if "ensembl_gene_id" in values.columns else values.index.tolist(), + taxon_id=taxon_id, + ) else: - logger.critical("Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.") - raise ValueError("Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.") + _log_and_raise_error( + message="Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.", + error=ValueError, + level=LogLevel.CRITICAL, + ) + else: + _log_and_raise_error( + message=f"Values must be a list of strings or a pandas DataFrame, got: {type(values)}", + error=TypeError, + level=LogLevel.CRITICAL, + ) + + +@overload +async def _read_file(path: None, h5ad_as_df: Literal[True] | Literal[False], **kwargs) -> None: ... @overload From 8d505f7196bc40fd174022bb7732f9794b6aece5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:48:18 -0600 Subject: [PATCH 41/52] chore(dev): expand overloaded functions to add more type paths Signed-off-by: Josh Loecker --- main/como/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main/como/utils.py b/main/como/utils.py index dfb9ac1e..84df2aac 100644 --- a/main/como/utils.py +++ b/main/como/utils.py @@ -192,23 +192,23 @@ async def _read_file(path: None, h5ad_as_df: Literal[True] | Literal[False], **k @overload -async def _read_file(path: None, h5ad_as_df: bool, **kwargs) -> None: ... +async def _read_file(path: pd.DataFrame, h5ad_as_df: Literal[True] | Literal[False], **kwargs) -> pd.DataFrame: ... @overload -async def _read_file(path: pd.DataFrame, h5ad_as_df: bool, **kwargs) -> pd.DataFrame: ... +async def _read_file(path: sc.AnnData, h5ad_as_df: Literal[False] = False, **kwargs) -> sc.AnnData: ... @overload -async def _read_file(path: sc.AnnData, h5ad_as_df: bool = False, **kwargs) -> sc.AnnData: ... +async def _read_file(path: sc.AnnData, h5ad_as_df: Literal[True] = True, **kwargs) -> pd.DataFrame: ... -def _num_rows(item: pd.DataFrame | npt.NDArray) -> int: - return item.shape[0] +@overload +async def _read_file(path: Path, h5ad_as_df: Literal[False] = False, **kwargs) -> pd.DataFrame | sc.AnnData: ... @overload -async def _read_file(path: sc.AnnData, h5ad_as_df: bool = True, **kwargs) -> pd.DataFrame: ... +async def _read_file(path: Path, h5ad_as_df: Literal[True] = True, **kwargs) -> pd.DataFrame: ... 
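(Illustrative aside, not part of the patch: a standalone sketch of the pattern the expanded `_read_file` overloads rely on; `typing.overload` stubs combined with `Literal` flags let a type checker narrow the return type per call site. The names below are invented.)

    from typing import Literal, overload

    @overload
    def read(path: str, as_text: Literal[True] = True) -> str: ...
    @overload
    def read(path: str, as_text: Literal[False] = False) -> bytes: ...
    def read(path: str, as_text: bool = True) -> str | bytes:
        # Single implementation; the stubs above only guide the checker, so
        # read(p, as_text=False) is typed as bytes while read(p) is typed as str.
        data = open(path, "rb").read()
        return data.decode() if as_text else data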
async def _read_file( From 448627b0910e9674116ccd03cf5b388725f25ea7 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:49:35 -0600 Subject: [PATCH 42/52] feat(dev): added ty.toml file for type hints Signed-off-by: Josh Loecker --- main/como/proteomics_gen.py | 43 ++++++++++++++++++++++++++----------- ty.toml | 16 ++++++++++++++ 2 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 ty.toml diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py index e0d9b078..caeeb8a3 100644 --- a/main/como/proteomics_gen.py +++ b/main/como/proteomics_gen.py @@ -29,16 +29,25 @@ def process_proteomics_data(path: Path) -> pd.DataFrame: """ # Preprocess data, drop na, duplicate ';' in symbol, matrix: pd.DataFrame = pd.read_csv(path) - if "gene_symbol" not in matrix.columns: + gene_symbol_colname = [col for col in matrix.columns if "symbol" in col] + if len(gene_symbol_colname) == 0: _log_and_raise_error( "No gene_symbol column found in proteomics data.", error=ValueError, level=LogLevel.ERROR, ) - + if len(gene_symbol_colname) > 1: + _log_and_raise_error( + "Multiple gene_symbol columns found in proteomics data.", + error=ValueError, + level=LogLevel.ERROR, + ) + symbol_col = gene_symbol_colname[0] + matrix = matrix.rename(columns={symbol_col: "gene_symbol"}) matrix["gene_symbol"] = matrix["gene_symbol"].astype(str) matrix.dropna(subset=["gene_symbol"], inplace=True) - matrix = matrix.assign(gene_symbol=matrix["gene_symbol"].str.split(";")).explode("gene_symbol") + matrix["gene_symbol"] = matrix["gene_symbol"].str.split(":") + matrix = matrix.explode(column=["gene_symbol"]) return matrix @@ -56,12 +65,23 @@ async def load_gene_symbol_map(gene_symbols: list[str], entrez_map: Path | None if entrez_map and entrez_map.exists(): df = pd.read_csv(entrez_map, index_col="gene_symbol") else: - biodbnet = BioDBNet() - df = await biodbnet.async_db2db( - values=gene_symbols, - input_db=Input.GENE_SYMBOL, - output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID], - ) + dataframes: list[pd.DataFrame] = [] + step_size: int = 300 + biodbnet = BioDBNet(cache=True) + biodbnet.services.settings.TIMEOUT = 60 + for i in range(0, len(gene_symbols), step_size): + # Operations: Goes from gene_symbols=["A", "B;C", "D"] -> gene.split=[["A"], ["B", "C"], ["D"]] -> chain.from_iterable["A", "B", "C", "D"] + chunk: list[str] = list(itertools.chain.from_iterable(gene.split(";") for gene in gene_symbols[i : i + step_size])) + dataframes.append( + biodbnet.db2db( + input_values=chunk, + input_db=Input.GENE_SYMBOL.value, + output_db=[Output.GENE_ID.value, Output.ENSEMBL_GENE_ID.value], + taxon=9606, + ) + ) + df = pd.concat(dataframes, axis="columns") + print(df) df.loc[df["gene_id"].isna(), ["gene_id"]] = np.nan df.to_csv(entrez_map, index_label="gene_symbol") @@ -188,7 +208,7 @@ async def proteomics_gen( high_confidence_batch_ratio: float = 0.7, quantile: int = 25, log_level: LogLevel = LogLevel.INFO, - log_location: str | TextIOWrapper = sys.stderr, + log_location: str | TextIO = sys.stderr, ): """Generate proteomics data.""" _set_up_logging(level=log_level, location=log_location) @@ -229,13 +249,12 @@ async def proteomics_gen( config_df = pd.read_excel(config_filepath, sheet_name=context_name) matrix: pd.DataFrame = process_proteomics_data(matrix_filepath) - groups = config_df["group"].unique().tolist() for group in groups: indices = np.where([g == group for g in config_df["group"]]) sample_columns = [*np.take(config_df["sample_name"].to_numpy(), indices).ravel().tolist(), "gene_symbol"] - 
matrix = matrix.loc[:, sample_columns] + matrix = cast(pd.DataFrame, matrix.loc[:, sample_columns]) symbols_to_gene_ids = await load_gene_symbol_map( gene_symbols=matrix["gene_symbol"].tolist(), diff --git a/ty.toml b/ty.toml new file mode 100644 index 00000000..a78d595f --- /dev/null +++ b/ty.toml @@ -0,0 +1,16 @@ +[environment] +root = ["main"] + +[rules] +call-non-callable = "error" +invalid-argument-type = "error" +unresolved-reference = "error" +unsupported-operator = "error" + +ambiguous-protocol-member = "warn" +deprecated = "warn" +division-by-zero = "warn" +possibly-missing-attribute = "warn" +possibly-unresolved-reference = "warn" +unsupported-base = "warn" +unused-ignore-comment = "warn" From cebab4448f6ba15b9bf5e9f6b9c4bd2f28a4bcac Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 10:58:54 -0600 Subject: [PATCH 43/52] fix(test): lowercase names Signed-off-by: Josh Loecker --- tests/unit/test_data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_data_types.py b/tests/unit/test_data_types.py index 3b24f9a2..45939ba8 100644 --- a/tests/unit/test_data_types.py +++ b/tests/unit/test_data_types.py @@ -3,7 +3,7 @@ def test_source_types(): """Validate that source types always go in the order of 'trna', 'mrna', 'scrna', 'protemics'.""" - expected_order = ["TRNA", "MRNA", "SCRNA", "PROTEOMICS"] + expected_order = ["trna", "mrna", "scrna", "proteomics"] for expected, source in zip(expected_order, SourceTypes, strict=True): expected: str source: SourceTypes From 37be94ba55c5432fcb582a9c22b4a301e6d23d01 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 11:00:28 -0600 Subject: [PATCH 44/52] format: ruff formatting Signed-off-by: Josh Loecker --- tests/test_proteomics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_proteomics.py b/tests/test_proteomics.py index 0f38be27..defb0fef 100644 --- a/tests/test_proteomics.py +++ b/tests/test_proteomics.py @@ -2,11 +2,11 @@ import aioftp import pytest + from como.proteomics.Crux import MZMLtoSQT, RAWtoMZML, SQTtoCSV from como.proteomics.FileInformation import FileInformation from como.proteomics.FTPManager import Download, Reader, aioftp_client from como.proteomics.proteomics_preprocess import ParseCSVInput, PopulateInformation - from tests.fixtures.fixture_ftp_server import fixture_ftp_server, ftp_file_names From a19536bd7cf1cf32c08731137d7289e22cbfc066 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:14:08 -0600 Subject: [PATCH 45/52] feat(test): added `approx` tests Signed-off-by: Josh Loecker --- tests/unit/test_approx.py | 237 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 tests/unit/test_approx.py diff --git a/tests/unit/test_approx.py b/tests/unit/test_approx.py new file mode 100644 index 00000000..772dbc77 --- /dev/null +++ b/tests/unit/test_approx.py @@ -0,0 +1,237 @@ +import numpy as np +import pytest + +from como.approx import _coerce_to_float_array, _regularize_values, approx + + +class TestCoerceToFloatArray: + """Tests for the _coerce_to_float_array helper function.""" + + def test_converts_int_list_to_array(self): + result = _coerce_to_float_array([1, 2, 3]) + assert isinstance(result, np.ndarray) + assert result.dtype == float + np.testing.assert_array_equal(result, [1.0, 2.0, 3.0]) + + def test_converts_float_list_to_array(self): + result = _coerce_to_float_array([1.0, 2.0, 3.0]) + assert isinstance(result, np.ndarray) + assert result.dtype == float + 
np.testing.assert_array_equal(result, [1.0, 2.0, 3.0]) + + +class TestRegularizeValues: + """Tests for the _regularize_values function.""" + + def test_removes_na_pairs_when_na_rm_true(self): + x: list[float] = np.array([1.0, 2.0, np.nan, 4.0]) + y: list[float] = np.array([10.0, np.nan, 30.0, 40.0]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + np.testing.assert_array_equal(result.x, [1.0, 4.0]) + np.testing.assert_array_equal(result.y, [10.0, 40.0]) + + def test_raises_error_with_na_in_x_when_na_rm_false(self): + x: list[float] = np.array([1.0, np.nan, 3.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + with pytest.raises(ValueError, match="NA values in x are not allowed"): + _regularize_values(x, y, na_rm=False, ties="mean") + + def test_sorts_by_x(self): + x: list[float] = np.array([3.0, 1.0, 2.0]) + y: list[float] = np.array([30.0, 10.0, 20.0]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + np.testing.assert_array_equal(result.x, [1.0, 2.0, 3.0]) + np.testing.assert_array_equal(result.y, [10.0, 20.0, 30.0]) + + def test_aggregates_duplicates_with_mean(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [15.0, 30.0]) + + def test_aggregates_duplicates_with_first(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties="first") + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [10.0, 30.0]) + + def test_aggregates_duplicates_with_last(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties="last") + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [20.0, 30.0]) + + def test_handles_empty_arrays(self): + x: list[float] = np.array([]) + y: list[float] = np.array([]) + result = _regularize_values(x, y, na_rm=True, ties="mean") + assert result.x.size == 0 + assert result.y.size == 0 + assert result.not_na.size == 0 + + def test_callable_ties_function(self): + x: list[float] = np.array([1.0, 1.0, 2.0]) + y: list[float] = np.array([10.0, 20.0, 30.0]) + result = _regularize_values(x, y, na_rm=True, ties=np.sum) + np.testing.assert_array_equal(result.x, [1.0, 2.0]) + np.testing.assert_array_equal(result.y, [30.0, 30.0]) + + +class TestApprox: + """Tests for the main approx function.""" + + def test_basic_linear_interpolation(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5, 2.5] + result = approx(x, y, xout=xout) + np.testing.assert_array_almost_equal(result.y, [15.0, 25.0]) + + def test_one_argument_form(self): + y: list[float] = [10.0, 20.0, 30.0] + result = approx(y, xout=[1.5, 2.5]) + np.testing.assert_array_almost_equal(result.y, [15.0, 25.0]) + + def test_default_n_points(self): + x: list[float] = [1.0, 5.0] + y: list[float] = [10.0, 50.0] + result = approx(x, y) + assert len(result.x) == 50 + assert len(result.y) == 50 + + def test_custom_n_points(self): + x: list[float] = [1.0, 5.0] + y: list[float] = [10.0, 50.0] + result = approx(x, y, n=10) + assert len(result.x) == 10 + + def test_extrapolation_rule_1(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 
3.5] + result = approx(x, y, xout=xout, rule=1) + assert np.isnan(result.y[0]) + assert np.isnan(result.y[1]) + + def test_extrapolation_rule_2(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 3.5] + result = approx(x, y, xout=xout, rule=2) + assert result.y[0] == 10.0 + assert result.y[1] == 30.0 + + def test_yleft_yright_parameters(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 3.5] + result = approx(x, y, xout=xout, yleft=5.0, yright=35.0) + assert result.y[0] == 5.0 + assert result.y[1] == 35.0 + + def test_constant_method_f_0(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, method="constant", f=0.0) + assert result.y[0] == 10.0 + + def test_constant_method_f_1(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, method="constant", f=1.0) + assert result.y[0] == 20.0 + + def test_constant_method_f_05(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, method="constant", f=0.5) + assert result.y[0] == 15.0 + + def test_exact_match_returns_exact_value(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [2.0] + result = approx(x, y, xout=xout) + assert result.y[0] == 20.0 + + def test_na_in_xout_returns_nan(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [np.nan, 2.0] + result = approx(x, y, xout=xout) + assert np.isnan(result.y[0]) + assert result.y[1] == 20.0 + + def test_method_numeric_codes(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + xout: list[float] = [1.5] + result1 = approx(x, y, xout=xout, method=1) + result2 = approx(x, y, xout=xout, method=2, f=0.0) + assert result1.y[0] == 15.0 + assert result2.y[0] == 10.0 + + def test_raises_error_different_length_xy(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0] + with pytest.raises(ValueError, match="x and y must have same length"): + approx(x, y) + + def test_raises_error_invalid_method(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + with pytest.raises(ValueError, match="invalid interpolation method"): + approx(x, y, method="invalid") + + def test_raises_error_invalid_method_code(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + with pytest.raises(ValueError, match="invalid interpolation method"): + approx(x, y, method=3) + + def test_raises_error_invalid_f(self): + x: list[float] = [1.0, 2.0] + y: list[float] = [10.0, 20.0] + with pytest.raises(ValueError, match="invalid f value"): + approx(x, y, method="constant", f=2.0) + + def test_raises_error_need_two_points_for_linear(self): + x: list[float] = [1.0] + y: list[float] = [10.0] + with pytest.raises(ValueError, match="need at least two non-NA values"): + approx(x, y, method="linear") + + def test_raises_error_zero_points(self): + x: list[float] = [] + y: list[float] = [] + with pytest.raises(ValueError, match="zero non-NA points"): + approx(x, y, method="constant") + + def test_handles_ties_mean(self): + x: list[float] = [1.0, 1.0, 2.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [1.5] + result = approx(x, y, xout=xout, ties="mean") + assert result.y[0] == 22.5 + + def test_rule_as_list(self): + 
x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, 20.0, 30.0] + xout: list[float] = [0.5, 3.5] + result = approx(x, y, xout=xout, rule=[1, 2]) + assert np.isnan(result.y[0]) + assert result.y[1] == 30.0 + + def test_na_rm_false_with_na_in_y(self): + x: list[float] = [1.0, 2.0, 3.0] + y: list[float] = [10.0, np.nan, 30.0] + xout: list[float] = [2.5] + result = approx(x, y, xout=xout, na_rm=False) + # After filtering out NA, should interpolate between 1.0->10.0 and 3.0->30.0 + assert result.y[0] == 25.0 From 0891f010066f7bfd42057636c8795ff924847371 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:14:15 -0600 Subject: [PATCH 46/52] feat(test): added `density` tests Signed-off-by: Josh Loecker --- tests/unit/test_density.py | 226 +++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 tests/unit/test_density.py diff --git a/tests/unit/test_density.py b/tests/unit/test_density.py new file mode 100644 index 00000000..c4d88721 --- /dev/null +++ b/tests/unit/test_density.py @@ -0,0 +1,226 @@ +from typing import Literal, cast + +import numpy as np +import numpy.typing as npt +import pytest +from numpy.testing import assert_allclose, assert_array_equal + +from como.density import DensityResult, bin_distance, density, dnorm, nrd0 + +KERNEL_TYPE = Literal["gaussian", "epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"] + + +class TestBinDistance: + def test_basic_binning(self): + x: npt.NDArray[float] = np.array([0.5, 1.5, 2.5]) + weights: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=3, n=4) + + assert result.shape == (8,) + assert np.all(result >= 0) + + def test_weighted_binning(self): + x: npt.NDArray[float] = np.array([1.0, 2.0]) + weights: npt.NDArray[float] = np.array([2.0, 1.0]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=3, n=4) + + assert result.shape == (8,) + assert result.sum() > 0 + + def test_empty_array(self): + x: npt.NDArray[float] = np.array([]) + weights: npt.NDArray[float] = np.array([]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=1, n=2) + + assert result.shape == (4,) + assert_array_equal(result, np.zeros(4)) + + def test_out_of_bounds_handling(self): + x: npt.NDArray[float] = np.array([10.0]) + weights: npt.NDArray[float] = np.array([1.0]) + result: npt.NDArray[float] = bin_distance(x, weights, lo=0, up=1, n=2) + + assert result.shape == (4,) + + +class TestDnorm: + def test_standard_normal(self): + # dnorm(0) for standard normal should be ~0.3989 + result: float = dnorm(0.0, mean=0.0, sd=1.0) + expected: float = 1.0 / np.sqrt(2 * np.pi) + assert_allclose(result, expected, rtol=1e-10) + + def test_with_mean_and_sd(self): + result: float = dnorm(5.0, mean=5.0, sd=2.0) + expected: float = 1.0 / (2.0 * np.sqrt(2 * np.pi)) + assert_allclose(result, expected, rtol=1e-10) + + def test_log_density(self): + result: float = dnorm(0.0, mean=0.0, sd=1.0, log=True) + expected: float = np.log(1.0 / np.sqrt(2 * np.pi)) + assert_allclose(result, expected, rtol=1e-10) + + def test_nan_inputs(self): + assert np.isnan(dnorm(np.nan)) + assert np.isnan(dnorm(0.0, mean=np.nan)) + assert np.isnan(dnorm(0.0, sd=np.nan)) + + def test_negative_sd(self): + result: float = dnorm(0.0, sd=-1.0) + assert np.isnan(result) + + def test_infinite_sd(self): + result: float = dnorm(0.0, sd=np.inf) + assert result == 0.0 or result == -np.inf + + def test_zero_sd_at_mean(self): + result: float = dnorm(5.0, mean=5.0, 
sd=0.0) + assert np.isinf(result) + + def test_zero_sd_away_from_mean(self): + result: float = dnorm(5.0, mean=3.0, sd=0.0, log=False) + assert result == 0.0 + + def test_fast_dnorm_flag(self): + result_fast: float = dnorm(1.0, fast_dnorm=True) + result_slow: float = dnorm(1.0, fast_dnorm=False) + assert_allclose(result_fast, result_slow, rtol=1e-5) + + def test_large_values(self): + result: float = dnorm(100.0, mean=0.0, sd=1.0) + assert result >= 0.0 + assert result < 1e-10 + + +class TestNrd0: + def test_basic_calculation(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: float = nrd0(x) + assert result > 0 + assert np.isfinite(result) + + def test_with_constant_values(self): + x: npt.NDArray[float] = np.array([5.0, 5.0, 5.0, 5.0]) + result: float = nrd0(x) + assert result > 0 + assert np.isfinite(result) + + def test_single_nonzero_value(self): + x: npt.NDArray[float] = np.array([0.0, 7.0]) + result: float = nrd0(x) + assert result > 0 + + def test_all_zeros(self): + # if the input array is changed from a shape of `(3,)`, the result will change + # this is because `nrd0` takes the length of the input into account when calculating bandwidth + x: npt.NDArray[float] = np.array([0.0, 0.0, 0.0], dtype=float) + result: float = nrd0(x) + assert result == 0.7224674055842076 + + def test_insufficient_data(self): + with pytest.raises(ValueError, match="need at least 2 data points"): + nrd0(np.array([1.0])) + + def test_empty_array(self): + with pytest.raises(ValueError, match="need at least 2 data points"): + nrd0(np.array([])) + + +class TestDensity: + def test_basic_density(self): + x: npt.NDArray[float] = np.random.normal(0, 1, 100) + result: DensityResult = density(x) + + assert isinstance(result, DensityResult) + assert len(result.x) == 512 + assert len(result.y) == 512 + assert result.bw > 0 + assert np.all(result.y >= 0) + + def test_custom_bandwidth(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: DensityResult = density(x, bw=0.5) + + assert_allclose(result.bw, 0.5, rtol=1e-10) + + def test_with_weights(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + weights: npt.NDArray[float] = np.array([0.1, 0.2, 0.4, 0.2, 0.1]) + result: DensityResult = density(x, weights=weights, n=5) + + assert len(result.x) == 5 + assert np.all(result.y >= 0) + + def test_custom_grid_range(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: DensityResult = density(x, from_=0, to_=6, n=100) + assert result.x[0] == 0 + assert result.x[-1] == 6 + assert len(result.x) == 100 + + def test_different_kernels(self): + x: npt.NDArray[float] = np.random.normal(0, 1, 50) + + for kernel in ["epanechnikov", "rectangular", "triangular", "biweight", "cosine", "optcosine"]: + with pytest.raises(NotImplementedError, match=f"Only 'gaussian' kernel is implemented; got '{kernel}'"): + density(x, kernel=cast(KERNEL_TYPE, kernel)) + + def test_kernel_only_mode(self): + result: npt.NDArray[float] = density([1, 2, 3], kernel="gaussian", kernel_only=True) + expected: float = 1 / (2 * np.sqrt(np.pi)) + assert isinstance(result, float) + assert_allclose(result, expected, rtol=1e-10) + + def test_na_removal(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.nan, 4.0, 5.0]) + result: DensityResult = density(x, remove_na=True) + + assert len(result.x) == 512 + assert np.all(np.isfinite(result.y)) + + def test_na_without_removal(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.nan, 4.0, 5.0]) + + with 
pytest.raises(ValueError, match="NA values found"): + density(x, remove_na=False) + + def test_insufficient_data_for_nrd0(self): + with pytest.raises(ValueError, match="at least two points"): + density([1.0], bw="nrd0") + + def test_adjust_parameter(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result1: DensityResult = density(x, bw=1.0, adjust=1.0) + result2: DensityResult = density(x, bw=1.0, adjust=2.0) + + assert_allclose(result2.bw, 2.0 * result1.bw, rtol=1e-10) + + def test_invalid_bandwidth_string(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + + with pytest.raises(TypeError, match="must be a number or 'nrd0'"): + density(x, bw="invalid") + + def test_negative_weights(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + weights: npt.NDArray[float] = np.array([1.0, -1.0, 1.0]) + + with pytest.raises(ValueError, match="Negative values found"): + density(x, weights=weights) + + def test_infinite_values_in_x(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.inf, 4.0, 5.0]) + result: DensityResult = density(x) + + assert np.all(np.isfinite(result.y)) + + def test_result_named_tuple_fields(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + result: DensityResult = density(x, n=100) + + assert hasattr(result, "x") + assert hasattr(result, "y") + assert hasattr(result, "x_grid") + assert hasattr(result, "y_grid") + assert hasattr(result, "bw") + assert hasattr(result, "n") From 330fc445639ecca62e697df4cd40daf4954e1683 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:14:29 -0600 Subject: [PATCH 47/52] feat(test): added `find_peaks` tests Signed-off-by: Josh Loecker --- tests/unit/test_peak_finder.py | 204 +++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 tests/unit/test_peak_finder.py diff --git a/tests/unit/test_peak_finder.py b/tests/unit/test_peak_finder.py new file mode 100644 index 00000000..7d2d730d --- /dev/null +++ b/tests/unit/test_peak_finder.py @@ -0,0 +1,204 @@ +import numpy as np +import numpy.typing as npt +import pandas as pd +import pytest + +from como.peak_finder import ( + _encode_signs, + _enforce_minimum_peak_distance, + _validate_args, + find_peaks, +) + + +class TestValidateArgs: + def test_multidimensional_array_raises_error(self): + x: npt.NDArray[float] = np.array([[1, 2], [3, 4]]) + with pytest.raises(ValueError, match="Expected a 1D array, got 2D array instead"): + _validate_args(x, 1, 1, "0", 0.0, 1, 0.0) + + def test_nan_values_raise_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, np.nan, 4.0]) + with pytest.raises(ValueError, match="Input x contains NaNs"): + _validate_args(x, 1, 1, "0", 0.0, 1, 0.0) + + def test_negative_nups_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'nups' must be non-negative"): + _validate_args(x, -1, 1, "0", 0.0, 1, 0.0) + + def test_negative_ndowns_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'ndowns' must be non-negative"): + _validate_args(x, 1, -1, "0", 0.0, 1, 0.0) + + def test_invalid_zero_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'zero' must be '0', '\\+', or '-'"): + _validate_args(x, 1, 1, "x", 0.0, 1, 0.0) # type: ignore[invalid-type-argument] + + def test_negative_min_peak_height_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with 
pytest.raises(ValueError, match="Argument 'min_peak_height' must be non-negative"): + _validate_args(x, 1, 1, "0", -1.0, 1, 0.0) + + def test_negative_min_peak_distance_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'minpeakdistance' must be non-negative"): + _validate_args(x, 1, 1, "0", 0.0, -1, 0.0) + + def test_negative_threshold_raises_error(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + with pytest.raises(ValueError, match="Argument 'threshold' must be non-negative"): + _validate_args(x, 1, 1, "0", 0.0, 1, -1.0) + + def test_valid_args_does_not_raise(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0]) + _validate_args(x, 1, 1, "0", 0.0, 1, 0.0) + + +class TestEncodeSigns: + def test_increasing_sequence(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 3.0, 4.0]) + result: str = _encode_signs(x, "0") + assert result == "+++" + + def test_decreasing_sequence(self): + x: npt.NDArray[float] = np.array([4.0, 3.0, 2.0, 1.0]) + result: str = _encode_signs(x, "0") + assert result == "---" + + def test_flat_sequence_with_zero(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "0") + assert result == "00" + + def test_flat_sequence_with_plus(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "+") + assert result == "++" + + def test_flat_sequence_with_minus(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "-") + assert result == "--" + + def test_flat_sequence_with_dollarsign(self): + x: npt.NDArray[float] = np.array([1.0, 1.0, 1.0]) + result: str = _encode_signs(x, "$") + assert result == "$$" + + def test_mixed_sequence(self): + x: npt.NDArray[float] = np.array([1.0, 2.0, 2.0, 3.0, 2.0]) + result: str = _encode_signs(x, "0") + assert result == "+0+-" + + +class TestEnforceMinimumPeakDistance: + def test_inplace_removes_close_peaks(self): + df: pd.DataFrame = pd.DataFrame( + { + "height": [10.0, 8.0, 5.0], + "peak_idx": [0, 2, 10], + "start_idx": [0, 1, 9], + "end_idx": [1, 3, 11], + } + ) + _enforce_minimum_peak_distance(df, min_peak_distance=5, inplace=True) + assert len(df) == 2 + assert df["peak_idx"].tolist() == [0, 10] + + def test_not_inplace_returns_new_dataframe(self): + df: pd.DataFrame = pd.DataFrame( + { + "height": [10.0, 8.0, 5.0], + "peak_idx": [0, 2, 10], + "start_idx": [0, 1, 9], + "end_idx": [1, 3, 11], + } + ) + result = _enforce_minimum_peak_distance(df, min_peak_distance=5, inplace=False) + assert len(result) == 2 + assert len(df) == 3 # original unchanged + assert result["peak_idx"].tolist() == [0, 10] + + def test_all_peaks_sufficiently_spaced(self): + df: pd.DataFrame = pd.DataFrame( + { + "height": [10.0, 8.0], + "peak_idx": [0, 10], + "start_idx": [0, 9], + "end_idx": [1, 11], + } + ) + _enforce_minimum_peak_distance(df, min_peak_distance=5, inplace=True) + assert len(df) == 2 + + +class TestFindPeaks: + def test_simple_peak(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 1 + assert result.iloc[0]["peak_idx"] == 3 + assert result.iloc[0]["height"] == 3.0 + + def test_multiple_peaks(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 3 + + def test_no_peaks(self): + x: npt.NDArray[float] = 
np.asarray([1.0, 2.0, 3.0, 4.0, 5.0], dtype=float) + result: pd.DataFrame = find_peaks(x) + assert len(result) == 0 + assert list(result.columns) == ["height", "peak_idx", "start_idx", "end_idx"] + + def test_min_peak_height_filter(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 0.0, 5.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=2.0) + assert len(result) == 1 + assert result.iloc[0]["height"] == 5.0 + + def test_nups_and_ndowns(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, nups=2, ndowns=2, min_peak_height=0.0) + assert len(result) == 1 + + def test_npeaks_limits_output(self): + x: npt.NDArray[float] = np.asarray([0.0, 5.0, 0.0, 3.0, 0.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, npeaks=2, min_peak_height=0.0) + assert len(result) == 2 + + def test_sortstr_orders_by_height(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 0.0, 5.0, 0.0, 3.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, sortstr=True, min_peak_height=0.0) + heights = result["height"].tolist() + assert heights == sorted(heights, reverse=True) + + def test_min_peak_distance(self): + x: npt.NDArray[float] = np.asarray([0.0, 10.0, 0.0, 8.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_distance=3, min_peak_height=0.0) + assert len(result) == 1 + assert result.iloc[0]["height"] == 10.0 + + def test_threshold_filter(self): + x: npt.NDArray[float] = np.asarray([1.0, 2.0, 1.5, 5.0, 1.0], dtype=float) + result: pd.DataFrame = find_peaks(x, threshold=2.0, min_peak_height=0.0) + assert len(result) == 1 + assert result.iloc[0]["height"] == 5.0 + + def test_accepts_list_input(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 1 + + def test_accepts_numpy_array(self): + x: npt.NDArray[float] = np.array([0.0, 1.0, 2.0, 1.0, 0.0]) + result: pd.DataFrame = find_peaks(x, min_peak_height=0.0) + assert len(result) == 1 + + def test_custom_peak_pattern(self): + x: npt.NDArray[float] = np.asarray([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0], dtype=float) + result: pd.DataFrame = find_peaks(x, peak_pattern=r"[+]{3,}[-]{3,}", min_peak_height=0.0) + assert len(result) == 1 From 46178635ef66cec27df5d34afa9f93998e13d20f Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:15:41 -0600 Subject: [PATCH 48/52] feat(dev): use named tuples for return types Signed-off-by: Josh Loecker --- main/como/approx.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 3ef69a74..7bc8f53a 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -1,9 +1,22 @@ from __future__ import annotations from collections.abc import Callable, Sequence -from typing import Any +from typing import Literal, NamedTuple import numpy as np +import numpy.typing as npt + + +class RegularizedArray(NamedTuple): + x: npt.NDArray[np.number] + y: npt.NDArray[np.number] + not_na: npt.NDArray[bool] + kept_na: bool + + +class Approx(NamedTuple): + x: npt.NDArray[float] + y: npt.NDArray[float] def _coerce_to_float_array(a): @@ -39,8 +52,7 @@ def _regularize_values(x: np.ndarray, y: np.ndarray, ties, na_rm: bool) -> dict[ kept_na = np.isnan(y).any() if x.size == 0: - # THIS IS THE CORRECTED LINE: - return {"x": x, "y": y, "not_na": np.array([], dtype=bool), "kept_na": kept_na} + return 
RegularizedArray(x=x, y=y, not_na=np.array([], dtype=bool), kept_na=kept_na) # Use a stable sort (mergesort) to match R's order() order = np.argsort(x, kind="mergesort") @@ -88,6 +100,7 @@ def agg_fn(a): # Check if any NaNs remain in the *aggregated* y values kept_na_final = np.any(~not_na) if np.any(np.isnan(y_agg)) else False return {"x": unique_x, "y": y_agg, "not_na": not_na, "kept_na": kept_na_final} + return RegularizedArray(x=unique_x, y=y_agg, not_na=not_na, kept_na=kept_na_final) def approx( @@ -102,7 +115,7 @@ def approx( f: float = 0.0, ties: str | Callable[[np.ndarray], float] = "mean", na_rm: bool = True, -) -> dict[str, np.ndarray]: +) -> Approx: """Faithful Python port of R's `approx` function. This implementation aims to replicate the behavior of R's `approx` @@ -133,7 +146,7 @@ def approx( are propagated. Returns: - dict with: + `Approx` class with: - 'x': numpy array of xout used - 'y': numpy array of interpolated values """ @@ -179,6 +192,7 @@ def approx( x_reg = r["x"] y_reg = r["y"] not_na_mask = r["not_na"] + r: RegularizedArray = _regularize_values(x_arr, y_arr, na_rm=na_rm, ties=ties) # no_na is True if we don't have to worry about NAs in y_reg no_na = na_rm or (not r["kept_na"]) # nx is the number of *valid* (non-NA) points for interpolation @@ -307,4 +321,4 @@ def approx( res[mid_mask] = res_mid yout[mask_valid] = res - return {"x": xout_arr, "y": yout} + return Approx(x=xout_arr, y=yout) From 688ad8ed282873cb1101e88aece23e78a2c121a5 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:17:34 -0600 Subject: [PATCH 49/52] style: update docstrings and types Signed-off-by: Josh Loecker --- main/como/approx.py | 192 +++++++++++++++++++++++--------------------- 1 file changed, 101 insertions(+), 91 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 7bc8f53a..14c69e3e 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -19,25 +19,46 @@ class Approx(NamedTuple): y: npt.NDArray[float] -def _coerce_to_float_array(a): - """Helper to ensure input is a 1D float array.""" +def _coerce_to_float_array(a: Sequence[int | float | np.number]): + """Coerce input to a 1D float array. + + Args: + a: the array to coerce + + Returns: + A floating point 1D array. + """ arr = np.asarray(a, dtype=float) if arr.ndim != 1: arr = arr.ravel() return arr -def _regularize_values(x: np.ndarray, y: np.ndarray, ties, na_rm: bool) -> dict[str, Any]: - """Minimal reimplementation of R's regularize.values() used by approx(). +def _regularize_values( + x: npt.NDArray[np.number], + y: npt.NDArray[np.number], + *, + na_rm: bool, + ties: Callable[[np.ndarray], float] | Literal["mean", "first", "last", "min", "max", "median", "sum"] = "mean", +) -> RegularizedArray: + """Reimplementation of R's regularize.values() used by approx(). - Removes NA pairs if na_rm is True (else requires x to have no NA). - Sorts by x (stable). - Collapses duplicate x via `ties` aggregator. - Returns dict with: - x: sorted unique x - y: aggregated y aligned with x - not_na: boolean mask of y that are not NaN - kept_na: True iff any NaN remained in y after regularization + + Args: + x: the x values to regularize + y: ties: the corresponding y values + na_rm: should NA values be removed? 
+ ties: how to handle duplicate x values; can be a string or a callable + + Returns: + A NamedTuple with: + - x: sorted unique x + - y: aggregated y aligned with x + - not_na: boolean mask of y that are not NaN + - kept_na: True iff any NaN remained in y after regularization """ if na_rm: # Remove pairs where x or y is NA @@ -98,8 +119,7 @@ def agg_fn(a): not_na = ~np.isnan(y_agg) # Check if any NaNs remain in the *aggregated* y values - kept_na_final = np.any(~not_na) if np.any(np.isnan(y_agg)) else False - return {"x": unique_x, "y": y_agg, "not_na": not_na, "kept_na": kept_na_final} + kept_na_final = bool(np.any(~not_na) if np.any(np.isnan(y_agg)) else False) return RegularizedArray(x=unique_x, y=y_agg, not_na=not_na, kept_na=kept_na_final) @@ -111,9 +131,9 @@ def approx( n: int = 50, yleft: float | None = None, yright: float | None = None, - rule: int | Sequence[int] = 1, + rule: int | Sequence[int] | npt.NDArray[int] = 1, f: float = 0.0, - ties: str | Callable[[np.ndarray], float] = "mean", + ties: Callable[[np.ndarray], float] | Literal["mean", "first", "last", "min", "max", "median", "sum"] = "mean", na_rm: bool = True, ) -> Approx: """Faithful Python port of R's `approx` function. @@ -127,23 +147,15 @@ def approx( y: y-coordinates. If None, `x` is treated as `y` and `x` becomes `1..n`. xout: Points at which to interpolate. method: Interpolation method. "linear" (1) or "constant" (2). - n: If `xout` is not provided, interpolation happens at `n` - equally spaced points spanning the range of `x`. - yleft: Value to use for extrapolation to the left. - Defaults to `NA` (np.nan) if `rule` is 1. - yright: Value to use for extrapolation to the right. - Defaults to `NA` (np.nan) if `rule` is 1. + n: If `xout` is not provided, interpolation happens at `n` equally spaced points spanning the range of `x`. + yleft: Value to use for extrapolation to the left. Defaults to `NA` (np.nan) if `rule` is 1. + yright: Value to use for extrapolation to the right. Defaults to `NA` (np.nan) if `rule` is 1. rule: Extrapolation rule. - 1: Return `np.nan` for points outside the `x` range. - 2: Use `yleft` and `yright` for points outside the range. - f: For `method="constant"`, determines the split point. - `f=0` is left-step, `f=1` is right-step, `f=0.5` is midpoint. - ties: How to handle duplicate `x` values. - Can be 'mean', 'first', 'last', 'min', 'max', 'median', 'sum', - or a callable function. - na_rm: If True, `NA` pairs are removed before interpolation. - If False, `NA`s in `x` cause an error, `NA`s in `y` - are propagated. + f: For `method="constant"`, determines the split point. `f=0` is left-step, `f=1` is right-step, `f=0.5` is midpoint. + ties: How to handle duplicate `x` values. Can be 'mean', 'first', 'last', 'min', 'max', 'median', 'sum', or a callable function. + na_rm: If True, `NA` pairs are removed before interpolation. If False, `NA`s in `x` cause an error, `NA`s in `y` are propagated. 
Returns: `Approx` class with: @@ -177,26 +189,24 @@ def approx( raise ValueError("invalid interpolation method") # --- Rule normalization --- - if isinstance(rule, list | tuple | np.ndarray): - rlist = list(rule) - if not (1 <= len(rlist) <= 2): + if isinstance(rule, Sequence | np.ndarray): + rule_list: list[int] = list(rule) # type: ignore[invalid-argument-type] # This is a valid argument type, not sure why ty is upset + if not (1 <= len(rule_list) <= 2): raise ValueError("`rule` must have length 1 or 2") - if len(rlist) == 1: - rlist = [rlist[0], rlist[0]] + if len(rule_list) == 1: + rule_list = [rule_list[0], rule_list[0]] else: - rlist = [rule, rule] + rule_list = [rule, rule] - # --- Regularize values --- # Sort by x, collapse ties, and handle NAs like R's regularize.values() - r = _regularize_values(x_arr, y_arr, ties, na_rm) - x_reg = r["x"] - y_reg = r["y"] - not_na_mask = r["not_na"] r: RegularizedArray = _regularize_values(x_arr, y_arr, na_rm=na_rm, ties=ties) + x_reg: npt.NDArray[float] = r.x + y_reg: npt.NDArray[float] = r.y + not_na_mask: npt.NDArray[bool] = r.not_na # no_na is True if we don't have to worry about NAs in y_reg - no_na = na_rm or (not r["kept_na"]) + no_na: bool = na_rm or (not r.kept_na) # nx is the number of *valid* (non-NA) points for interpolation - nx = x_reg.size if no_na else int(np.count_nonzero(not_na_mask)) + nx: int = x_reg.size if no_na else int(np.count_nonzero(not_na_mask)) if np.isnan(nx): raise ValueError("invalid length(x)") @@ -208,98 +218,98 @@ def approx( if nx == 0: raise ValueError("zero non-NA points") - # --- Set extrapolation values (yleft/yright) --- + # set extrapolation values (yleft/yright) # This logic matches R's C code (R_approxtest) if no_na: - y_first = y_reg[0] - y_last = y_reg[-1] + y_first: float = y_reg[0] + y_last: float = y_reg[-1] else: # Find first and last non-NA y values - y_valid = y_reg[not_na_mask] - y_first = y_valid[0] - y_last = y_valid[-1] + y_valid: npt.NDArray[float] = y_reg[not_na_mask] + y_first: float = y_valid[0] + y_last: float = y_valid[-1] - yleft_val = (np.nan if int(rlist[0]) == 1 else y_first) if yleft is None else float(yleft) - yright_val = (np.nan if int(rlist[1]) == 1 else y_last) if yright is None else float(yright) + yleft_val: float = (np.nan if int(rule_list[0]) == 1 else y_first) if yleft is None else float(yleft) + yright_val: float = (np.nan if int(rule_list[1]) == 1 else y_last) if yright is None else float(yright) - # --- Fabricate xout if missing --- + # fabricate xout if it is missing if xout is None: if n <= 0: raise ValueError("'approx' requires n >= 1") if no_na: - x_first = x_reg[0] - x_last = x_reg[nx - 1] + x_first: float = x_reg[0] + x_last: float = x_reg[nx - 1] else: - x_valid = x_reg[not_na_mask] - x_first = x_valid[0] - x_last = x_valid[-1] - xout_arr = np.linspace(x_first, x_last, num=int(n), dtype=float) + x_valid: npt.NDArray[float] = x_reg[not_na_mask] + x_first: float = x_valid[0] + x_last: float = x_valid[-1] + xout_arr: npt.NDArray[float] = np.linspace(x_first, x_last, num=int(n), dtype=float) else: - xout_arr = _coerce_to_float_array(xout) + xout_arr: npt.NDArray[float] = _coerce_to_float_array(xout) - # --- C_ApproxTest checks --- + # replicate R's C_ApproxTest checks if method_code == 2 and (not np.isfinite(f) or f < 0.0 or f > 1.0): - raise ValueError("approx(): invalid f value") + raise ValueError("approx(): invalid f value; with `method=2`, `f` must be in the range [0,1] (inclusive) or NA") if not no_na: # R re-filters x and y here if NAs remained - x_reg = 
x_reg[not_na_mask] - y_reg = y_reg[not_na_mask] + x_reg: npt.NDArray[float] = x_reg[not_na_mask] + y_reg: npt.NDArray[float] = y_reg[not_na_mask] - # --- Interpolation --- - # This section vectorized the logic from R's approx1 and R_approxfun - yout = np.empty_like(xout_arr, dtype=float) - mask_nan_xout = np.isnan(xout_arr) + # perform interpolation + # this section is a vectorized form of the logic from R's approx1 and R_approxfun + yout: npt.NDArray[float] = np.empty_like(xout_arr, dtype=float) + mask_nan_xout: npt.NDArray[bool] = np.isnan(xout_arr) yout[mask_nan_xout] = np.nan - mask_valid = ~mask_nan_xout + mask_valid: npt.NDArray[bool] = ~mask_nan_xout if x_reg.size == 0: # No valid points to interpolate against yout[mask_valid] = np.nan - return {"x": xout_arr, "y": yout} + return Approx(x=xout_arr, y=yout) - xv = xout_arr[mask_valid] - left_mask = xv < x_reg[0] - right_mask = xv > x_reg[-1] - mid_mask = ~(left_mask | right_mask) + xv: npt.NDArray[float] = xout_arr[mask_valid] + left_mask: npt.NDArray[bool] = xv < x_reg[0] + right_mask: npt.NDArray[bool] = xv > x_reg[-1] + mid_mask: npt.NDArray[bool] = ~(left_mask | right_mask) - res = np.empty_like(xv) + res: npt.NDArray[float] = np.empty_like(xv, dtype=float) res[left_mask] = yleft_val res[right_mask] = yright_val if np.any(mid_mask): - xm = xv[mid_mask] + xm: npt.NDArray[float] = xv[mid_mask] # Find indices # j_right[k] = index of first x_reg > xm[k] - j_right = np.searchsorted(x_reg, xm, side="right") + j_right: npt.NDArray[int] = np.searchsorted(x_reg, xm, side="right") # j_left[k] = index of first x_reg >= xm[k] - j_left = np.searchsorted(x_reg, xm, side="left") + j_left: npt.NDArray[int] = np.searchsorted(x_reg, xm, side="left") # Points that exactly match an x_reg value - eq_mask = j_left != j_right + eq_mask: npt.NDArray[bool] = j_left != j_right # Points that fall between x_reg values - in_interval_mask = ~eq_mask + in_interval_mask: npt.NDArray[bool] = ~eq_mask - res_mid = np.empty_like(xm) + res_mid: npt.NDArray[float] = np.empty_like(xm, dtype=float) if np.any(eq_mask): # For exact matches, use the corresponding y_reg value # R's C code uses the 'j_left' index here - res_mid[eq_mask] = y_reg[j_left[eq_mask]] + res_mid[eq_mask] = y_reg[j_left[eq_mask]] # type: ignore[non-subscriptable] # we know this is of type npt.NDArray[float], not sure why ty is upset if np.any(in_interval_mask): # R's C code sets i = j-1, where j is the 'right' index - j = j_right[in_interval_mask] - i = j - 1 + j: npt.NDArray[float] = j_right[in_interval_mask] # type: ignore[non-subscriptable] # we know this is of type npt.NDArray[float], not sure why ty is upset + i: npt.NDArray[float] = j - 1 # Get the surrounding x and y values - xi = x_reg[i] - xj = x_reg[j] - yi = y_reg[i] - yj = y_reg[j] + xi: npt.NDArray[float] = x_reg[i] + xj: npt.NDArray[float] = x_reg[j] + yi: npt.NDArray[float] = y_reg[i] + yj: npt.NDArray[float] = y_reg[j] if method_code == 1: # linear - t = (xm[in_interval_mask] - xi) / (xj - xi) + t: npt.NDArray[float] = (xm[in_interval_mask] - xi) / (xj - xi) res_mid[in_interval_mask] = yi + (yj - yi) * t else: # constant # This matches R_approxfun's constant logic @@ -308,14 +318,14 @@ def approx( elif f == 1.0: res_mid[in_interval_mask] = yj else: - # R computes (1-f)*yi + f*yj, but carefully - f1 = 1.0 - f - f2 = f - part = np.zeros_like(yi) + # computes R's (1-f)*yi + f*yj + f1: float = float(1.0 - f) + f2: float = float(f) + part: npt.NDArray[float] = np.zeros_like(yi, dtype=float) if f1 != 0.0: - part = part + yi * f1 + part: 
npt.NDArray[float] = part + yi * f1 if f2 != 0.0: - part = part + yj * f2 + part: npt.NDArray[float] = part + yj * f2 res_mid[in_interval_mask] = part res[mid_mask] = res_mid From 874fdf2dfb24542b96b67cce48d81a60196c5f3a Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:18:08 -0600 Subject: [PATCH 50/52] refactor: do not create temporary variable for aggregation function Signed-off-by: Josh Loecker --- main/como/approx.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 14c69e3e..17805253 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -84,24 +84,23 @@ def _regularize_values( if callable(ties): agg_fn = ties else: - ties_str = "mean" if ties is None else str(ties).lower() - if ties_str in ("mean", "avg", "average"): + if ties in ("mean", "avg", "average"): agg_fn = np.mean - elif ties_str in ("first", "left"): + elif ties in ("first", "left"): def agg_fn(a): return a[0] - elif ties_str in ("last", "right"): + elif ties in ("last", "right"): def agg_fn(a): return a[-1] - elif ties_str == "min": + elif ties == "min": agg_fn = np.min - elif ties_str == "max": + elif ties == "max": agg_fn = np.max - elif ties_str == "median": + elif ties == "median": agg_fn = np.median - elif ties_str == "sum": + elif ties == "sum": agg_fn = np.sum else: raise ValueError("Unsupported `ties`; use a callable or one of 'mean', 'first', 'last', 'min', 'max', 'median', 'sum'.") From c6a0015ed459a3df3242bd38867c516102d6b48d Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:20:27 -0600 Subject: [PATCH 51/52] style: ruff formatting Signed-off-by: Josh Loecker --- main/como/approx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/como/approx.py b/main/como/approx.py index 17805253..441a489f 100644 --- a/main/como/approx.py +++ b/main/como/approx.py @@ -84,14 +84,13 @@ def _regularize_values( if callable(ties): agg_fn = ties else: + # fmt: off if ties in ("mean", "avg", "average"): agg_fn = np.mean elif ties in ("first", "left"): - def agg_fn(a): return a[0] elif ties in ("last", "right"): - def agg_fn(a): return a[-1] elif ties == "min": @@ -104,6 +103,7 @@ def agg_fn(a): agg_fn = np.sum else: raise ValueError("Unsupported `ties`; use a callable or one of 'mean', 'first', 'last', 'min', 'max', 'median', 'sum'.") + # fmt: on # Find unique x values and their indices/counts unique_x, start_idx, counts = np.unique(x_sorted, return_index=True, return_counts=True) From ae60218a0661eed4bc778b787ae98ef5a2c6fb19 Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Mon, 3 Nov 2025 14:20:52 -0600 Subject: [PATCH 52/52] feat(dev): use named tuples for return types Signed-off-by: Josh Loecker --- main/como/density.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/main/como/density.py b/main/como/density.py index 0f75623a..810ced26 100644 --- a/main/como/density.py +++ b/main/como/density.py @@ -6,9 +6,8 @@ import numpy as np import numpy.typing as npt -from scipy.interpolate import interp1d -from como.approx import approx +from como.approx import Approx, approx class DensityResult(NamedTuple): @@ -116,7 +115,7 @@ def dnorm(x: float, mean: NUMBER = 0.0, sd: NUMBER = 1.0, log: bool = False, fas # underflow boundary boundary = np.sqrt(-2.0 * m_ln2 * (dbl_min_exp + 1 - dbl_mant_dig)) if a > boundary: - return float(0.0) + return 0.0 # Now, to get full accuracy, split x into two parts, # x = x1+x2, such that |x2| <= 2^-16. 
@@ -228,12 +227,15 @@ def density(
         elif kernel == "optcosine":
             return np.sqrt(1 - 8 / np.pi**2) * np.pi**2 / 16
 
+    if kernel != "gaussian":
+        raise NotImplementedError(f"Only 'gaussian' kernel is implemented; got '{kernel}'")
+
     x: npt.NDArray[float] = np.asarray(x, dtype=float)
 
     has_weights = weights is not None
     weights: npt.NDArray[float] | None = np.asarray(weights, float) if weights is not None else None
-    if has_weights and (weights is not None and weights.size != n):
-        raise ValueError(f"The length of provided weights does not match the length of x: {weights.size} != {n}")
+    if has_weights and (weights is not None and weights.size != x.size):
+        raise ValueError(f"The length of provided weights does not match the length of x: {weights.size} != {x.size}")
 
     x_na: npt.NDArray[np.bool_] = np.isnan(x)
     if np.any(x_na):
@@ -286,8 +288,9 @@ def density(
     if bw_calc <= 0:
         raise ValueError("Bandwidth 'bw' must be positive.")
 
-    from_ = float(from_ or min(x) - cut * bw_calc)
-    to_ = float(to_ or max(x) + cut * bw_calc)
+    # use `... if ... else` rather than `or`: `0` is falsy, so `from_ or <default>` would discard a user-provided 0 and fall back to the default
+    from_ = float(from_ if from_ is not None else min(x) - cut * bw_calc)
+    to_ = float(to_ if to_ is not None else max(x) + cut * bw_calc)
 
     if not np.isfinite(from_):
         raise ValueError("'from_' is not finite.")
@@ -313,11 +316,11 @@ def density(
     # xp=known x-coords, fp=known y-cords, x=unknown x-coords; returns interpolated (e.g., unknown) y-coords
     interp_x: npt.NDArray[float] = np.linspace(from_, to_, num=n_user)
-    interp_y: npt.NDArray[float] = approx(xords, kords, interp_x)
+    interp_y: Approx = approx(xords, kords, interp_x)
 
     return DensityResult(
         x=interp_x,
-        y=interp_y["y"],
+        y=interp_y.y,
         x_grid=xords,
         y_grid=kords,
         bw=bw_calc,