Add test, typing info, nits, imports.

justushelo · justushelo · commit 625c39edc662 · 2026-05-11T11:38:02.000+03:00
- Remove decomposition guardrail. - Use a single print for print_indices. - Add typing info to visualization.py. - Annotate the decomposition parameter as DecompositionResult in visualization.py. - Add IPython as an optional dependency in pyproject.toml and guard its import. - Add stacklevel=2 to warnings. - Add a function for plotting heterogeneity indices. - Work with ax instead of plt. - Delete unnecessary +. - Use logging instead of print in heterogeneity_indices.py. - Fix import order. Closes #46, Closes #47.
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,6 +41,10 @@ dashboard = [
     "cryptography",
 ]
 
+ipython = [
+    "ipython"
+]
+
 test = [
     "pytest",
     "pytest-cov",
@@ -55,7 +59,7 @@ doc = [
 ]
 
 dev = [
-    "simdec[doc,test,dashboard]",
+    "simdec[doc,test,dashboard, ipython]",
     "watchfiles",
     "pre-commit",
 ]
diff --git a/src/simdec/decomposition.py b/src/simdec/decomposition.py
@@ -65,7 +65,7 @@ def __reduce__(self):
 
 def decomposition(
     inputs: pd.DataFrame,
-    output: pd.DataFrame | np.ndarray,
+    output: pd.DataFrame,
     *,
     sensitivity_indices: np.ndarray,
     dec_limit: float | None = None,
@@ -116,11 +116,7 @@ def decomposition(
         inputs[cat_col] = codes
 
     inputs = inputs.to_numpy()
-
-    if hasattr(output, "to_numpy"):
-        output = output.to_numpy().flatten()
-    else:
-        output = np.asarray(output).flatten()
+    output = output.to_numpy().flatten()
 
     # 1. variables for decomposition
     var_order = np.argsort(sensitivity_indices)[::-1]
diff --git a/src/simdec/heterogeneity_indices.py b/src/simdec/heterogeneity_indices.py
@@ -1,9 +1,22 @@
-from .sensitivity_indices import sensitivity_indices
+from dataclasses import dataclass
+import logging
+
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
 
-__all__ = ["heterogeneity_indices"]
+import simdec as sd
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["heterogeneity_indices", "plot_heterogeneity"]
+
+
+@dataclass
+class HeterogeneityResult:
+    summary: pd.DataFrame
+    regional_profiles: pd.DataFrame
+    split_name: str
 
 
 def heterogeneity_indices(
@@ -12,9 +25,11 @@ def heterogeneity_indices(
     split_variable: str | pd.Series,
     n_subdivisions: int | None = None,
     plot: bool = False,
-) -> pd.DataFrame:
-    """
-    Compute sensitivity-based heterogeneity across subdivisions of a variable.
+) -> HeterogeneityResult:
+    """Heterogeneity indices.
+
+    Compute sensitivity-based heterogeneity across subdivisions
+    of a variable.
 
     Parameters
     ----------
@@ -27,12 +42,25 @@ def heterogeneity_indices(
     n_subdivisions : int, optional
         Number of regions for continuous variables. Defaults to 4.
     plot : bool, default False
-        If True, displays a stacked bar chart of regional sensitivities.
+        If True, displays a stacked bar chart of regional sensitivity profiles
+        by calling :func:`plot_heterogeneity`. The chart shows variance
+        contributions of each input across subdivisions of ``split_variable``,
+        ranked by global sensitivity indices. To capture the returned
+        ``matplotlib.axes.Axes`` object, call :func:`plot_heterogeneity`
+        directly on the result instead.
 
     Returns
-    ----------
-    summary : pd.Dataframe
-        A summary of calculated heterogeneity indices.
+    -------
+    res : HeterogeneityResult
+        An object with attributes:
+
+        summary : DataFrame
+            A summary of calculated heterogeneity indices.
+        regional_profiles : DataFrame
+            Regional sensitivity indices for each input across subdivisions.
+        split_name : str
+            The name of the variable used to split the data.
+
     """
     y = pd.Series(output).reset_index(drop=True)
     X = pd.DataFrame(inputs).reset_index(drop=True)
@@ -51,8 +79,9 @@ def heterogeneity_indices(
 
     # Determine if variable is categorical/binary
     is_categorical = (
-        pd.api.types.is_categorical_dtype(z)
+        isinstance(z.dtype, pd.CategoricalDtype)
         or pd.api.types.is_object_dtype(z)
+        or pd.api.types.is_string_dtype(z)
         or pd.api.types.is_bool_dtype(z)
         or n_unique <= 2
     )
@@ -89,7 +118,7 @@ def heterogeneity_indices(
             continue
 
         try:
-            res = sensitivity_indices(inputs=X_sub, output=y_sub)
+            res = sd.sensitivity_indices(inputs=X_sub, output=y_sub)
             si_vals = np.asarray(res.si).ravel()
 
             # Guard against NaN/Inf from degenerate sensitivity computation
@@ -105,11 +134,9 @@ def heterogeneity_indices(
             continue
 
     if skipped:
-        print(
-            f"[heterogeneity_indices] Skipped {len(skipped)} region(s) of '{split_name}':"
-        )
+        logger.info("Skipped %d region(s) of '%s':", len(skipped), split_name)
         for reg, n, reason in skipped:
-            print(f"  - region={reg!r}, n={n}, reason={reason}")
+            logger.info("  - region=%r, n=%d, reason=%s", reg, n, reason)
 
     if len(regional_profiles) < 2:
         total_regions = len(regions.cat.categories)
@@ -118,15 +145,15 @@ def heterogeneity_indices(
             f"Not enough valid subdivisions to compute heterogeneity: "
             f"{valid}/{total_regions} regions passed all checks for '{split_name}'.\n"
             f"Skipped regions:\n"
-            + "\n".join(f"  {r!r}: n={n}, {reason}" for r, n, reason in skipped)
-            + "\n\nTry: (1) reducing n_subdivisions, "
+            "\n".join(f"  {r!r}: n={n}, {reason} " for r, n, reason in skipped),
+            "\n\nTry: (1) reducing n_subdivisions, "
             "(2) using a different split_variable, or "
-            "(3) ensuring more samples per region."
+            "(3) ensuring more samples per region.",
         )
 
     regional_si = pd.concat(regional_profiles, axis=1)
 
-    res_global = sensitivity_indices(inputs=X, output=y)
+    res_global = sd.sensitivity_indices(inputs=X, output=y)
     overall_si = pd.Series(
         np.asarray(res_global.si).ravel(),
         index=X.columns,
@@ -143,29 +170,70 @@ def heterogeneity_indices(
     ).sort_values(by=hetero_col_name, ascending=False)
     summary.loc["SUM / TOTAL"] = [overall_si.sum(), total_hetero]
 
+    result = HeterogeneityResult(summary, regional_si, split_name)
+
     if plot:
-        plot_order = summary.index[:-1]
-        data_to_plot = regional_si.loc[plot_order].T
-
-        cmap = plt.get_cmap("terrain")
-        colors = [cmap(i) for i in np.linspace(0.05, 0.95, len(plot_order))]
-
-        _ = data_to_plot.plot(
-            kind="bar",
-            stacked=True,
-            figsize=(10, 6),
-            color=colors,
-            edgecolor="white",
-            width=0.8,
-        )
+        plot_heterogeneity(result)
+
+    return result
+
+
+def plot_heterogeneity(result: HeterogeneityResult, ax: plt.Axes = None) -> plt.Axes:
+    """Plot regional sensitivity profiles.
+
+    Parameters
+    ----------
+    result : HeterogeneityResult
+        The result object from heterogeneity_indices.
+    ax : matplotlib.axes.Axes, optional
+        Existing axes to plot on.
+
+    Returns
+    -------
+    ax : matplotlib.axes.Axes
+        The axes with the plot.
+
+    """
+    summary = result.summary
+    regional_si = result.regional_profiles
+    split_name = result.split_name
+
+    plot_order = summary.index[summary.index != "SUM / TOTAL"]
+    plot_order = (
+        summary.loc[plot_order].sort_values(by="Overall_SI", ascending=False).index
+    )
+
+    cmap = plt.colormaps["terrain"]
+    colors = [cmap(i) for i in np.linspace(0.05, 0.95, len(regional_si.index))]
+
+    data_to_plot = regional_si.loc[plot_order].T
+
+    if ax is None:
+        _, ax = plt.subplots(figsize=(10, 6))
+
+    data_to_plot.plot(
+        kind="bar",
+        stacked=True,
+        ax=ax,
+        color=colors,
+        edgecolor="white",
+        width=0.8,
+    )
+
+    ax.set_title(f"Sensitivity Profiles across {split_name}", fontsize=14)
+    ax.set_ylabel("Variance Contribution", fontsize=12)
+    ax.set_xlabel(f"Regions of {split_name}", fontsize=12)
+
+    ax.legend(
+        title="Inputs (Ranked by Global SI)",
+        bbox_to_anchor=(1.05, 1),
+        loc="upper left",
+    )
+
+    ax.tick_params(axis="x", labelrotation=45)
+    ax.grid(axis="y", linestyle="--", alpha=0.7)
 
-        plt.title(f"Sensitivity Profiles across {split_name}", fontsize=14)
-        plt.ylabel("Variance Contribution", fontsize=12)
-        plt.xlabel(f"Regions of {split_name}", fontsize=12)
-        plt.legend(title="Input Variables", bbox_to_anchor=(1.05, 1), loc="upper left")
-        plt.xticks(rotation=45)
-        plt.grid(axis="y", linestyle="--", alpha=0.7)
+    if plt.get_backend().lower() != "agg":
         plt.tight_layout()
-        plt.show()
 
-    return summary
+    return ax
diff --git a/src/simdec/sensitivity_indices.py b/src/simdec/sensitivity_indices.py
@@ -102,7 +102,7 @@ def sensitivity_indices(
     # Handle inputs conversion
     if isinstance(inputs, pd.DataFrame):
         var_names = inputs.columns.tolist()
-        cat_cols = inputs.select_dtypes(["category", "O"]).columns
+        cat_cols = inputs.select_dtypes(include=["category", "O", "string"]).columns
         if not cat_cols.empty:
             inputs = inputs.copy()  # Avoid SettingWithCopyWarning
             inputs[cat_cols] = inputs[cat_cols].apply(
@@ -198,8 +198,6 @@ def sensitivity_indices(
         df_si = pd.DataFrame(si, index=var_names, columns=["Combined effect"])
 
         df_indices = pd.concat([df_foe, df_soe, df_si], axis=1)
-        print(f"{'-'*69}")
-        print(df_indices)
-        print(f"{'-'*69}")
+        print(f"\n{df_indices}\n")
 
     return SensitivityAnalysisResult(si, foe, soe)
diff --git a/src/simdec/visualization.py b/src/simdec/visualization.py
@@ -10,9 +10,18 @@
 import seaborn as sns
 import pandas as pd
 from pandas.io.formats.style import Styler
+import warnings
+
+from simdec.decomposition import DecompositionResult
 
 __all__ = ["visualization", "two_output_visualization", "tableau", "palette"]
 
+try:
+    from IPython.display import display
+
+    HAS_IPYTHON = True
+except ImportError:
+    HAS_IPYTHON = False
 
 SEQUENTIAL_PALETTES = [
     "#DC267F",
@@ -140,7 +149,7 @@ def visualization(
     kind: Literal["histogram", "boxplot"] = "histogram",
     ax=None,
     print_legend: bool = False,
-    decomposition=None,
+    decomposition: DecompositionResult | None = None,
 ) -> plt.Axes:
     """Histogram plot of scenarios.
 
@@ -158,7 +167,7 @@ def visualization(
         Matplotlib axis.
     print_legend: Boolean, optional
         Prints plot legend.
-    decomposition: Object, optional
+    decomposition: DecompositionResult, optional
         Required for print_legend.
 
     Returns
@@ -194,13 +203,16 @@ def visualization(
         raise ValueError("'kind' can only be 'histogram' or 'boxplot'")
 
     if print_legend:
-        from IPython.display import display
-
-        if decomposition is None:
-            import warnings
-
+        if not HAS_IPYTHON:
+            warnings.warn(
+                "print_legend=True requires ipython to be installed. "
+                "Install it with: pip install simdec[ipython]",
+                stacklevel=2,
+            )
+        elif decomposition is None:
             warnings.warn(
-                "print_legend=True requires the decomposition object. Table skipped."
+                "print_legend=True requires the decomposition parameter. Table skipped.",
+                stacklevel=2,
             )
         else:
             try:
@@ -229,7 +241,7 @@ def two_output_visualization(
     ylim: tuple[float, float] | None = None,
     r_scatter: float = 1.0,
     print_legend: bool = False,
-    decomposition=None,
+    decomposition: DecompositionResult | None = None,
 ) -> tuple[plt.Figure, np.ndarray]:
     """Two-output visualization.
 
@@ -261,7 +273,7 @@ def two_output_visualization(
         Fraction of data points shown in the scatter plot.
     print_legend: Boolean, optional
         Prints plot legend.
-    decomposition: Object, optional
+    decomposition: DecompositionResult, optional
         Required for print_legend.
 
     Returns
@@ -322,13 +334,16 @@ def two_output_visualization(
     fig.subplots_adjust(wspace=-0.015, hspace=0)
 
     if print_legend:
-        from IPython.display import display
-
-        if decomposition is None:
-            import warnings
-
+        if not HAS_IPYTHON:
+            warnings.warn(
+                "print_legend=True requires ipython to be installed. "
+                "Install it with: pip install simdec[ipython]",
+                stacklevel=2,
+            )
+        elif decomposition is None:
             warnings.warn(
-                "print_legend=True requires the decomposition object. Table skipped."
+                "print_legend=True requires the decomposition parameter. Table skipped.",
+                stacklevel=2,
             )
         else:
             try:
diff --git a/tests/test_heterogeneity_indices.py b/tests/test_heterogeneity_indices.py
diff --git a/tests/test_visualization.py b/tests/test_visualization.py

Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,10 @@ dashboard = [`
`41`	`41`	`"cryptography",`
`42`	`42`	`]`
`43`	`43`
	`44`	`+ipython = [`
	`45`	`+ "ipython"`
	`46`	`+]`
	`47`	`+`
`44`	`48`	`test = [`
`45`	`49`	`"pytest",`
`46`	`50`	`"pytest-cov",`
`@@ -55,7 +59,7 @@ doc = [`
`55`	`59`	`]`
`56`	`60`
`57`	`61`	`dev = [`
`58`		`- "simdec[doc,test,dashboard]",`
	`62`	`+ "simdec[doc,test,dashboard, ipython]",`
`59`	`63`	`"watchfiles",`
`60`	`64`	`"pre-commit",`
`61`	`65`	`]`