21 changes: 18 additions & 3 deletions kernel_tuner/__init__.py
@@ -1,6 +1,21 @@
from kernel_tuner.integration import store_results, create_device_targets
from kernel_tuner.interface import tune_kernel, tune_kernel_T1, run_kernel

from importlib.metadata import version

from kernel_tuner.interface import run_kernel, tune_kernel, tune_kernel_T1

__version__ = version(__package__)

__all__ = [
    "create_device_targets",
    "run_kernel",
    "store_results",
    "tune_kernel",
    "tune_kernel_T1",
    "__version__",
]


def __getattr__(name):
    if name in ("store_results", "create_device_targets"):
        from kernel_tuner import integration
        return getattr(integration, name)
    raise AttributeError(f"module 'kernel_tuner' has no attribute {name!r}")
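A minimal sketch (not part of the diff) of how the module-level __getattr__ above behaves at runtime. The sys.modules check is illustrative and assumes no other eagerly imported module pulls in kernel_tuner.integration first:

import sys

import kernel_tuner

# importing the package should no longer load kernel_tuner.integration eagerly
print("kernel_tuner.integration" in sys.modules)   # expected: False

# attribute access goes through __getattr__, which imports integration on demand
store_results = kernel_tuner.store_results
print("kernel_tuner.integration" in sys.modules)   # expected: True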
56 changes: 37 additions & 19 deletions kernel_tuner/core.py
@@ -7,21 +7,15 @@

import numpy as np

try:
import cupy as cp
except ImportError:
cp = np
def _get_cupy():
    try:
        import cupy as _cp
    except ImportError:
        return None
    return _cp

import kernel_tuner.util as util
from kernel_tuner.accuracy import Tunable
from kernel_tuner.backends.compiler import CompilerFunctions
from kernel_tuner.backends.cupy import CupyFunctions
from kernel_tuner.backends.hip import HipFunctions
from kernel_tuner.backends.hypertuner import HypertunerFunctions
from kernel_tuner.backends.nvcuda import CudaFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.pycuda import PyCudaFunctions
from kernel_tuner.observers.nvml import NVMLObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
from kernel_tuner.observers.tegra import TegraObserver

@@ -35,6 +29,7 @@
except ImportError:
DeviceArray = Exception # using Exception here as a type that will never be among kernel arguments


_KernelInstance = namedtuple(
"_KernelInstance",
[
@@ -272,27 +267,31 @@ def __init__(
logging.debug("DeviceInterface instantiated, lang=%s", lang)

if lang.upper() == "CUDA":
from kernel_tuner.backends.pycuda import PyCudaFunctions
dev = PyCudaFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "CUPY":
from kernel_tuner.backends.cupy import CupyFunctions
dev = CupyFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "NVCUDA":
from kernel_tuner.backends.nvcuda import CudaFunctions
dev = CudaFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "OPENCL":
from kernel_tuner.backends.opencl import OpenCLFunctions
dev = OpenCLFunctions(
device,
platform,
@@ -301,20 +300,23 @@
observers=observers,
)
elif lang.upper() in ["C", "FORTRAN"]:
from kernel_tuner.backends.compiler import CompilerFunctions
dev = CompilerFunctions(
compiler=compiler,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "HIP":
from kernel_tuner.backends.hip import HipFunctions
dev = HipFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "HYPERTUNER":
from kernel_tuner.backends.hypertuner import HypertunerFunctions
dev = HypertunerFunctions(
iterations=iterations,
compiler_options=compiler_options
@@ -333,8 +335,12 @@ def __init__(
self.output_observers = []
self.prologue_observers = []
if observers:
try:
from kernel_tuner.observers.nvml import NVMLObserver as _NVMLObserver
except ImportError:
_NVMLObserver = None
for obs in observers:
if isinstance(obs, NVMLObserver):
if _NVMLObserver is not None and isinstance(obs, _NVMLObserver):
self.nvml = obs.nvml
self.use_nvml = True
if isinstance(obs, TegraObserver):
@@ -500,7 +506,12 @@ def check_kernel_output(

should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)]
else:
should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments]
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
should_sync = [
isinstance(arg, (np.ndarray, torch.Tensor, DeviceArray) + cupy_ndarray)
for arg in instance.arguments
]

# re-copy original contents of output arguments to GPU memory, to overwrite any changes
# by earlier kernel runs
@@ -516,7 +527,9 @@ def check_kernel_output(
result_host = []
for i, arg in enumerate(instance.arguments):
if should_sync[i]:
if isinstance(arg, (np.ndarray, cp.ndarray)):
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
if isinstance(arg, (np.ndarray,) + cupy_ndarray):
result_host.append(np.zeros_like(arg))
self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
@@ -790,8 +803,10 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
# for each element in the argument list, check if the types match
for i, arg in enumerate(instance.arguments):
if answer[i] is not None: # skip None elements in the answer list
if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(
arg, (np.ndarray, cp.ndarray)
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
if isinstance(answer[i], (np.ndarray,) + cupy_ndarray) and isinstance(
arg, (np.ndarray,) + cupy_ndarray
):
if not np.can_cast(arg.dtype, answer[i].dtype):
raise TypeError(
@@ -840,7 +855,9 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
)
else:
# either answer[i] and argument have different types or answer[i] is not a numpy type
if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance(
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
if not isinstance(answer[i], (np.ndarray, torch.Tensor) + cupy_ndarray) or not isinstance(
answer[i], np.number
):
raise TypeError(
@@ -865,7 +882,8 @@ def _flatten(a):
if expected is not None:
result = _ravel(result_host[i])
expected = _flatten(expected)
if any([isinstance(array, cp.ndarray) for array in [expected, result]]):
cp = _get_cupy()
if cp is not None and any([isinstance(array, cp.ndarray) for array in [expected, result]]):
output_test = cp.allclose(expected, result, atol=atol)
elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor):
output_test = torch.allclose(expected, result, atol=atol)
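A self-contained sketch of the optional-cupy pattern used throughout this file: the helper returns None when cupy is missing, and call sites extend the isinstance() tuple only when a cupy module is actually available. The helper mirrors the diff; the usage lines are illustrative. Repeated calls stay cheap because Python caches a successfully imported module in sys.modules.

import numpy as np

def _get_cupy():
    try:
        import cupy as _cp
    except ImportError:
        return None
    return _cp

cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()

arg = np.zeros(4)
# without cupy installed this reduces to isinstance(arg, (np.ndarray,)),
# so CPU-only installations never need the GPU dependency
print(isinstance(arg, (np.ndarray,) + cupy_ndarray))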
30 changes: 1 addition & 29 deletions kernel_tuner/integration.py
@@ -4,35 +4,7 @@

from jsonschema import validate

from kernel_tuner.util import get_instance_string, looks_like_a_filename, read_file

#specifies for a number of pre-defined objectives whether
#the objective should be minimized or maximized (boolean value denotes higher is better)
objective_default_map = {
"time": False,
"energy": False,
"fitness": True,
"cost": False,
"loss": False,
"GFLOP/s": True,
"TFLOP/s": True,
"GB/s": True,
"TB/s": True,
"GFLOPS/W": True,
"TFLOPS/W": True,
"GFLOP/J": True,
"TFLOP/J": True
}

def get_objective_defaults(objective, objective_higher_is_better):
""" Uses time as default objective and attempts to lookup objective_higher_is_better for known objectives """
objective = objective or "time"
if objective_higher_is_better is None:
if objective in objective_default_map:
objective_higher_is_better = objective_default_map[objective]
else:
raise ValueError(f"Please specify objective_higher_is_better for objective {objective}")
return objective, objective_higher_is_better
from kernel_tuner.util import get_instance_string, looks_like_a_filename, read_file, get_objective_defaults

schema_v1_0 = {
"$schema": "https://json-schema.org/draft-07/schema#",
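Usage sketch for the relocated helper, assuming kernel_tuner.util.get_objective_defaults keeps the signature and defaults of the function removed from this file:

from kernel_tuner.util import get_objective_defaults

# objective defaults to "time", which is minimized
print(get_objective_defaults(None, None))        # expected: ('time', False)

# known throughput objectives map to higher-is-better
print(get_objective_defaults("GFLOP/s", None))   # expected: ('GFLOP/s', True)

# an unknown objective with no explicit direction should raise ValueError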
92 changes: 53 additions & 39 deletions kernel_tuner/interface.py
@@ -25,6 +25,7 @@
"""

import logging
import importlib
from argparse import ArgumentParser
from ast import literal_eval
from datetime import datetime
@@ -38,7 +39,7 @@
import kernel_tuner.core as core
import kernel_tuner.util as util
from kernel_tuner.file_utils import get_input_file, get_t4_metadata, get_t4_results, import_class_from_file
from kernel_tuner.integration import get_objective_defaults
from kernel_tuner.util import get_objective_defaults
from kernel_tuner.runners.sequential import SequentialRunner
from kernel_tuner.runners.simulation import SimulationRunner
from kernel_tuner.searchspace import Searchspace
@@ -48,48 +49,61 @@
except ImportError:
torch = util.TorchPlaceHolder()

from kernel_tuner.strategies import (
basinhopping,
bayes_opt,
brute_force,
diff_evo,
dual_annealing,
firefly_algorithm,
genetic_algorithm,
greedy_ils,
greedy_mls,
minimize,
mls,
ordered_greedy_mls,
pso,
pyatf_strategies,
random_sample,
simulated_annealing,
skopt
)
from kernel_tuner.strategies.wrapper import OptAlgWrapper

strategy_map = {
"brute_force": brute_force,
"random_sample": random_sample,
"minimize": minimize,
"basinhopping": basinhopping,
"diff_evo": diff_evo,
"genetic_algorithm": genetic_algorithm,
"greedy_mls": greedy_mls,
"ordered_greedy_mls": ordered_greedy_mls,
"greedy_ils": greedy_ils,
"dual_annealing": dual_annealing,
"mls": mls,
"pso": pso,
"simulated_annealing": simulated_annealing,
"skopt": skopt,
"firefly_algorithm": firefly_algorithm,
"bayes_opt": bayes_opt,
"pyatf_strategies": pyatf_strategies,
_STRATEGY_IMPORTS = {
"brute_force": "kernel_tuner.strategies.brute_force",
"random_sample": "kernel_tuner.strategies.random_sample",
"minimize": "kernel_tuner.strategies.minimize",
"basinhopping": "kernel_tuner.strategies.basinhopping",
"diff_evo": "kernel_tuner.strategies.diff_evo",
"genetic_algorithm": "kernel_tuner.strategies.genetic_algorithm",
"greedy_mls": "kernel_tuner.strategies.greedy_mls",
"ordered_greedy_mls": "kernel_tuner.strategies.ordered_greedy_mls",
"greedy_ils": "kernel_tuner.strategies.greedy_ils",
"dual_annealing": "kernel_tuner.strategies.dual_annealing",
"mls": "kernel_tuner.strategies.mls",
"pso": "kernel_tuner.strategies.pso",
"simulated_annealing": "kernel_tuner.strategies.simulated_annealing",
"skopt": "kernel_tuner.strategies.skopt",
"firefly_algorithm": "kernel_tuner.strategies.firefly_algorithm",
"bayes_opt": "kernel_tuner.strategies.bayes_opt",
"pyatf_strategies": "kernel_tuner.strategies.pyatf_strategies",
}


def _strategy_import_error(strategy_name, module_path, err):
    base_msg = (
        f"Failed to import strategy '{strategy_name}' from '{module_path}'. "
        "This strategy may require optional dependencies that are not installed."
    )
    return ImportError(f"{base_msg} Original error: {err}")


class _LazyStrategyModule:
    def __init__(self, name, module_path):
        self._name = name
        self._module_path = module_path
        self._module = None

    def _load(self):
        if self._module is None:
            try:
                self._module = importlib.import_module(self._module_path)
            except ImportError as err:
                raise _strategy_import_error(self._name, self._module_path, err)
        return self._module

    def __getattr__(self, attr):
        return getattr(self._load(), attr)

    def __repr__(self):
        return f"<lazy strategy module '{self._name}'>"


strategy_map = {name: _LazyStrategyModule(name, path) for name, path in _STRATEGY_IMPORTS.items()}


class Options(dict):
"""read-only class for passing options around."""

@@ -651,7 +665,7 @@ def tune_kernel(
tuning_options.strategy_options = Options(strategy_options or {})
# if no strategy selected
else:
strategy = brute_force
strategy = strategy_map["brute_force"]

# select the runner for this job based on input
selected_runner = SimulationRunner if simulation_mode else SequentialRunner
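The lazy-module wrapper in isolation, as a small runnable sketch using only the standard library; _LazyStrategyModule above follows the same shape, with the extra error message added by _strategy_import_error. The payoff is that importing kernel_tuner no longer fails when the optional dependencies of a single strategy are missing; an ImportError only surfaces if that strategy is actually used.

import importlib

class LazyModule:
    """Defer importing a module until one of its attributes is first accessed."""

    def __init__(self, module_path):
        self._module_path = module_path
        self._module = None

    def __getattr__(self, attr):
        if self._module is None:
            # the real import only happens here, on first attribute access
            self._module = importlib.import_module(self._module_path)
        return getattr(self._module, attr)

lazy_json = LazyModule("json")          # nothing imported yet
print(lazy_json.dumps({"answer": 42}))  # importing 'json' is triggered by this call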
2 changes: 1 addition & 1 deletion kernel_tuner/searchspace.py
@@ -10,7 +10,6 @@
from inspect import signature

import numpy as np
from scipy.stats.qmc import LatinHypercube
from constraint import (
BacktrackingSolver,
Constraint,
@@ -1333,6 +1332,7 @@ def get_distributed_random_sample(self, num_samples: int, sampling_factor=10) ->

def get_LHS_sample_indices(self, num_samples: int) -> List[int]:
"""Get a Latin Hypercube sample of parameter configuration indices."""
from scipy.stats.qmc import LatinHypercube
if num_samples > self.size:
warn(
f"Too many samples requested ({num_samples}), reducing the number of samples to half of the searchspace size ({self.size})"
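For context, a hedged sketch of how scipy's LatinHypercube can be mapped to configuration indices; it illustrates the import that is now deferred into get_LHS_sample_indices, not the exact computation that method performs:

from scipy.stats.qmc import LatinHypercube  # only needed when LHS sampling is requested

searchspace_size = 1000
num_samples = 8

sampler = LatinHypercube(d=1)
# draw stratified samples in [0, 1) and scale them to integer indices
indices = (sampler.random(n=num_samples).ravel() * searchspace_size).astype(int)
print(sorted(indices.tolist()))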
2 changes: 1 addition & 1 deletion kernel_tuner/strategies/common.py
@@ -5,7 +5,6 @@
from time import perf_counter

import numpy as np
from scipy.spatial import distance

from kernel_tuner import util
from kernel_tuner.searchspace import Searchspace
@@ -325,6 +324,7 @@ def unscale_and_snap_to_nearest_valid(x, params, searchspace, eps):

if neighbors:
# sort on distance to x
from scipy.spatial import distance
neighbors.sort(key=lambda y: distance.euclidean(x,scale_from_params(y, searchspace.tune_params, eps)))

# return closest valid neighbor
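The same deferred-import idea in standalone form: scipy.spatial.distance is imported only when the sorting code actually runs, so the module itself can be imported without scipy installed. The helper name and data below are illustrative, not part of the diff:

def sort_by_distance_to(x, neighbors):
    # importing inside the function keeps scipy an optional, on-demand dependency
    from scipy.spatial import distance
    return sorted(neighbors, key=lambda y: distance.euclidean(x, y))

print(sort_by_distance_to([0.0, 0.0], [[1.0, 1.0], [0.2, 0.1], [0.5, 0.5]]))
# closest neighbors first: [[0.2, 0.1], [0.5, 0.5], [1.0, 1.0]]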