From 2e57c74562c38aa0d61e2619b3d34dc33b35b146 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Tue, 3 Feb 2026 11:06:32 +0100
Subject: [PATCH 1/3] lazy-load strategies/backends and avoid optional imports

---
 kernel_tuner/core.py      | 49 +++++++++++++--------
 kernel_tuner/interface.py | 90 ++++++++++++++++++++++-----------------
 kernel_tuner/util.py      | 14 +++---
 test/test_core.py         |  2 +-
 test/test_lazy_imports.py | 47 ++++++++++++++++++++
 5 files changed, 141 insertions(+), 61 deletions(-)
 create mode 100644 test/test_lazy_imports.py

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 5352ced74..45ccfe5e2 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -7,20 +7,15 @@
 import numpy as np
 
-try:
-    import cupy as cp
-except ImportError:
-    cp = np
+def _get_cupy():
+    try:
+        import cupy as _cp
+    except ImportError:
+        return None
+    return _cp
 
 import kernel_tuner.util as util
 from kernel_tuner.accuracy import Tunable
-from kernel_tuner.backends.compiler import CompilerFunctions
-from kernel_tuner.backends.cupy import CupyFunctions
-from kernel_tuner.backends.hip import HipFunctions
-from kernel_tuner.backends.hypertuner import HypertunerFunctions
-from kernel_tuner.backends.nvcuda import CudaFunctions
-from kernel_tuner.backends.opencl import OpenCLFunctions
-from kernel_tuner.backends.pycuda import PyCudaFunctions
 from kernel_tuner.observers.nvml import NVMLObserver
 from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
 from kernel_tuner.observers.tegra import TegraObserver
@@ -35,6 +30,7 @@
 except ImportError:
     DeviceArray = Exception  # using Exception here as a type that will never be among kernel arguments
 
+
 _KernelInstance = namedtuple(
     "_KernelInstance",
     [
@@ -272,6 +268,7 @@ def __init__(
         logging.debug("DeviceInterface instantiated, lang=%s", lang)
 
         if lang.upper() == "CUDA":
+            from kernel_tuner.backends.pycuda import PyCudaFunctions
             dev = PyCudaFunctions(
                 device,
                 compiler_options=compiler_options,
@@ -279,6 +276,7 @@ def __init__(
                 observers=observers,
             )
         elif lang.upper() == "CUPY":
+            from kernel_tuner.backends.cupy import CupyFunctions
             dev = CupyFunctions(
                 device,
                 compiler_options=compiler_options,
@@ -286,6 +284,7 @@ def __init__(
                 observers=observers,
             )
         elif lang.upper() == "NVCUDA":
+            from kernel_tuner.backends.nvcuda import CudaFunctions
             dev = CudaFunctions(
                 device,
                 compiler_options=compiler_options,
@@ -293,6 +292,7 @@ def __init__(
                 observers=observers,
             )
         elif lang.upper() == "OPENCL":
+            from kernel_tuner.backends.opencl import OpenCLFunctions
             dev = OpenCLFunctions(
                 device,
                 platform,
@@ -301,6 +301,7 @@ def __init__(
                 observers=observers,
             )
         elif lang.upper() in ["C", "FORTRAN"]:
+            from kernel_tuner.backends.compiler import CompilerFunctions
             dev = CompilerFunctions(
                 compiler=compiler,
                 compiler_options=compiler_options,
@@ -308,6 +309,7 @@ def __init__(
                 observers=observers,
             )
         elif lang.upper() == "HIP":
+            from kernel_tuner.backends.hip import HipFunctions
             dev = HipFunctions(
                 device,
                 compiler_options=compiler_options,
@@ -315,6 +317,7 @@ def __init__(
                 observers=observers,
             )
         elif lang.upper() == "HYPERTUNER":
+            from kernel_tuner.backends.hypertuner import HypertunerFunctions
             dev = HypertunerFunctions(
                 iterations=iterations,
                 compiler_options=compiler_options
             )
@@ -500,7 +503,12 @@ def check_kernel_output(
             should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)]
         else:
-            should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments]
+            cp = _get_cupy()
+            cupy_ndarray = (cp.ndarray,) if cp is not None else ()
+            should_sync = [
+                isinstance(arg, (np.ndarray, torch.Tensor, DeviceArray) + cupy_ndarray)
+                for arg in instance.arguments
+            ]
 
         # re-copy original contents of output arguments to GPU memory, to overwrite any changes
         # by earlier kernel runs
@@ -516,7 +524,9 @@ def check_kernel_output(
         result_host = []
         for i, arg in enumerate(instance.arguments):
             if should_sync[i]:
-                if isinstance(arg, (np.ndarray, cp.ndarray)):
+                cp = _get_cupy()
+                cupy_ndarray = (cp.ndarray,) if cp is not None else ()
+                if isinstance(arg, (np.ndarray,) + cupy_ndarray):
                     result_host.append(np.zeros_like(arg))
                     self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
                 elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
@@ -790,8 +800,10 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
     # for each element in the argument list, check if the types match
     for i, arg in enumerate(instance.arguments):
         if answer[i] is not None:  # skip None elements in the answer list
-            if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(
-                arg, (np.ndarray, cp.ndarray)
+            cp = _get_cupy()
+            cupy_ndarray = (cp.ndarray,) if cp is not None else ()
+            if isinstance(answer[i], (np.ndarray,) + cupy_ndarray) and isinstance(
+                arg, (np.ndarray,) + cupy_ndarray
             ):
                 if not np.can_cast(arg.dtype, answer[i].dtype):
                     raise TypeError(
@@ -840,7 +852,9 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
                     )
             else:
                 # either answer[i] and argument have different types or answer[i] is not a numpy type
-                if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance(
+                cp = _get_cupy()
+                cupy_ndarray = (cp.ndarray,) if cp is not None else ()
+                if not isinstance(answer[i], (np.ndarray, torch.Tensor) + cupy_ndarray) or not isinstance(
                     answer[i], np.number
                 ):
                     raise TypeError(
@@ -865,7 +879,8 @@ def _flatten(a):
     if expected is not None:
         result = _ravel(result_host[i])
         expected = _flatten(expected)
-        if any([isinstance(array, cp.ndarray) for array in [expected, result]]):
+        cp = _get_cupy()
+        if cp is not None and any([isinstance(array, cp.ndarray) for array in [expected, result]]):
             output_test = cp.allclose(expected, result, atol=atol)
         elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor):
             output_test = torch.allclose(expected, result, atol=atol)
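
Note on the core.py changes above: the old module-level `try: import cupy as cp / except ImportError: cp = np` made numpy silently stand in for cupy so that isinstance checks kept working, but it paid the cupy import cost eagerly. `_get_cupy()` instead probes the optional dependency at call time, and callers splice the optional type into the isinstance tuple only when it is present. The idiom in isolation, as a minimal runnable sketch (the name `optional_ndarray_types` is illustrative, not part of this patch):

    import numpy as np


    def _get_cupy():
        # probe the optional dependency; None signals "not installed"
        try:
            import cupy
        except ImportError:
            return None
        return cupy


    cp = _get_cupy()
    # isinstance accepts a tuple of types, so the optional type is
    # spliced in only when cupy is actually available
    optional_ndarray_types = (cp.ndarray,) if cp is not None else ()

    x = np.zeros(4)
    print(isinstance(x, (np.ndarray,) + optional_ndarray_types))  # True with or without cupy
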
diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
index 0641eb7e1..1da17eae4 100644
--- a/kernel_tuner/interface.py
+++ b/kernel_tuner/interface.py
@@ -25,6 +25,7 @@
 """
 import logging
+import importlib
 from argparse import ArgumentParser
 from ast import literal_eval
 from datetime import datetime
@@ -48,48 +49,61 @@
 except ImportError:
     torch = util.TorchPlaceHolder()
 
-from kernel_tuner.strategies import (
-    basinhopping,
-    bayes_opt,
-    brute_force,
-    diff_evo,
-    dual_annealing,
-    firefly_algorithm,
-    genetic_algorithm,
-    greedy_ils,
-    greedy_mls,
-    minimize,
-    mls,
-    ordered_greedy_mls,
-    pso,
-    pyatf_strategies,
-    random_sample,
-    simulated_annealing,
-    skopt
-)
 from kernel_tuner.strategies.wrapper import OptAlgWrapper
 
-strategy_map = {
-    "brute_force": brute_force,
-    "random_sample": random_sample,
-    "minimize": minimize,
-    "basinhopping": basinhopping,
-    "diff_evo": diff_evo,
-    "genetic_algorithm": genetic_algorithm,
-    "greedy_mls": greedy_mls,
-    "ordered_greedy_mls": ordered_greedy_mls,
-    "greedy_ils": greedy_ils,
-    "dual_annealing": dual_annealing,
-    "mls": mls,
-    "pso": pso,
-    "simulated_annealing": simulated_annealing,
"skopt": skopt, - "firefly_algorithm": firefly_algorithm, - "bayes_opt": bayes_opt, - "pyatf_strategies": pyatf_strategies, +_STRATEGY_IMPORTS = { + "brute_force": "kernel_tuner.strategies.brute_force", + "random_sample": "kernel_tuner.strategies.random_sample", + "minimize": "kernel_tuner.strategies.minimize", + "basinhopping": "kernel_tuner.strategies.basinhopping", + "diff_evo": "kernel_tuner.strategies.diff_evo", + "genetic_algorithm": "kernel_tuner.strategies.genetic_algorithm", + "greedy_mls": "kernel_tuner.strategies.greedy_mls", + "ordered_greedy_mls": "kernel_tuner.strategies.ordered_greedy_mls", + "greedy_ils": "kernel_tuner.strategies.greedy_ils", + "dual_annealing": "kernel_tuner.strategies.dual_annealing", + "mls": "kernel_tuner.strategies.mls", + "pso": "kernel_tuner.strategies.pso", + "simulated_annealing": "kernel_tuner.strategies.simulated_annealing", + "skopt": "kernel_tuner.strategies.skopt", + "firefly_algorithm": "kernel_tuner.strategies.firefly_algorithm", + "bayes_opt": "kernel_tuner.strategies.bayes_opt", + "pyatf_strategies": "kernel_tuner.strategies.pyatf_strategies", } +def _strategy_import_error(strategy_name, module_path, err): + base_msg = ( + f"Failed to import strategy '{strategy_name}' from '{module_path}'. " + "This strategy may require optional dependencies that are not installed." + ) + return ImportError(f"{base_msg} Original error: {err}") + + +class _LazyStrategyModule: + def __init__(self, name, module_path): + self._name = name + self._module_path = module_path + self._module = None + + def _load(self): + if self._module is None: + try: + self._module = importlib.import_module(self._module_path) + except ImportError as err: + raise _strategy_import_error(self._name, self._module_path, err) + return self._module + + def __getattr__(self, attr): + return getattr(self._load(), attr) + + def __repr__(self): + return f"" + + +strategy_map = {name: _LazyStrategyModule(name, path) for name, path in _STRATEGY_IMPORTS.items()} + + class Options(dict): """read-only class for passing options around.""" @@ -651,7 +665,7 @@ def tune_kernel( tuning_options.strategy_options = Options(strategy_options or {}) # if no strategy selected else: - strategy = brute_force + strategy = strategy_map["brute_force"] # select the runner for this job based on input selected_runner = SimulationRunner if simulation_mode else SequentialRunner diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 2c50bd6cc..a3e2e8c21 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -34,10 +34,12 @@ from kernel_tuner.accuracy import Tunable -try: - import cupy as cp -except ImportError: - cp = np +def _get_cupy(): + try: + import cupy as _cp + except ImportError: + return None + return _cp from kernel_tuner.observers.nvml import NVMLObserver @@ -132,6 +134,8 @@ def check_argument_type(dtype, kernel_argument): def check_argument_list(kernel_name, kernel_string, args): """Raise an exception if kernel arguments do not match host arguments.""" + cp = _get_cupy() + cupy_ndarray = (cp.ndarray,) if cp is not None else () kernel_arguments = list() collected_errors = list() @@ -155,7 +159,7 @@ def check_argument_list(kernel_name, kernel_string, args): continue # Handle numpy arrays and other array types - if not isinstance(arg, (np.ndarray, np.generic, cp.ndarray, torch.Tensor, DeviceArray)): + if not isinstance(arg, (np.ndarray, np.generic, torch.Tensor, DeviceArray) + cupy_ndarray): raise TypeError( f"Argument at position {i} of type: {type(arg)} should be of type " "np.ndarray, 
diff --git a/test/test_core.py b/test/test_core.py
index 39597b86c..aefe70cf8 100644
--- a/test/test_core.py
+++ b/test/test_core.py
@@ -93,7 +93,7 @@ def test_default_verify_function(env):
     assert True
 
 
-@patch('kernel_tuner.core.PyCudaFunctions')
+@patch('kernel_tuner.backends.pycuda.PyCudaFunctions')
 def test_check_kernel_output(dev_func_interface):
     dev_func_interface.configure_mock(**mock_config)
 
diff --git a/test/test_lazy_imports.py b/test/test_lazy_imports.py
new file mode 100644
index 000000000..7faf2c716
--- /dev/null
+++ b/test/test_lazy_imports.py
@@ -0,0 +1,47 @@
+import json
+import subprocess
+import sys
+
+
+def _run_isolated(code):
+    result = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True, check=True)
+    return result.stdout.strip()
+
+
+def test_import_kernel_tuner_does_not_import_optional_deps():
+    optional_modules = {"pycuda", "pyopencl", "cupy", "hip", "skopt", "sklearn", "pyatf"}
+    code = f"""
+import json, sys
+optional = {sorted(optional_modules)!r}
+import kernel_tuner  # noqa: F401
+loaded = sorted(set(optional) & set(sys.modules))
+print(json.dumps(loaded))
+"""
+    loaded = json.loads(_run_isolated(code))
+    assert loaded == []
+
+
+def test_strategy_modules_loaded_on_demand():
+    code = """
+import json, sys
+import kernel_tuner.interface as interface
+before = "kernel_tuner.strategies.brute_force" in sys.modules
+_ = interface.strategy_map["brute_force"].tune
+after = "kernel_tuner.strategies.brute_force" in sys.modules
+print(json.dumps({"before": before, "after": after}))
+"""
+    result = json.loads(_run_isolated(code))
+    assert result["before"] is False
+    assert result["after"] is True
+
+
+def test_backend_modules_not_loaded_on_import():
+    code = """
+import json, sys
+import kernel_tuner.core as core  # noqa: F401
+mods = ["kernel_tuner.backends.pycuda", "kernel_tuner.backends.opencl"]
+loaded = [m for m in mods if m in sys.modules]
+print(json.dumps(loaded))
+"""
+    loaded = json.loads(_run_isolated(code))
+    assert loaded == []
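
The heart of PATCH 1 is the import proxy behind `strategy_map`: the dict keeps its public shape, but each value only triggers `importlib.import_module` on first attribute access, so `import kernel_tuner` no longer pays for optional strategy dependencies such as scikit-optimize. The mechanism in isolation, as a minimal runnable sketch (the `LazyModule` name and the `decimal` example are illustrative, not taken from the patch):

    import importlib
    import sys


    class LazyModule:
        """Stand-in for a module that imports itself on first attribute access."""

        def __init__(self, module_path):
            self._module_path = module_path
            self._module = None

        def __getattr__(self, attr):
            # only called when normal attribute lookup fails,
            # i.e. for anything not set in __init__
            if self._module is None:
                self._module = importlib.import_module(self._module_path)
            return getattr(self._module, attr)


    lazy = LazyModule("decimal")
    print("decimal" in sys.modules)   # False in a fresh interpreter: nothing imported yet
    print(lazy.Decimal("1.5") * 2)    # first access triggers the import
    print("decimal" in sys.modules)   # True

A caveat worth knowing: `__getattr__` is consulted only for missing attributes, so the proxy's own `_module_path` and `_module` (set in `__init__`) never recurse into the loader.
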
From ae751c7fce72c0bb4fdeeb9c9db31aa4deeda35c Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Tue, 3 Feb 2026 11:35:36 +0100
Subject: [PATCH 2/3] lazy load integration and nvml

---
 kernel_tuner/__init__.py    | 21 ++++++++++++++++++---
 kernel_tuner/core.py        |  7 +++++--
 kernel_tuner/integration.py | 30 +-----------------------------
 kernel_tuner/interface.py   |  2 +-
 kernel_tuner/util.py        | 33 +++++++++++++++++++++++++++++++--
 5 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/kernel_tuner/__init__.py b/kernel_tuner/__init__.py
index 40b88d463..3e5e8c6c2 100644
--- a/kernel_tuner/__init__.py
+++ b/kernel_tuner/__init__.py
@@ -1,6 +1,21 @@
-from kernel_tuner.integration import store_results, create_device_targets
-from kernel_tuner.interface import tune_kernel, tune_kernel_T1, run_kernel
-
 from importlib.metadata import version
 
+from kernel_tuner.interface import run_kernel, tune_kernel, tune_kernel_T1
+
 __version__ = version(__package__)
+
+__all__ = [
+    "create_device_targets",
+    "run_kernel",
+    "store_results",
+    "tune_kernel",
+    "tune_kernel_T1",
+    "__version__",
+]
+
+
+def __getattr__(name):
+    if name in ("store_results", "create_device_targets"):
+        from kernel_tuner import integration
+        return getattr(integration, name)
+    raise AttributeError(f"module 'kernel_tuner' has no attribute {name!r}")
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 45ccfe5e2..d47c1d1f3 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -16,7 +16,6 @@ def _get_cupy():
 
 import kernel_tuner.util as util
 from kernel_tuner.accuracy import Tunable
-from kernel_tuner.observers.nvml import NVMLObserver
 from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
 from kernel_tuner.observers.tegra import TegraObserver
 
@@ -336,8 +335,12 @@ def __init__(
         self.output_observers = []
         self.prologue_observers = []
         if observers:
+            try:
+                from kernel_tuner.observers.nvml import NVMLObserver as _NVMLObserver
+            except ImportError:
+                _NVMLObserver = None
             for obs in observers:
-                if isinstance(obs, NVMLObserver):
+                if _NVMLObserver is not None and isinstance(obs, _NVMLObserver):
                     self.nvml = obs.nvml
                     self.use_nvml = True
                 if isinstance(obs, TegraObserver):
diff --git a/kernel_tuner/integration.py b/kernel_tuner/integration.py
index b51a3eb36..d9fe8a0c7 100644
--- a/kernel_tuner/integration.py
+++ b/kernel_tuner/integration.py
@@ -4,35 +4,7 @@
 
 from jsonschema import validate
 
-from kernel_tuner.util import get_instance_string, looks_like_a_filename, read_file
-
-#specifies for a number of pre-defined objectives whether
-#the objective should be minimized or maximized (boolean value denotes higher is better)
-objective_default_map = {
-    "time": False,
-    "energy": False,
-    "fitness": True,
-    "cost": False,
-    "loss": False,
-    "GFLOP/s": True,
-    "TFLOP/s": True,
-    "GB/s": True,
-    "TB/s": True,
-    "GFLOPS/W": True,
-    "TFLOPS/W": True,
-    "GFLOP/J": True,
-    "TFLOP/J": True
-}
-
-def get_objective_defaults(objective, objective_higher_is_better):
-    """ Uses time as default objective and attempts to lookup objective_higher_is_better for known objectives """
-    objective = objective or "time"
-    if objective_higher_is_better is None:
-        if objective in objective_default_map:
-            objective_higher_is_better = objective_default_map[objective]
-        else:
-            raise ValueError(f"Please specify objective_higher_is_better for objective {objective}")
-    return objective, objective_higher_is_better
+from kernel_tuner.util import get_instance_string, looks_like_a_filename, read_file, get_objective_defaults
 
 schema_v1_0 = {
     "$schema": "https://json-schema.org/draft-07/schema#",
diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
index 1da17eae4..cfccf9ba7 100644
--- a/kernel_tuner/interface.py
+++ b/kernel_tuner/interface.py
@@ -39,7 +39,7 @@
 import kernel_tuner.core as core
 import kernel_tuner.util as util
 from kernel_tuner.file_utils import get_input_file, get_t4_metadata, get_t4_results, import_class_from_file
-from kernel_tuner.integration import get_objective_defaults
+from kernel_tuner.util import get_objective_defaults
 from kernel_tuner.runners.sequential import SequentialRunner
 from kernel_tuner.runners.simulation import SimulationRunner
 from kernel_tuner.searchspace import Searchspace
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index a3e2e8c21..11141395f 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -41,8 +41,6 @@ def _get_cupy():
         return None
     return _cp
 
-from kernel_tuner.observers.nvml import NVMLObserver
-
 
 # number of special values to insert when a configuration cannot be measured
@@ -223,6 +221,7 @@ def check_tune_params_list(tune_params, observers, simulation_mode=False):
         if name in forbidden_names:
             raise ValueError("Tune parameter " + name + " with value " + str(param) + " has a forbidden name!")
     if any("nvml_" in param for param in tune_params):
+        from kernel_tuner.observers.nvml import NVMLObserver
         if not simulation_mode and (not observers or not any(isinstance(obs, NVMLObserver) for obs in observers)):
             raise ValueError("Tune parameters starting with nvml_ require an NVMLObserver!")
 
@@ -431,6 +430,36 @@ def get_best_config(results, objective, objective_higher_is_better=False):
     return best_config
 
 
+# specifies for a number of pre-defined objectives whether
+# the objective should be minimized or maximized (boolean value denotes higher is better)
+objective_default_map = {
+    "time": False,
+    "energy": False,
+    "fitness": True,
+    "cost": False,
+    "loss": False,
+    "GFLOP/s": True,
+    "TFLOP/s": True,
+    "GB/s": True,
+    "TB/s": True,
+    "GFLOPS/W": True,
+    "TFLOPS/W": True,
+    "GFLOP/J": True,
+    "TFLOP/J": True,
+}
+
+
+def get_objective_defaults(objective, objective_higher_is_better):
+    """Use time as default objective and infer objective_higher_is_better for known objectives."""
+    objective = objective or "time"
+    if objective_higher_is_better is None:
+        if objective in objective_default_map:
+            objective_higher_is_better = objective_default_map[objective]
+        else:
+            raise ValueError(f"Please specify objective_higher_is_better for objective {objective}")
+    return objective, objective_higher_is_better
+
+
 def get_config_string(params, keys=None, units=None):
     """Return a compact string representation of a measurement."""
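
PATCH 2 leans on PEP 562 (module-level `__getattr__`, Python 3.7+): `store_results` and `create_device_targets` stay importable from the package root, but `kernel_tuner.integration`, and with it the `jsonschema` dependency, loads only when one of them is actually used. A minimal sketch of the mechanism for a hypothetical package `mypkg` with a heavyweight submodule (all names here are illustrative):

    # mypkg/__init__.py
    __all__ = ["expensive_function"]


    def __getattr__(name):
        # invoked only when `name` is not found as a regular module attribute
        if name == "expensive_function":
            from mypkg import heavy  # deferred import of the heavy submodule
            return heavy.expensive_function
        raise AttributeError(f"module 'mypkg' has no attribute {name!r}")

Note that `from mypkg import expensive_function` still works: when the name is not a plain attribute of the module, the import machinery falls back to the module-level `__getattr__`.
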
From f11d62c2c95b6231603ca5330314455f3f37e47f Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Tue, 3 Feb 2026 11:41:54 +0100
Subject: [PATCH 3/3] reduce scipy import overhead

---
 kernel_tuner/searchspace.py       | 2 +-
 kernel_tuner/strategies/common.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/searchspace.py b/kernel_tuner/searchspace.py
index 5be3aabe8..8958902c2 100644
--- a/kernel_tuner/searchspace.py
+++ b/kernel_tuner/searchspace.py
@@ -10,7 +10,6 @@
 from inspect import signature
 
 import numpy as np
-from scipy.stats.qmc import LatinHypercube
 from constraint import (
     BacktrackingSolver,
     Constraint,
@@ -1333,6 +1332,7 @@ def get_distributed_random_sample(self, num_samples: int, sampling_factor=10) ->
 
     def get_LHS_sample_indices(self, num_samples: int) -> List[int]:
         """Get a Latin Hypercube sample of parameter configuration indices."""
+        from scipy.stats.qmc import LatinHypercube
         if num_samples > self.size:
             warn(
                 f"Too many samples requested ({num_samples}), reducing the number of samples to half of the searchspace size ({self.size})"
diff --git a/kernel_tuner/strategies/common.py b/kernel_tuner/strategies/common.py
index b51274ceb..e1f4b7ef0 100644
--- a/kernel_tuner/strategies/common.py
+++ b/kernel_tuner/strategies/common.py
@@ -5,7 +5,6 @@
 from time import perf_counter
 
 import numpy as np
-from scipy.spatial import distance
 
 from kernel_tuner import util
 from kernel_tuner.searchspace import Searchspace
@@ -325,6 +324,7 @@ def unscale_and_snap_to_nearest_valid(x, params, searchspace, eps):
 
     if neighbors:
         # sort on distance to x
+        from scipy.spatial import distance
         neighbors.sort(key=lambda y: distance.euclidean(x,scale_from_params(y, searchspace.tune_params, eps)))
 
         # return closest valid neighbor
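
The scipy deferrals in PATCH 3 target two rarely taken code paths (Latin Hypercube sampling and snapping scaled parameters to valid configurations), so the common `import kernel_tuner` path should no longer touch scipy at all. One quick way to confirm is CPython's built-in import profiler:

    python -X importtime -c "import kernel_tuner" 2>&1 | grep scipy

or, in the same spirit as test_lazy_imports.py, an isolated subprocess check (a sketch; it assumes all three patches are applied and nothing else in the environment pulls scipy in, in which case the reported list should be empty):

    import json
    import subprocess
    import sys
    import textwrap

    code = textwrap.dedent("""
        import json, sys
        import kernel_tuner  # noqa: F401
        print(json.dumps([m for m in ("scipy", "scipy.spatial", "scipy.stats") if m in sys.modules]))
    """)
    out = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True, check=True)
    print("scipy modules loaded at import time:", json.loads(out.stdout) or "none")
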