21 changes: 18 additions & 3 deletions kernel_tuner/__init__.py
@@ -1,6 +1,21 @@
from kernel_tuner.integration import store_results, create_device_targets
from kernel_tuner.interface import tune_kernel, tune_kernel_T1, run_kernel

from importlib.metadata import version

from kernel_tuner.interface import run_kernel, tune_kernel, tune_kernel_T1

__version__ = version(__package__)

__all__ = [
    "create_device_targets",
    "run_kernel",
    "store_results",
    "tune_kernel",
    "tune_kernel_T1",
    "__version__",
]


def __getattr__(name):
    if name in ("store_results", "create_device_targets"):
        from kernel_tuner import integration
        return getattr(integration, name)
    raise AttributeError(f"module 'kernel_tuner' has no attribute {name!r}")
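A minimal sketch (not part of the diff) of how the module-level __getattr__ above behaves at runtime. The sys.modules check is illustrative and assumes no other eagerly imported module pulls in kernel_tuner.integration first:

import sys

import kernel_tuner

# importing the package should no longer load kernel_tuner.integration eagerly
print("kernel_tuner.integration" in sys.modules)   # expected: False

# attribute access goes through __getattr__, which imports integration on demand
store_results = kernel_tuner.store_results
print("kernel_tuner.integration" in sys.modules)   # expected: True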
56 changes: 37 additions & 19 deletions kernel_tuner/core.py
@@ -7,21 +7,15 @@

import numpy as np

try:
import cupy as cp
except ImportError:
cp = np
def _get_cupy():
    try:
        import cupy as _cp
    except ImportError:
        return None
    return _cp

import kernel_tuner.util as util
from kernel_tuner.accuracy import Tunable
from kernel_tuner.backends.compiler import CompilerFunctions
from kernel_tuner.backends.cupy import CupyFunctions
from kernel_tuner.backends.hip import HipFunctions
from kernel_tuner.backends.hypertuner import HypertunerFunctions
from kernel_tuner.backends.nvcuda import CudaFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.pycuda import PyCudaFunctions
from kernel_tuner.observers.nvml import NVMLObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver
from kernel_tuner.observers.tegra import TegraObserver

@@ -35,6 +29,7 @@
except ImportError:
DeviceArray = Exception # using Exception here as a type that will never be among kernel arguments


_KernelInstance = namedtuple(
"_KernelInstance",
[
@@ -272,27 +267,31 @@ def __init__(
logging.debug("DeviceInterface instantiated, lang=%s", lang)

if lang.upper() == "CUDA":
from kernel_tuner.backends.pycuda import PyCudaFunctions
dev = PyCudaFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "CUPY":
from kernel_tuner.backends.cupy import CupyFunctions
dev = CupyFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "NVCUDA":
from kernel_tuner.backends.nvcuda import CudaFunctions
dev = CudaFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "OPENCL":
from kernel_tuner.backends.opencl import OpenCLFunctions
dev = OpenCLFunctions(
device,
platform,
@@ -301,20 +300,23 @@
observers=observers,
)
elif lang.upper() in ["C", "FORTRAN"]:
from kernel_tuner.backends.compiler import CompilerFunctions
dev = CompilerFunctions(
compiler=compiler,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "HIP":
from kernel_tuner.backends.hip import HipFunctions
dev = HipFunctions(
device,
compiler_options=compiler_options,
iterations=iterations,
observers=observers,
)
elif lang.upper() == "HYPERTUNER":
from kernel_tuner.backends.hypertuner import HypertunerFunctions
dev = HypertunerFunctions(
iterations=iterations,
compiler_options=compiler_options
@@ -333,8 +335,12 @@ def __init__(
self.output_observers = []
self.prologue_observers = []
if observers:
try:
from kernel_tuner.observers.nvml import NVMLObserver as _NVMLObserver
except ImportError:
_NVMLObserver = None
for obs in observers:
if isinstance(obs, NVMLObserver):
if _NVMLObserver is not None and isinstance(obs, _NVMLObserver):
self.nvml = obs.nvml
self.use_nvml = True
if isinstance(obs, TegraObserver):
@@ -500,7 +506,12 @@ def check_kernel_output(

should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)]
else:
should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor, DeviceArray)) for arg in instance.arguments]
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
should_sync = [
isinstance(arg, (np.ndarray, torch.Tensor, DeviceArray) + cupy_ndarray)
for arg in instance.arguments
]

# re-copy original contents of output arguments to GPU memory, to overwrite any changes
# by earlier kernel runs
@@ -516,7 +527,9 @@ def check_kernel_output(
result_host = []
for i, arg in enumerate(instance.arguments):
if should_sync[i]:
if isinstance(arg, (np.ndarray, cp.ndarray)):
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
if isinstance(arg, (np.ndarray,) + cupy_ndarray):
result_host.append(np.zeros_like(arg))
self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor):
@@ -790,8 +803,10 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
# for each element in the argument list, check if the types match
for i, arg in enumerate(instance.arguments):
if answer[i] is not None: # skip None elements in the answer list
if isinstance(answer[i], (np.ndarray, cp.ndarray)) and isinstance(
arg, (np.ndarray, cp.ndarray)
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
if isinstance(answer[i], (np.ndarray,) + cupy_ndarray) and isinstance(
arg, (np.ndarray,) + cupy_ndarray
):
if not np.can_cast(arg.dtype, answer[i].dtype):
raise TypeError(
@@ -840,7 +855,9 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
)
else:
# either answer[i] and argument have different types or answer[i] is not a numpy type
if not isinstance(answer[i], (np.ndarray, cp.ndarray, torch.Tensor)) or not isinstance(
cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()
if not isinstance(answer[i], (np.ndarray, torch.Tensor) + cupy_ndarray) or not isinstance(
answer[i], np.number
):
raise TypeError(
@@ -865,7 +882,8 @@ def _flatten(a):
if expected is not None:
result = _ravel(result_host[i])
expected = _flatten(expected)
if any([isinstance(array, cp.ndarray) for array in [expected, result]]):
cp = _get_cupy()
if cp is not None and any([isinstance(array, cp.ndarray) for array in [expected, result]]):
output_test = cp.allclose(expected, result, atol=atol)
elif isinstance(expected, torch.Tensor) and isinstance(result, torch.Tensor):
output_test = torch.allclose(expected, result, atol=atol)
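A self-contained sketch of the optional-cupy pattern used throughout this file: the helper returns None when cupy is missing, and call sites extend the isinstance() tuple only when a cupy module is actually available. The helper mirrors the diff; the usage lines are illustrative. Repeated calls stay cheap because Python caches a successfully imported module in sys.modules.

import numpy as np

def _get_cupy():
    try:
        import cupy as _cp
    except ImportError:
        return None
    return _cp

cp = _get_cupy()
cupy_ndarray = (cp.ndarray,) if cp is not None else ()

arg = np.zeros(4)
# without cupy installed this reduces to isinstance(arg, (np.ndarray,)),
# so CPU-only installations never need the GPU dependency
print(isinstance(arg, (np.ndarray,) + cupy_ndarray))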
30 changes: 1 addition & 29 deletions kernel_tuner/integration.py
@@ -4,35 +4,7 @@

from jsonschema import validate

from kernel_tuner.util import get_instance_string, looks_like_a_filename, read_file

#specifies for a number of pre-defined objectives whether
#the objective should be minimized or maximized (boolean value denotes higher is better)
objective_default_map = {
"time": False,
"energy": False,
"fitness": True,
"cost": False,
"loss": False,
"GFLOP/s": True,
"TFLOP/s": True,
"GB/s": True,
"TB/s": True,
"GFLOPS/W": True,
"TFLOPS/W": True,
"GFLOP/J": True,
"TFLOP/J": True
}

def get_objective_defaults(objective, objective_higher_is_better):
""" Uses time as default objective and attempts to lookup objective_higher_is_better for known objectives """
objective = objective or "time"
if objective_higher_is_better is None:
if objective in objective_default_map:
objective_higher_is_better = objective_default_map[objective]
else:
raise ValueError(f"Please specify objective_higher_is_better for objective {objective}")
return objective, objective_higher_is_better
from kernel_tuner.util import get_instance_string, looks_like_a_filename, read_file, get_objective_defaults

schema_v1_0 = {
"$schema": "https://json-schema.org/draft-07/schema#",
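Usage sketch for the relocated helper, assuming kernel_tuner.util.get_objective_defaults keeps the signature and defaults of the function removed from this file:

from kernel_tuner.util import get_objective_defaults

# objective defaults to "time", which is minimized
print(get_objective_defaults(None, None))        # expected: ('time', False)

# known throughput objectives map to higher-is-better
print(get_objective_defaults("GFLOP/s", None))   # expected: ('GFLOP/s', True)

# an unknown objective with no explicit direction should raise ValueError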
92 changes: 53 additions & 39 deletions kernel_tuner/interface.py
@@ -25,6 +25,7 @@
"""

import logging
import importlib
from argparse import ArgumentParser
from ast import literal_eval
from datetime import datetime
@@ -38,7 +39,7 @@
import kernel_tuner.core as core
import kernel_tuner.util as util
from kernel_tuner.file_utils import get_input_file, get_t4_metadata, get_t4_results, import_class_from_file
from kernel_tuner.integration import get_objective_defaults
from kernel_tuner.util import get_objective_defaults
from kernel_tuner.runners.sequential import SequentialRunner
from kernel_tuner.runners.simulation import SimulationRunner
from kernel_tuner.searchspace import Searchspace
@@ -48,48 +49,61 @@
except ImportError:
torch = util.TorchPlaceHolder()

from kernel_tuner.strategies import (
basinhopping,
bayes_opt,
brute_force,
diff_evo,
dual_annealing,
firefly_algorithm,
genetic_algorithm,
greedy_ils,
greedy_mls,
minimize,
mls,
ordered_greedy_mls,
pso,
pyatf_strategies,
random_sample,
simulated_annealing,
skopt
)
from kernel_tuner.strategies.wrapper import OptAlgWrapper

strategy_map = {
"brute_force": brute_force,
"random_sample": random_sample,
"minimize": minimize,
"basinhopping": basinhopping,
"diff_evo": diff_evo,
"genetic_algorithm": genetic_algorithm,
"greedy_mls": greedy_mls,
"ordered_greedy_mls": ordered_greedy_mls,
"greedy_ils": greedy_ils,
"dual_annealing": dual_annealing,
"mls": mls,
"pso": pso,
"simulated_annealing": simulated_annealing,
"skopt": skopt,
"firefly_algorithm": firefly_algorithm,
"bayes_opt": bayes_opt,
"pyatf_strategies": pyatf_strategies,
_STRATEGY_IMPORTS = {
"brute_force": "kernel_tuner.strategies.brute_force",
"random_sample": "kernel_tuner.strategies.random_sample",
"minimize": "kernel_tuner.strategies.minimize",
"basinhopping": "kernel_tuner.strategies.basinhopping",
"diff_evo": "kernel_tuner.strategies.diff_evo",
"genetic_algorithm": "kernel_tuner.strategies.genetic_algorithm",
"greedy_mls": "kernel_tuner.strategies.greedy_mls",
"ordered_greedy_mls": "kernel_tuner.strategies.ordered_greedy_mls",
"greedy_ils": "kernel_tuner.strategies.greedy_ils",
"dual_annealing": "kernel_tuner.strategies.dual_annealing",
"mls": "kernel_tuner.strategies.mls",
"pso": "kernel_tuner.strategies.pso",
"simulated_annealing": "kernel_tuner.strategies.simulated_annealing",
"skopt": "kernel_tuner.strategies.skopt",
"firefly_algorithm": "kernel_tuner.strategies.firefly_algorithm",
"bayes_opt": "kernel_tuner.strategies.bayes_opt",
"pyatf_strategies": "kernel_tuner.strategies.pyatf_strategies",
}


def _strategy_import_error(strategy_name, module_path, err):
    base_msg = (
        f"Failed to import strategy '{strategy_name}' from '{module_path}'. "
        "This strategy may require optional dependencies that are not installed."
    )
    return ImportError(f"{base_msg} Original error: {err}")


class _LazyStrategyModule:
    def __init__(self, name, module_path):
        self._name = name
        self._module_path = module_path
        self._module = None

    def _load(self):
        if self._module is None:
            try:
                self._module = importlib.import_module(self._module_path)
            except ImportError as err:
                raise _strategy_import_error(self._name, self._module_path, err)
        return self._module

    def __getattr__(self, attr):
        return getattr(self._load(), attr)

    def __repr__(self):
        return f"<lazy strategy module '{self._name}'>"


strategy_map = {name: _LazyStrategyModule(name, path) for name, path in _STRATEGY_IMPORTS.items()}


class Options(dict):
"""read-only class for passing options around."""

@@ -651,7 +665,7 @@ def tune_kernel(
tuning_options.strategy_options = Options(strategy_options or {})
# if no strategy selected
else:
strategy = brute_force
strategy = strategy_map["brute_force"]

# select the runner for this job based on input
selected_runner = SimulationRunner if simulation_mode else SequentialRunner
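The lazy-module wrapper in isolation, as a small runnable sketch using only the standard library; _LazyStrategyModule above follows the same shape, with the extra error message added by _strategy_import_error. The payoff is that importing kernel_tuner no longer fails when the optional dependencies of a single strategy are missing; an ImportError only surfaces if that strategy is actually used.

import importlib

class LazyModule:
    """Defer importing a module until one of its attributes is first accessed."""

    def __init__(self, module_path):
        self._module_path = module_path
        self._module = None

    def __getattr__(self, attr):
        if self._module is None:
            # the real import only happens here, on first attribute access
            self._module = importlib.import_module(self._module_path)
        return getattr(self._module, attr)

lazy_json = LazyModule("json")          # nothing imported yet
print(lazy_json.dumps({"answer": 42}))  # importing 'json' is triggered by this call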
2 changes: 1 addition & 1 deletion kernel_tuner/searchspace.py
@@ -10,7 +10,6 @@
from inspect import signature

import numpy as np
from scipy.stats.qmc import LatinHypercube
from constraint import (
BacktrackingSolver,
Constraint,
@@ -1333,6 +1332,7 @@ def get_distributed_random_sample(self, num_samples: int, sampling_factor=10) ->

def get_LHS_sample_indices(self, num_samples: int) -> List[int]:
"""Get a Latin Hypercube sample of parameter configuration indices."""
from scipy.stats.qmc import LatinHypercube
if num_samples > self.size:
warn(
f"Too many samples requested ({num_samples}), reducing the number of samples to half of the searchspace size ({self.size})"
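For context, a hedged sketch of how scipy's LatinHypercube can be mapped to configuration indices; it illustrates the import that is now deferred into get_LHS_sample_indices, not the exact computation that method performs:

from scipy.stats.qmc import LatinHypercube  # only needed when LHS sampling is requested

searchspace_size = 1000
num_samples = 8

sampler = LatinHypercube(d=1)
# draw stratified samples in [0, 1) and scale them to integer indices
indices = (sampler.random(n=num_samples).ravel() * searchspace_size).astype(int)
print(sorted(indices.tolist()))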
2 changes: 1 addition & 1 deletion kernel_tuner/strategies/common.py
@@ -5,7 +5,6 @@
from time import perf_counter

import numpy as np
from scipy.spatial import distance

from kernel_tuner import util
from kernel_tuner.searchspace import Searchspace
@@ -325,6 +324,7 @@ def unscale_and_snap_to_nearest_valid(x, params, searchspace, eps):

if neighbors:
# sort on distance to x
from scipy.spatial import distance
neighbors.sort(key=lambda y: distance.euclidean(x,scale_from_params(y, searchspace.tune_params, eps)))

# return closest valid neighbor
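The same deferred-import idea in standalone form: scipy.spatial.distance is imported only when the sorting code actually runs, so the module itself can be imported without scipy installed. The helper name and data below are illustrative, not part of the diff:

def sort_by_distance_to(x, neighbors):
    # importing inside the function keeps scipy an optional, on-demand dependency
    from scipy.spatial import distance
    return sorted(neighbors, key=lambda y: distance.euclidean(x, y))

print(sort_by_distance_to([0.0, 0.0], [[1.0, 1.0], [0.2, 0.1], [0.5, 0.5]]))
# closest neighbors first: [[0.2, 0.1], [0.5, 0.5], [1.0, 1.0]]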