diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ff35d191..c1aee584 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -3,7 +3,6 @@ name: Code linting on: push: branches: - - main pull_request: jobs: diff --git a/CMakeLists.txt b/CMakeLists.txt index 637b7d0c..f6d7cf40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,11 +20,15 @@ cmake_dependent_option(GPRAT_ENABLE_TESTS "Build unit and integration tests" ${PROJECT_IS_TOP_LEVEL} "GPRAT_BUILD_CORE" OFF) cmake_dependent_option(GPRAT_ENABLE_MKL "Enable support for Intel oneMKL" ${PROJECT_IS_TOP_LEVEL} "GPRAT_BUILD_CORE" OFF) +option(GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS + "Evict data from caches before running BLAS operations" ON) option(GPRAT_ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets" ${PROJECT_IS_TOP_LEVEL}) if(GPRAT_ENABLE_FORMAT_TARGETS) + set(CMAKE_FORMAT_EXCLUDE "^external_ports/") + find_package(format QUIET) if(NOT format_FOUND) include(FetchContent) diff --git a/CMakePresets.json b/CMakePresets.json index e18ab19b..95204452 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -21,6 +21,21 @@ "deprecated": true } }, + { + "name": "vcpkg", + "hidden": true, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake", + "X_VCPKG_APPLOCAL_DEPS_INSTALL": "ON" + } + }, + { + "name": "vcpkg-win64-static", + "hidden": true, + "cacheVariables": { + "VCPKG_TARGET_TRIPLET": "x64-windows-static-md-release" + } + }, { "name": "cppcheck", "hidden": true, @@ -67,7 +82,7 @@ "description": "Note that all the flags after /W4 are required for MSVC to conform to the language standard", "hidden": true, "cacheVariables": { - "CMAKE_CXX_FLAGS": "/sdl /guard:cf /utf-8 /diagnostics:caret /w14165 /w44242 /w44254 /w44263 /w34265 /w34287 /w44296 /w44365 /w44388 /w44464 /w14545 /w14546 /w14547 /w14549 /w14555 /w34619 /w34640 /w24826 /w14905 /w14906 /w14928 /w45038 /W4 /permissive- /volatile:iso /Zc:inline /Zc:preprocessor /Zc:enumTypes /Zc:lambda /Zc:__cplusplus /Zc:externConstexpr /Zc:throwingNew /EHsc", + "CMAKE_CXX_FLAGS": "/sdl /guard:cf /utf-8 /diagnostics:caret /w14165 /w44242 /w44254 /w44263 /w34265 /w34287 /w44296 /w44365 /w44388 /w44464 /w14545 /w14546 /w14547 /w14549 /w14555 /w34619 /w34640 /w24826 /w14905 /w14906 /w14928 /w45038 /W4 /permissive- /volatile:iso /Zc:inline /Zc:preprocessor /Zc:enumTypes /Zc:lambda /Zc:__cplusplus /Zc:externConstexpr /Zc:throwingNew /EHsc /D_CRT_SECURE_NO_WARNINGS", "CMAKE_EXE_LINKER_FLAGS": "/machine:x64 /guard:cf", "CMAKE_SHARED_LINKER_FLAGS": "/machine:x64 /guard:cf" } @@ -146,7 +161,7 @@ }, { "name": "ci-windows", - "inherits": ["ci-build", "ci-win64", "ci-multi-config"] + "inherits": ["ci-build", "ci-win64", "ci-multi-config", "vcpkg", "vcpkg-win64-static"] }, { "name": "ci-ubuntu-24.04", diff --git a/README.md b/README.md index 389bc776..7c73f0e0 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ code. ## Dependencies -GPRat depends on [HPX](https://hpx-docs.stellar-group.org/latest/html/index.html) for asynchronous task-based parallelization. +GPRat depends on [HPX](https://hpx-docs.stellar-group.org/latest/html/index.html) for asynchronous task-based parallelization. Furthermore, for CPU-only BLAS computation GPRat requires [OpenBLAS](http://www.openmathlib.org/OpenBLAS/) or [MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html). A [CUDA](https://developer.nvidia.com/cuda-toolkit) installation is required for GPU-only BLAS computations. 
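The new `GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS` option above is only wired through CMake as a compile definition for `gprat_core`; the eviction itself has to live in the C++ sources. A minimal sketch of what such a guard can look like — the helper name `evict_caches` and the 64 MiB buffer size are illustrative assumptions, not code from this patch:

```cpp
// Hypothetical sketch of a cache-eviction guard, keyed on the
// GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS compile definition added above.
#include <cstddef>
#include <vector>

#ifdef GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS
inline void evict_caches()
{
    // Write to a buffer larger than the last-level cache so that tiles
    // cached by a previous BLAS call are flushed before the next timing.
    constexpr std::size_t bytes = 64 * 1024 * 1024;  // assumed LLC upper bound
    static std::vector<char> sink(bytes, 0);
    for (std::size_t i = 0; i < sink.size(); i += 64)  // one write per cache line
    {
        ++sink[i];
    }
}
#else
inline void evict_caches() { }  // no-op when the option is OFF
#endif
```

With this pattern, a benchmark calls `evict_caches()` immediately before each measured BLAS operation, so timings reflect cold-cache behavior rather than data left warm by the previous kernel.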
@@ -20,6 +20,9 @@ A script to install and setup spack for `GPRat` is provided in [`spack-repo`](sp Spack environment configurations and setup scripts for CPU and GPU use are provided in [`spack-repo/environments`](spack-repo/environments). +Since Spack is not available on Windows, we also support dependency installation using vcpkg. +For now, vcpkg builds are only tested on Windows. + ## How To Compile GPRat makes use of [CMake presets][1] to simplify the process of configuring the project. @@ -35,6 +38,7 @@ ctest --preset=dev-linux As a developer, you may create a `CMakeUserPresets.json` file at the root of the project that contains additional presets local to your machine. In addition to the build configuration `dev-linux`, there are `release-linux`, `dev-linux-gpu`, and `release-linux-gpu`. +For Windows, we have similar presets called `dev-windows` and `release-windows`. The configurations suffixed with `-gpu` build the library with CUDA. GPRat can be built with or without Python bindings. diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index bad4b5ea..5ae3222f 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -1,5 +1,5 @@ # try finding pybind11 -set(GPRat_pybind11_VERSION 2.10.3) +set(GPRat_pybind11_VERSION 2.13.6) find_package(pybind11 ${GPRat_pybind11_VERSION} QUIET) if(pybind11_FOUND) message(STATUS "Found package pybind11.") diff --git a/bindings/gprat_py.cpp b/bindings/gprat_py.cpp index b18d2279..9efb56ce 100644 --- a/bindings/gprat_py.cpp +++ b/bindings/gprat_py.cpp @@ -1,4 +1,5 @@ -#include "gprat_c.hpp" +#include "gprat/gprat.hpp" + #include #include @@ -31,19 +32,19 @@ void init_gprat(py::module &m) // Set hyperparameters to default values in `AdamParams` class, unless // specified. Python object has full access to each hyperparameter and a // string representation `__repr__`. - py::class_<gprat_hyper::AdamParams>(m, "AdamParams") + py::class_<gprat::AdamParams>(m, "AdamParams") .def(py::init(), py::arg("learning_rate") = 0.001, py::arg("beta1") = 0.9, py::arg("beta2") = 0.999, py::arg("epsilon") = 1e-8, py::arg("opt_iter") = 0) - .def_readwrite("learning_rate", &gprat_hyper::AdamParams::learning_rate) - .def_readwrite("beta1", &gprat_hyper::AdamParams::beta1) - .def_readwrite("beta2", &gprat_hyper::AdamParams::beta2) - .def_readwrite("epsilon", &gprat_hyper::AdamParams::epsilon) - .def_readwrite("opt_iter", &gprat_hyper::AdamParams::opt_iter) - .def("__repr__", &gprat_hyper::AdamParams::repr); + .def_readwrite("learning_rate", &gprat::AdamParams::learning_rate) + .def_readwrite("beta1", &gprat::AdamParams::beta1) + .def_readwrite("beta2", &gprat::AdamParams::beta2) + .def_readwrite("epsilon", &gprat::AdamParams::epsilon) + .def_readwrite("opt_iter", &gprat::AdamParams::opt_iter) + .def("__repr__", &gprat::AdamParams::repr); // Initializes Gaussian Process with `GP` class. 
Sets default parameters for // squared exponential kernel, number of regressors and trainable, unless diff --git a/bindings/utils_py.cpp b/bindings/utils_py.cpp index 277e40ef..ab44cc5a 100644 --- a/bindings/utils_py.cpp +++ b/bindings/utils_py.cpp @@ -1,5 +1,6 @@ -#include "target.hpp" -#include "utils_c.hpp" +#include "gprat/target.hpp" +#include "gprat/utils.hpp" + #include #include @@ -32,7 +33,7 @@ void start_hpx_wrapper(std::vector args, std::size_t n_cores) } argv.push_back(nullptr); int argc = static_cast(args.size()); - utils::start_hpx_runtime(argc, argv.data()); + gprat::start_hpx_runtime(argc, argv.data()); } /** @@ -43,7 +44,7 @@ void start_hpx_wrapper(std::vector args, std::size_t n_cores) void init_utils(py::module &m) { m.def("compute_train_tiles", - &utils::compute_train_tiles, + &gprat::compute_train_tiles, py::arg("n_samples"), py::arg("n_tile_size"), R"pbdoc( @@ -58,7 +59,7 @@ void init_utils(py::module &m) )pbdoc"); m.def("compute_train_tile_size", - &utils::compute_train_tile_size, + &gprat::compute_train_tile_size, py::arg("n_samples"), py::arg("n_tiles"), R"pbdoc( @@ -73,7 +74,7 @@ void init_utils(py::module &m) )pbdoc"); m.def("compute_test_tiles", - &utils::compute_test_tiles, + &gprat::compute_test_tiles, py::arg("m_samples"), py::arg("n_tiles"), py::arg("n_tile_size"), @@ -90,7 +91,7 @@ void init_utils(py::module &m) )pbdoc"); m.def("print_vector", - &utils::print_vector, + &gprat::print_vector, py::arg("vec"), py::arg("start") = 0, py::arg("end") = -1, @@ -98,11 +99,11 @@ void init_utils(py::module &m) "Print elements of a vector with optional start, end, and separator parameters"); m.def("start_hpx", &start_hpx_wrapper, py::arg("args"), py::arg("n_cores")); // Using the wrapper function - m.def("resume_hpx", &utils::resume_hpx_runtime); - m.def("suspend_hpx", &utils::suspend_hpx_runtime); - m.def("stop_hpx", &utils::stop_hpx_runtime); + m.def("resume_hpx", &gprat::resume_hpx_runtime); + m.def("suspend_hpx", &gprat::suspend_hpx_runtime); + m.def("stop_hpx", &gprat::stop_hpx_runtime); - m.def("compiled_with_cuda", &utils::compiled_with_cuda, "Check if the code was compiled with CUDA support"); + m.def("compiled_with_cuda", &gprat::compiled_with_cuda, "Check if the code was compiled with CUDA support"); m.def("print_available_gpus", &gprat::print_available_gpus, "Print available GPUs with their properties"); m.def("gpu_count", &gprat::gpu_count, "Return the number of available GPUs"); diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index da4c96d0..1a7b4db3 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -1,18 +1,20 @@ +# Option for GPU support with CUDA, cuSolver, cuBLAS +option(GPRAT_WITH_CUDA "Enable GPU support with CUDA, cuSolver, cuBLAS" OFF) + if(GPRAT_WITH_CUDA) + set(CMAKE_CUDA_STANDARD 20) + set(CMAKE_CUDA_EXTENSIONS OFF) enable_language(CUDA) endif() -# Option for GPU support with CUDA, cuSolver, cuBLAS -option(GPRAT_WITH_CUDA "Enable GPU support with CUDA, cuSolver, cuBLAS" OFF) -# Pass variable to C++ code -add_compile_definitions(GPRAT_WITH_CUDA=$) - set(SOURCE_FILES - src/gprat_c.cpp - src/utils_c.cpp + src/gprat.cpp + src/utils.cpp + src/performance_counters.cpp src/target.cpp - src/gp_kernels.cpp - src/gp_hyperparameters.cpp + src/tile_data.cpp + src/kernels.cpp + src/hyperparameters.cpp src/cpu/gp_functions.cpp src/cpu/gp_algorithms.cpp src/cpu/gp_uncertainty.cpp @@ -54,7 +56,10 @@ target_sources(gprat_core PRIVATE ${header_files}) target_link_libraries(gprat_core PUBLIC HPX::hpx) if(GPRAT_WITH_CUDA) + 
find_package(CUDAToolkit MODULE REQUIRED) target_link_libraries(gprat_core PUBLIC CUDA::cusolver CUDA::cublas) + # Flag not working for CLANG CUDA + target_compile_features(gprat_core PUBLIC cuda_std_${CMAKE_CUDA_STANDARD}) endif() # Include directories @@ -66,16 +71,19 @@ if(GPRAT_ENABLE_MKL) # Link Intel oneMKL target_link_libraries(gprat_core PUBLIC MKL::mkl_intel_lp64 MKL::mkl_core MKL::MKL MKL::mkl_sequential) + target_compile_definitions(gprat_core PUBLIC GPRAT_ENABLE_MKL) else() # Link OpenBLAS target_link_libraries(gprat_core PUBLIC ${OpenBLAS_LIB}) endif() -if(GPRAT_ENABLE_MKL) - target_compile_definitions(gprat_core PUBLIC GPRAT_ENABLE_MKL) +target_compile_definitions(gprat_core + PUBLIC GPRAT_WITH_CUDA=$) +if(GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS) + target_compile_definitions(gprat_core + PUBLIC GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS) endif() - -target_compile_features(gprat_core PUBLIC cxx_std_17) +target_compile_features(gprat_core PUBLIC cxx_std_20) set_property(TARGET gprat_core PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/core/include/cpu/adapter_cblas_fp32.hpp b/core/include/cpu/adapter_cblas_fp32.hpp deleted file mode 100644 index 9cf21915..00000000 --- a/core/include/cpu/adapter_cblas_fp32.hpp +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef CPU_ADAPTER_CBLAS_FP32_H -#define CPU_ADAPTER_CBLAS_FP32_H - -#include -#include -using vector_future = hpx::shared_future>; - -// Constants that are compatible with CBLAS -typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE; - -typedef enum BLAS_SIDE { Blas_left = 141, Blas_right = 142 } BLAS_SIDE; - -typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA; - -// typedef enum BLAS_UPLO { Blas_upper = 121, -// Blas_lower = 122 } BLAS_UPLO; - -// typedef enum BLAS_ORDERING { Blas_row_major = 101, -// Blas_col_major = 102 } BLAS_ORDERING; - -// BLAS level 3 operations - -/** - * @brief FP32 In-place Cholesky decomposition of A - * @param f_A matrix to be factorized - * @param N matrix dimension - * @return factorized, lower triangular matrix f_L - */ -vector_future potrf(vector_future f_A, const int N); - -/** - * @brief FP32 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular - * @param f_L Cholesky factor matrix - * @param f_A right hand side matrix - * @param N first dimension - * @param M second dimension - * @return solution matrix f_X - */ -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L); - -/** - * @brief FP32 Symmetric rank-k update: A = A - B * B^T - * @param f_A Base matrix - * @param f_B Symmetric update matrix - * @param N matrix dimension - * @return updated matrix f_A - */ -vector_future syrk(vector_future f_A, vector_future f_B, const int N); - -/** - * @brief FP32 General matrix-matrix multiplication: C = C - A(^T) * B(^T) - * @param f_C Base matrix - * @param f_B Right update matrix - * @param f_A Left update matrix - * @param N first matrix dimension - * @param M second matrix dimension - * @param K third matrix dimension - * @param transpose_A transpose left matrix - * @param transpose_B transpose right matrix - * @return updated matrix f_X - */ -vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, - const int N, - const int M, - const int K, - const BLAS_TRANSPOSE transpose_A, - const BLAS_TRANSPOSE transpose_B); - -// BLAS level 2 operations - -/** - * @brief FP32 In-place solve L(^T) * x = a where L lower triangular - 
* @param f_L Cholesky factor matrix - * @param f_a right hand side vector - * @param N matrix dimension - * @param transpose_L transpose Cholesky factor - * @return solution vector f_x - */ -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L); - -/** - * @brief FP32 General matrix-vector multiplication: b = b - A(^T) * a - * @param f_A update matrix - * @param f_a update vector - * @param f_b base vector - * @param N matrix dimension - * @param alpha add or substract update to base vector - * @param transpose_A transpose update matrix - * @return updated vector f_b - */ -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A); - -/** - * @brief FP32 Vector update with diagonal SYRK: r = r + diag(A^T * A) - * @param f_A update matrix - * @param f_r base vector - * @param N first matrix dimension - * @param M second matrix dimension - * @return updated vector f_r - */ -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M); - -/** - * @brief FP32 Vector update with diagonal GEMM: r = r + diag(A * B) - * @param f_A first update matrix - * @param f_B second update matrix - * @param f_r base vector - * @param N first matrix dimension - * @param M second matrix dimension - * @return updated vector f_r - */ -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M); - -// BLAS level 1 operations - -/** - * @brief FP32 AXPY: y - x - * @param f_y left vector - * @param f_x right vector - * @param N vector length - * @return y - x - */ -vector_future axpy(vector_future f_y, vector_future f_x, const int N); - -/** - * @brief FP32 Dot product: a * b - * @param f_a left vector - * @param f_b right vector - * @param N vector length - * @return f_a * f_b - */ -float dot(std::vector a, std::vector b, const int N); - -#endif // end of CPU_ADAPTER_CBLAS_FP32_H diff --git a/core/include/cpu/gp_functions.hpp b/core/include/cpu/gp_functions.hpp deleted file mode 100644 index 7079bab6..00000000 --- a/core/include/cpu/gp_functions.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#ifndef CPU_GP_FUNCTIONS_H -#define CPU_GP_FUNCTIONS_H - -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include - -namespace cpu -{ - -/** - * @brief Perform Cholesky decompositon (+Assebmly) - * - * @param training_input The training input data - * @param hyperparameters The kernel hyperparameters - * - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @return The tiled Cholesky factor - */ -std::vector> -cholesky(const std::vector &training_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int n_regressors); - -/** - * @brief Compute the predictions without uncertainties. 
- * - * @param training_input The training input data - * @param training_output The raining output data - * @param test_input The test input data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param m_tiles The number of test tiles - * @param m_tile_size The size of each test tile - * @param n_regressors The number of regressors - * - * @return A vector containing the predictions - */ -std::vector -predict(const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors); - -/** - * @brief Compute the predictions with uncertainties. - * - * @param training_input The training input data - * @param training_output The raining output data - * @param test_input The test input data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param m_tiles The number of test tiles - * @param m_tile_size The size of each test tile - * @param n_regressors The number of regressors - * - * @return A vector containing the prediction vector and the uncertainty vector - */ -std::vector> predict_with_uncertainty( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors); - -/** - * @brief Compute the predictions with full covariance matrix. - * - * @param training_input The training input data - * @param training_output The raining output data - * @param test_input The test input data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param m_tiles The number of test tiles - * @param m_tile_size The size of each test tile - * @param n_regressors The number of regressors - * - * @return A vector containing the prediction vector and the full posterior covariance matrix - */ -std::vector> predict_with_full_cov( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_data, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors); - -/** - * @brief Compute loss for given data and Gaussian process model - * - * @param training_input The training input data - * @param training_output The raining output data - * @param hyperparameters The kernel hyperparameters - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @return The loss - */ -double compute_loss(const std::vector &training_input, - const std::vector &training_output, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int n_regressors); - -/** - * @brief Perform optimization for a given number of iterations - * - * @param training_input The training input data - * @param training_output The raining output data - * - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @param hyperparams The Adam optimizer hyperparameters - * @param 
hyperparameters The kernel hyperparameters - * @param trainable_params The vector containing a bool wheather to train a hyperparameter - * - * @return A vector containing the loss values of each iteration - */ -std::vector -optimize(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params); - -/** - * @brief Perform a single optimization step - * - * @param training_input The training input data - * @param training_output The raining output data - * - * @param n_tiles The number of training tiles - * @param n_tile_size The size of each training tile - * @param n_regressors The number of regressors - * - * @param hyperparams The Adam optimizer hyperparameters - * @param hyperparameters The kernel hyperparameters - * @param trainable_params The vector containing a bool wheather to train a hyperparameter - * - * @param iter The current optimization iteration - * - * @return The loss value - */ -double optimize_step(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params, - int iter); - -} // end of namespace cpu - -#endif // end of CPU_GP_FUNCTIONS_H diff --git a/core/include/cpu/gp_uncertainty.hpp b/core/include/cpu/gp_uncertainty.hpp deleted file mode 100644 index 28089584..00000000 --- a/core/include/cpu/gp_uncertainty.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef CPU_GP_UNCERTAINTY_H -#define CPU_GP_UNCERTAINTY_H - -#include -#include - -namespace cpu -{ - -/** - * @brief Extract diagonal elements of the matrix A. - * - * @param A The matrix - * @param M The rumber of rows in the matrix - * - * @return Diagonal element vector of the matrix A of size M - */ -// std::vector get_matrix_diagonal(const std::vector &A, std::size_t M); -hpx::shared_future> get_matrix_diagonal(hpx::shared_future> f_A, std::size_t M); - -} // end of namespace cpu - -#endif // end of CPU_GP_UNCERTAINTY_H diff --git a/core/include/cpu/tiled_algorithms.hpp b/core/include/cpu/tiled_algorithms.hpp deleted file mode 100644 index 28c25c05..00000000 --- a/core/include/cpu/tiled_algorithms.hpp +++ /dev/null @@ -1,183 +0,0 @@ -#ifndef CPU_TILED_ALGORITHMS_H -#define CPU_TILED_ALGORITHMS_H - -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include - -using Tiled_matrix = std::vector>>; -using Tiled_vector = std::vector>>; - -namespace cpu -{ - -// Tiled Cholesky Algorithm - -/** - * @brief Perform right-looking tiled Cholesky decomposition. - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles, containing the - * covariance matrix, afterwards the Cholesky decomposition. - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - */ -void right_looking_cholesky_tiled(Tiled_matrix &ft_tiles, int N, std::size_t n_tiles); - -// Tiled Triangular Solve Algorithms - -/** - * @brief Perform tiled forward triangular matrix-vector solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. 
- */ -void forward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles); - -/** - * @brief Perform tiled backward triangular matrix-vector solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - */ -void backward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles); - -/** - * @brief Perform tiled forward triangular matrix-matrix solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void forward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Perform tiled backward triangular matrix-matrix solve. - * - * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. - * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void backward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Perform tiled matrix-vector multiplication - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_vector Tiled vector represented as a vector of futurized tiles. - * @param ft_rhsTiled solution represented as a vector of futurized tiles. - * @param N_row Tile size of first dimension. - * @param N_col Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void matrix_vector_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_vector, - Tiled_vector &ft_rhs, - int N_row, - int N_col, - std::size_t n_tiles, - std::size_t m_tiles); - -/** - * @brief Perform tiled symmetric k-rank update on diagonal tiles - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_vector Tiled vector holding the diagonal tile results - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. - */ -void symmetric_matrix_matrix_diagonal_tiled( - Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Perform tiled symmetric k-rank update (ft_tiles^T * ft_tiles) - * - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_result Tiled matrix holding the result of the computationi. - * @param N Tile size of first dimension. - * @param M Tile size of second dimension. - * @param n_tiles Number of tiles in first dimension. - * @param m_tiles Number of tiles in second dimension. 
- */ -void symmetric_matrix_matrix_tiled( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_result, int N, int M, std::size_t n_tiles, std::size_t m_tiles); - -/** - * @brief Compute the difference between two tiled vectors - * @param ft_minuend Tiled vector that is being subtracted from. - * @param ft_subtrahend Tiled vector that is being subtracted. - * @param ft_difference Tiled vector that contains the result of the substraction. - * @param M Tile size dimension. - * @param m_tiles Number of tiles. - */ -void vector_difference_tiled(Tiled_vector &ft_minuend, Tiled_vector &ft_substrahend, int M, std::size_t m_tiles); - -/** - * @brief Extract the tiled diagonals of a tiled matrix - * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. - * @param ft_vector Tiled vector containing the diagonals of the matrix tiles - * @param M Tile size per dimension. - * @param m_tiles Number of tiles per dimension. - */ -void matrix_diagonal_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int M, std::size_t m_tiles); - -/** - * @brief Compute the negative log likelihood loss with a tiled covariance matrix K. - * - * Computes l = 0.5 * ( log(det(K)) + y^T * K^-1 * y) + const.) - * - * @param ft_tiles Tiled Cholesky factor matrix represented as a vector of futurized tiles. - * @param ft_alpha Tiled vector containing the solution of K^-1 * y - * @param ft_y Tiled vector containing the the training output y - * @param loss The loss value to be computed - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - */ -void compute_loss_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_alpha, - Tiled_vector &ft_y, - hpx::shared_future &loss, - int N, - std::size_t n_tiles); - -/** - * @brief Updates a hyperparameter of the SEK kernel using Adam - * - * @param ft_invK Tiled inverse of the covariance matrix K represented as a vector of futurized tiles. - * @param ft_grad_param Tiled covariance matrix gradient w.r.t. a hyperparameter. - * @param ft_alpha Tiled vector containing the precomputed inv(K) * y where y is the training output. - * @param adam_params Hyperparameter of the Adam optimizer - * @param sek_params Hyperparameters of the SEK kernel - * @param N Tile size per dimension. - * @param n_tiles Number of tiles per dimension. - * @param iter Current iteration. - * @param param_idx Index of the hyperparameter to optimize. 
- */ -void update_hyperparameter_tiled( - const Tiled_matrix &ft_invK, - const Tiled_matrix &ft_gradK_param, - const Tiled_vector &ft_alpha, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - int N, - std::size_t n_tiles, - std::size_t iter, - std::size_t param_idx); - -} // end of namespace cpu - -#endif // end of CPU_TILED_ALGORITHMS_H diff --git a/core/include/gprat/cpu/adapter_cblas_fp32.hpp b/core/include/gprat/cpu/adapter_cblas_fp32.hpp new file mode 100644 index 00000000..015646b5 --- /dev/null +++ b/core/include/gprat/cpu/adapter_cblas_fp32.hpp @@ -0,0 +1,160 @@ +#ifndef GPRAT_CPU_ADAPTER_CBLAS_FP32_HPP +#define GPRAT_CPU_ADAPTER_CBLAS_FP32_HPP + +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/tile_data.hpp" + +#include + +GPRAT_NS_BEGIN + +// Constants that are compatible with CBLAS +typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE; + +typedef enum BLAS_SIDE { Blas_left = 141, Blas_right = 142 } BLAS_SIDE; + +typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA; + +// typedef enum BLAS_UPLO { Blas_upper = 121, +// Blas_lower = 122 } BLAS_UPLO; + +// typedef enum BLAS_ORDERING { Blas_row_major = 101, +// Blas_col_major = 102 } BLAS_ORDERING; + +// BLAS level 3 operations + +/** + * @brief FP32 In-place Cholesky decomposition of A + * @param A matrix to be factorized + * @param N matrix dimension + * @return factorized, lower triangular matrix f_L + */ +mutable_tile_data potrf(const mutable_tile_data &A, int N); + +/** + * @brief FP32 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular + * @param L Cholesky factor matrix + * @param A right hand side matrix + * @param N first dimension + * @param M second dimension + * @return solution matrix f_X + */ +mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + int N, + int M, + BLAS_TRANSPOSE transpose_L, + BLAS_SIDE side_L); + +/** + * @brief FP32 Symmetric rank-k update: A = A - B * B^T + * @param A Base matrix + * @param B Symmetric update matrix + * @param N matrix dimension + * @return updated matrix f_A + */ +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, int N); + +/** + * @brief FP32 General matrix-matrix multiplication: C = C - A(^T) * B(^T) + * @param C Base matrix + * @param B Right update matrix + * @param A Left update matrix + * @param N first matrix dimension + * @param M second matrix dimension + * @param K third matrix dimension + * @param transpose_A transpose left matrix + * @param transpose_B transpose right matrix + * @return updated matrix f_X + */ +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, + int N, + int M, + int K, + BLAS_TRANSPOSE transpose_A, + BLAS_TRANSPOSE transpose_B); + +// BLAS level 2 operations + +/** + * @brief FP32 In-place solve L(^T) * x = a where L lower triangular + * @param L Cholesky factor matrix + * @param a right hand side vector + * @param N matrix dimension + * @param transpose_L transpose Cholesky factor + * @return solution vector f_x + */ +mutable_tile_data +trsv(const const_tile_data &L, const mutable_tile_data &a, int N, BLAS_TRANSPOSE transpose_L); + +/** + * @brief FP32 General matrix-vector multiplication: b = b - A(^T) * a + * @param A update matrix + * @param a update vector + * @param b base vector + * @param N matrix dimension + * @param alpha add or subtract update to base vector + * @param transpose_A transpose update matrix + * 
@return updated vector f_b + */ +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + int N, + int M, + BLAS_ALPHA alpha, + BLAS_TRANSPOSE transpose_A); + +/** + * @brief FP32 Vector update with diagonal SYRK: r = r + diag(A^T * A) + * @param A update matrix + * @param r base vector + * @param N first matrix dimension + * @param M second matrix dimension + * @return updated vector f_r + */ +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, int N, int M); + +/** + * @brief FP32 Vector update with diagonal GEMM: r = r + diag(A * B) + * @param A first update matrix + * @param B second update matrix + * @param r base vector + * @param N first matrix dimension + * @param M second matrix dimension + * @return updated vector f_r + */ +mutable_tile_data dot_diag_gemm( + const const_tile_data &A, const const_tile_data &B, const mutable_tile_data &r, int N, int M); + +// BLAS level 1 operations + +/** + * @brief FP32 AXPY: y - x + * @param y left vector + * @param x right vector + * @param N vector length + * @return y - x + */ +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, int N); + +/** + * @brief FP32 Dot product: a * b + * @param a left vector + * @param b right vector + * @param N vector length + * @return f_a * f_b + */ +float dot(std::span a, std::span b, int N); + +GPRAT_NS_END + +#endif diff --git a/core/include/cpu/adapter_cblas_fp64.hpp b/core/include/gprat/cpu/adapter_cblas_fp64.hpp similarity index 51% rename from core/include/cpu/adapter_cblas_fp64.hpp rename to core/include/gprat/cpu/adapter_cblas_fp64.hpp index b3c95420..c2dab5d7 100644 --- a/core/include/cpu/adapter_cblas_fp64.hpp +++ b/core/include/gprat/cpu/adapter_cblas_fp64.hpp @@ -1,13 +1,16 @@ -#ifndef CPU_ADAPTER_CBLAS_FP64_H -#define CPU_ADAPTER_CBLAS_FP64_H +#ifndef GPRAT_CPU_ADAPTER_CBLAS_FP64_HPP +#define GPRAT_CPU_ADAPTER_CBLAS_FP64_HPP -#include -#include +#pragma once -using vector_future = hpx::shared_future>; +#include "gprat/detail/config.hpp" +#include "gprat/tile_data.hpp" -// Constants that are compatible with CBLAS +#include + +GPRAT_NS_BEGIN +// Constants that are compatible with CBLAS typedef enum BLAS_TRANSPOSE { Blas_no_trans = 111, Blas_trans = 112 } BLAS_TRANSPOSE; typedef enum BLAS_SIDE { Blas_left = 141, Blas_right = 142 } BLAS_SIDE; @@ -24,41 +27,42 @@ typedef enum BLAS_ALPHA { Blas_add = 1, Blas_substract = -1 } BLAS_ALPHA; /** * @brief FP64 In-place Cholesky decomposition of A - * @param f_A matrix to be factorized + * @param A matrix to be factorized * @param N matrix dimension * @return factorized, lower triangular matrix f_L */ -vector_future potrf(vector_future f_A, const int N); +mutable_tile_data potrf(const mutable_tile_data &A, int N); /** * @brief FP64 In-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular - * @param f_L Cholesky factor matrix - * @param f_A right hand side matrix + * @param L Cholesky factor matrix + * @param A right hand side matrix * @param N first dimension * @param M second dimension * @return solution matrix f_X */ -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L); +mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + int N, + int M, + BLAS_TRANSPOSE transpose_L, + BLAS_SIDE side_L); /** * @brief FP64 Symmetric rank-k update: A = A - B * B^T - * @param f_A Base matrix - * @param f_B Symmetric update 
matrix + * @param A Base matrix + * @param B Symmetric update matrix * @param N matrix dimension * @return updated matrix f_A */ -vector_future syrk(vector_future f_A, vector_future f_B, const int N); +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, int N); /** * @brief FP64 General matrix-matrix multiplication: C = C - A(^T) * B(^T) - * @param f_C Base matrix - * @param f_B Right update matrix - * @param f_A Left update matrix + * @param C Base matrix + * @param B Right update matrix + * @param A Left update matrix * @param N first matrix dimension * @param M second matrix dimension * @param K third matrix dimension @@ -66,66 +70,74 @@ vector_future syrk(vector_future f_A, vector_future f_B, const int N); * @param transpose_B transpose right matrix * @return updated matrix f_X */ -vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, - const int N, - const int M, - const int K, - const BLAS_TRANSPOSE transpose_A, - const BLAS_TRANSPOSE transpose_B); +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, + int N, + int M, + int K, + BLAS_TRANSPOSE transpose_A, + BLAS_TRANSPOSE transpose_B); // BLAS level 2 operations /** * @brief FP64 In-place solve L(^T) * x = a where L lower triangular - * @param f_L Cholesky factor matrix - * @param f_a right hand side vector + * @param L Cholesky factor matrix + * @param a right hand side vector * @param N matrix dimension * @param transpose_L transpose Cholesky factor * @return solution vector f_x */ -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L); +mutable_tile_data +trsv(const const_tile_data &L, const mutable_tile_data &a, int N, BLAS_TRANSPOSE transpose_L); /** * @brief FP64 General matrix-vector multiplication: b = b - A(^T) * a - * @param f_A update matrix - * @param f_a update vector - * @param f_b base vector + * @param A update matrix + * @param a update vector + * @param b base vector * @param N matrix dimension * @param alpha add or substract update to base vector * @param transpose_A transpose update matrix * @return updated vector f_b */ -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A); +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + int N, + int M, + BLAS_ALPHA alpha, + BLAS_TRANSPOSE transpose_A); /** * @brief FP64 Vector update with diagonal SYRK: r = r + diag(A^T * A) - * @param f_A update matrix - * @param f_r base vector + * @param A update matrix + * @param r base vector * @param N first matrix dimension * @param M second matrix dimension * @return updated vector f_r */ -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M); +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, int N, int M); /** * @brief FP64 Vector update with diagonal GEMM: r = r + diag(A * B) - * @param f_A first update matrix - * @param f_B second update matrix - * @param f_r base vector + * @param A first update matrix + * @param B second update matrix + * @param r base vector * @param N first matrix dimension * @param M second matrix dimension * @return updated vector f_r */ -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M); +mutable_tile_data +dot_diag_gemm(const const_tile_data 
&A, + const const_tile_data &B, + const mutable_tile_data &r, + int N, + int M); // BLAS level 1 operations @@ -136,7 +148,7 @@ vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future * @param N vector length * @return y - x */ -vector_future axpy(vector_future f_y, vector_future f_x, const int N); +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, int N); /** * @brief FP64 Dot product: a * b @@ -145,6 +157,8 @@ vector_future axpy(vector_future f_y, vector_future f_x, const int N); * @param N vector length * @return a * b */ -double dot(std::vector a, std::vector b, const int N); +double dot(std::span a, std::span b, int N); + +GPRAT_NS_END -#endif // end of CPU_ADAPTER_CBLAS_FP64_H +#endif diff --git a/core/include/cpu/gp_algorithms.hpp b/core/include/gprat/cpu/gp_algorithms.hpp similarity index 71% rename from core/include/cpu/gp_algorithms.hpp rename to core/include/gprat/cpu/gp_algorithms.hpp index b8a6f043..210810fd 100644 --- a/core/include/cpu/gp_algorithms.hpp +++ b/core/include/gprat/cpu/gp_algorithms.hpp @@ -1,30 +1,34 @@ -#ifndef CPU_GP_ALGORITHMS_H -#define CPU_GP_ALGORITHMS_H +#ifndef GPRAT_CPU_GP_ALGORITHMS_HPP +#define GPRAT_CPU_GP_ALGORITHMS_HPP -#include "gp_kernels.hpp" +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/kernels.hpp" +#include "gprat/tile_data.hpp" + +#include #include +GPRAT_NS_BEGIN + namespace cpu { /** * @brief Compute the squared exponential kernel of two feature vectors * - * @param i_global The global index of the first feature vector - * @param j_global The global index of the second feature vector * @param n_regressors The number of regressors - * @param hyperparameters The kernel hyperparameters + * @param sek_params The kernel hyperparameters * @param i_input The first feature vector * @param j_input The second feature vector * - * @return The entry of a covariance function at position i_global,j_global + * @return The entry of a covariance function */ -double compute_covariance_function(std::size_t i_global, - std::size_t j_global, - std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &i_input, - const std::vector &j_input); +double compute_covariance_function(std::size_t n_regressors, + const SEKParams &sek_params, + std::span i_input, + std::span j_input); /** * @brief Generate a tile of the covariance matrix @@ -39,13 +43,13 @@ double compute_covariance_function(std::size_t i_global, * @return A quadratic tile of the covariance matrix of size N x N * @note Does apply noise variance on the diagonal */ -std::vector gen_tile_covariance( +mutable_tile_data gen_tile_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input); + const SEKParams &sek_params, + std::span input); /** * @brief Generate a tile of the prior covariance matrix @@ -61,13 +65,13 @@ std::vector gen_tile_covariance( * @note Does NOT apply noise variance on the diagonal */ // NAME: gen_tile_priot_covariance -std::vector gen_tile_full_prior_covariance( +mutable_tile_data gen_tile_full_prior_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input); + const SEKParams &sek_params, + std::span input); /** * @brief Generate the diagonal of a diagonal tile in the prior covariance matrix @@ -83,13 +87,13 @@ std::vector gen_tile_full_prior_covariance( * @note Does NOT apply noise 
variance */ // NAME: gen_tile_diag_prior_covariance -std::vector<double> gen_tile_prior_covariance( +mutable_tile_data<double> gen_tile_prior_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector<double> &input); + const SEKParams &sek_params, + std::span<const double> input); /** * @brief Generate a tile of the cross-covariance matrix @@ -105,15 +109,15 @@ std::vector<double> gen_tile_prior_covariance( * @return A tile of the cross covariance matrix of size N_row x N_col * @note Does NOT apply noise variance */ -std::vector<double> gen_tile_cross_covariance( +mutable_tile_data<double> gen_tile_cross_covariance( std::size_t row, std::size_t col, std::size_t N_row, std::size_t N_col, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector<double> &row_input, - const std::vector<double> &col_input); + const SEKParams &sek_params, + std::span<const double> row_input, + std::span<const double> col_input); /** * @brief Transpose a tile of size N_row x N_col @@ -124,7 +128,7 @@ std::vector<double> gen_tile_cross_covariance( * * @return The transposed tile of size N_col x N_row */ -std::vector<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, const std::vector<double> &tile); +mutable_tile_data<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, std::span<const double> tile); /** * @brief Generate a tile of the output data @@ -135,7 +139,7 @@ std::vector<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, con * * @return A tile of the output data of size N */ -std::vector<double> gen_tile_output(std::size_t row, std::size_t N, const std::vector<double> &output); +mutable_tile_data<double> gen_tile_output(std::size_t row, std::size_t N, std::span<const double> output); /** * @brief Compute the L2-error norm over all tiles and elements @@ -158,7 +162,7 @@ double compute_error_norm(std::size_t n_tiles, * * @return A tile filled with zeros of size N */ -std::vector<double> gen_tile_zeros(std::size_t N); +mutable_tile_data<double> gen_tile_zeros(std::size_t N); /** * @brief Generate an identity tile (i==j?1:0) @@ -166,8 +170,10 @@ std::vector<double> gen_tile_zeros(std::size_t N); * @param N The dimension of the quadratic tile * @return A NxN identity tile */ -std::vector<double> gen_tile_identity(std::size_t N); +mutable_tile_data<double> gen_tile_identity(std::size_t N); } // end of namespace cpu -#endif // end of CPU_GP_ALGORITHMS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/cpu/gp_functions.hpp b/core/include/gprat/cpu/gp_functions.hpp new file mode 100644 index 00000000..55a9e0e3 --- /dev/null +++ b/core/include/gprat/cpu/gp_functions.hpp @@ -0,0 +1,1171 @@ +#ifndef GPRAT_CPU_GP_FUNCTIONS_HPP +#define GPRAT_CPU_GP_FUNCTIONS_HPP + +#pragma once + +#include "gprat/cpu/gp_algorithms.hpp" +#include "gprat/cpu/tiled_algorithms.hpp" +#include "gprat/detail/config.hpp" +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/scheduler.hpp" +#include "gprat/tile_data.hpp" + +#include + +GPRAT_NS_BEGIN + +namespace cpu +{ + +/** + * @brief Perform Cholesky decomposition (+Assembly) + * + * @param training_input The training input data + * @param sek_params The kernel hyperparameters + * + * @param n_tiles The number of training tiles + * @param n_tile_size The size of each training tile + * @param n_regressors The number of regressors + * + * @return The tiled Cholesky factor + */ +template <typename Scheduler> +std::vector<std::vector<double>> +cholesky(Scheduler &sched, + const std::vector<double> &training_input, + const SEKParams &sek_params, + std::size_t n_tiles, + std::size_t n_tile_size, + std::size_t n_regressors) +{ + // Tiled covariance matrix K_NxN + auto K_tiles = 
make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + for (std::size_t row = 0; row < n_tiles; row++) + { + for (std::size_t col = 0; col <= row; col++) + { + K_tiles[row * n_tiles + col] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, row, col), + "assemble_tiled_K", + K_tiles[row * n_tiles + col], + row, + col, + n_tile_size, + n_regressors, + sek_params, + training_input); + } + } + + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Synchronize + std::vector<std::vector<double>> result(n_tiles * n_tiles); + for (std::size_t i = 0; i < n_tiles; i++) + { + for (std::size_t j = 0; j <= i; j++) + { + result[i * n_tiles + j] = K_tiles[i * n_tiles + j].get(); + } + } + return result; +} + +/** + * @brief Compute the predictions without uncertainties. + * + * @param training_input The training input data + * @param training_output The training output data + * @param test_input The test input data + * @param sek_params The kernel hyperparameters + * @param n_tiles The number of training tiles + * @param n_tile_size The size of each training tile + * @param m_tiles The number of test tiles + * @param m_tile_size The size of each test tile + * @param n_regressors The number of regressors + * + * @return A vector containing the predictions + */ +template <typename Scheduler> +std::vector<double> +predict(Scheduler &sched, + const std::vector<double> &training_input, + const std::vector<double> &training_output, + const std::vector<double> &test_input, + const SEKParams &sek_params, + std::size_t n_tiles, + std::size_t n_tile_size, + std::size_t m_tiles, + std::size_t m_tile_size, + std::size_t n_regressors) +{ + /* + * Prediction: hat(y)_M = cross(K)_MxN * K^-1_NxN * y_N + * - Covariance matrix K_NxN + * - Cross-covariance cross(K)_MxN + * - Training output y_N + * - Prediction output hat(y)_M + * + * Algorithm: + * 1: Compute lower triangular part of covariance matrix K + * 2: Compute Cholesky factor L of K + * 3: Compute prediction hat(y): + * - triangular solve L * beta = y + * - triangular solve L^T * alpha = beta + * - compute hat(y) = cross(K) * alpha + */ + + /////////////////////////////////////////////////////////////////////////// + // Cholesky + + // Tiled covariance matrix K_NxN + auto K_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + for (std::size_t row = 0; row < n_tiles; row++) + { + for (std::size_t col = 0; col <= row; col++) + { + K_tiles[row * n_tiles + col] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, row, col), + "assemble_tiled_K", + K_tiles[row * n_tiles + col], + row, + col, + n_tile_size, + n_regressors, + sek_params, + training_input); + } + } + + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Prediction + + // Tiled cross_covariance matrix K_NxM + auto cross_covariance_tiles = make_tiled_dataset( + sched, + m_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + // Tiled 
solution + auto prediction_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, m_tiles, tile_index); }); + // Tiled intermediate solution + auto alpha_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); }); + + for (std::size_t i = 0; i < n_tiles; i++) + { + alpha_tiles[i] = detail::named_make_tile( + sched, + schedule::alpha_tile(sched, n_tiles, i), + "assemble_tiled_alpha", + alpha_tiles[i], + i, + n_tile_size, + training_output); + } + + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + cross_covariance_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::cross_covariance_tile(sched, n_tiles, i, j), + "assemble_pred", + cross_covariance_tiles[i * n_tiles + j], + i, + j, + m_tile_size, + n_tile_size, + n_regressors, + sek_params, + test_input, + training_input); + } + } + + for (std::size_t i = 0; i < m_tiles; i++) + { + prediction_tiles[i] = detail::named_make_tile( + sched, schedule::prediction_tile(sched, m_tiles, i), "assemble_tiled", prediction_tiles[i], m_tile_size); + } + + // Launch asynchronous triangular solve L * (L^T * alpha) = y + forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + + // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha + matrix_vector_tiled( + sched, cross_covariance_tiles, alpha_tiles, prediction_tiles, m_tile_size, n_tile_size, n_tiles, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Synchronize prediction + // Preallocate memory + std::vector<double> prediction_result; + prediction_result.reserve(test_input.size()); + for (std::size_t i = 0; i < m_tiles; i++) + { + mutable_tile_data<double> tile = prediction_tiles[i].get(); + std::copy_n(tile.data(), tile.size(), std::back_inserter(prediction_result)); + } + return prediction_result; +} + +/** + * @brief Compute the predictions with uncertainties. 
+ * + * @param training_input The training input data + * @param training_output The training output data + * @param test_input The test input data + * @param sek_params The kernel hyperparameters + * @param n_tiles The number of training tiles + * @param n_tile_size The size of each training tile + * @param m_tiles The number of test tiles + * @param m_tile_size The size of each test tile + * @param n_regressors The number of regressors + * + * @return A vector containing the prediction vector and the uncertainty vector + */ +template <typename Scheduler> +std::vector<std::vector<double>> predict_with_uncertainty( + Scheduler &sched, + const std::vector<double> &training_input, + const std::vector<double> &training_output, + const std::vector<double> &test_input, + const SEKParams &sek_params, + std::size_t n_tiles, + std::size_t n_tile_size, + std::size_t m_tiles, + std::size_t m_tile_size, + std::size_t n_regressors) +{ + /* + * Prediction: hat(y) = cross(K) * K^-1 * y + * Uncertainty: diag(Sigma) = diag(prior(K)) - diag(cross(K)^T * K^-1 * cross(K)) + * - Covariance matrix K_NxN + * - Cross-covariance cross(K)_MxN + * - Prior covariance prior(K)_MxM + * - Training output y_N + * - Prediction output hat(y)_M + * - Posterior covariance matrix Sigma_MxM + * + * Algorithm: + * 1: Compute lower triangular part of covariance matrix K + * 2: Compute Cholesky factor L of K + * 3: Compute prediction hat(y): + * - triangular solve L * beta = y + * - triangular solve L^T * alpha = beta + * - compute hat(y) = cross(K) * alpha + * 4: Compute uncertainty diag(Sigma): + * - triangular solve L * V = cross(K)^T + * - compute diag(W) = diag(V^T * V) + * - compute diag(Sigma) = diag(prior(K)) - diag(W) + */ + + /////////////////////////////////////////////////////////////////////////// + // Cholesky + + // Tiled covariance matrix K_NxN + auto K_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + for (std::size_t row = 0; row < n_tiles; row++) + { + for (std::size_t col = 0; col <= row; col++) + { + K_tiles[row * n_tiles + col] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, row, col), + "assemble_tiled_K", + K_tiles[row * n_tiles + col], + row, + col, + n_tile_size, + n_regressors, + sek_params, + training_input); + } + } + + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Prediction + + // Tiled intermediate solution + auto alpha_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); }); + for (std::size_t i = 0; i < n_tiles; i++) + { + alpha_tiles[i] = detail::named_make_tile( + sched, + schedule::alpha_tile(sched, n_tiles, i), + "assemble_tiled_alpha", + alpha_tiles[i], + i, + n_tile_size, + training_output); + } + + // Tiled cross_covariance matrix K_NxM + auto cross_covariance_tiles = make_tiled_dataset( + sched, + m_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + cross_covariance_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::cross_covariance_tile(sched, n_tiles, i, j), + "assemble_pred", + cross_covariance_tiles[i * n_tiles + 
j], + i, + j, + m_tile_size, + n_tile_size, + n_regressors, + sek_params, + test_input, + training_input); + } + } + + // Tiled solution + auto prediction_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, m_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + prediction_tiles[i] = detail::named_make_tile( + sched, schedule::prediction_tile(sched, m_tiles, i), "assemble_tiled", prediction_tiles[i], m_tile_size); + } + + // Launch asynchronous triangular solve L * (L^T * alpha) = y + forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + + // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha + matrix_vector_tiled( + sched, cross_covariance_tiles, alpha_tiles, prediction_tiles, m_tile_size, n_tile_size, n_tiles, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Uncertainty + + // Tiled transposed cross_covariance matrix K_MxN + auto t_cross_covariance_tiles = make_tiled_dataset( + sched, + n_tiles * m_tiles, + [&](std::size_t tile_index) + { return schedule::t_cross_covariance_tile(sched, m_tiles, tile_index / m_tiles, tile_index % m_tiles); }); + for (std::size_t j = 0; j < n_tiles; j++) + { + for (std::size_t i = 0; i < m_tiles; i++) + { + t_cross_covariance_tiles[j * m_tiles + i] = detail::named_make_tile( + sched, + schedule::t_cross_covariance_tile(sched, m_tiles, j, i), + "assemble_pred", + t_cross_covariance_tiles[j * m_tiles + i], + m_tile_size, + n_tile_size, + cross_covariance_tiles[i * n_tiles + j]); + } + } + + // Tiled prior covariance matrix diagonal diag(K_MxM) + auto prior_K_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prior_K_tile(sched, n_tiles, 0, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + prior_K_tiles[i] = detail::named_make_tile( + sched, + schedule::prior_K_tile(sched, m_tiles, 0, i), + "assemble_tiled", + prior_K_tiles[i], + i, + i, + m_tile_size, + n_regressors, + sek_params, + test_input); + } + + // Tiled uncertainty solution + auto uncertainty_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::uncertainty_tile(sched, m_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + uncertainty_tiles[i] = detail::named_make_tile( + sched, + schedule::uncertainty_tile(sched, m_tiles, i), + "assemble_prior_inter", + uncertainty_tiles[i], + m_tile_size); + } + + // Launch asynchronous triangular solve L * V = cross(K)^T + forward_solve_tiled_matrix(sched, K_tiles, t_cross_covariance_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles); + + // Launch asynchronous computation diag(W) = diag(V^T * V) + symmetric_matrix_matrix_diagonal_tiled( + sched, t_cross_covariance_tiles, uncertainty_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles); + + // Launch asynchronous computation diag(Sigma) = diag(prior(K)) - diag(W) + vector_difference_tiled(sched, prior_K_tiles, uncertainty_tiles, m_tile_size, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Preallocate memory + std::vector<double> prediction_result; + std::vector<double> uncertainty_result; + prediction_result.reserve(test_input.size()); + uncertainty_result.reserve(test_input.size()); + + // Synchronize prediction + for (std::size_t i = 0; i < m_tiles; i++) + { + mutable_tile_data<double> tile = prediction_tiles[i].get(); + 
std::copy_n(tile.begin(), tile.size(), std::back_inserter(prediction_result));
+    }
+
+    // Synchronize uncertainty
+    for (std::size_t i = 0; i < m_tiles; i++)
+    {
+        mutable_tile_data tile = uncertainty_tiles[i].get();
+        std::copy_n(tile.begin(), tile.size(), std::back_inserter(uncertainty_result));
+    }
+
+    return std::vector<std::vector<double>>{ std::move(prediction_result), std::move(uncertainty_result) };
+}
+
+/**
+ * @brief Compute the predictions with full covariance matrix.
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ * @param test_input The test input data
+ * @param sek_params The kernel hyperparameters
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param m_tiles The number of test tiles
+ * @param m_tile_size The size of each test tile
+ * @param n_regressors The number of regressors
+ *
+ * @return A vector containing the prediction vector and the full posterior covariance matrix
+ */
+template <typename Scheduler>
+std::vector<std::vector<double>> predict_with_full_cov(
+    Scheduler &sched,
+    const std::vector<double> &training_input,
+    const std::vector<double> &training_output,
+    const std::vector<double> &test_input,
+    const SEKParams &sek_params,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t m_tiles,
+    std::size_t m_tile_size,
+    std::size_t n_regressors)
+{
+    /*
+     * Prediction: hat(y)_M = cross(K) * K^-1 * y
+     * Full covariance: Sigma = prior(K) - cross(K) * K^-1 * cross(K)^T
+     * - Covariance matrix K_NxN
+     * - Cross-covariance cross(K)_MxN
+     * - Prior covariance prior(K)_MxM
+     * - Training output y_N
+     * - Prediction output hat(y)_M
+     * - Posterior covariance matrix Sigma_MxM
+     *
+     * Algorithm:
+     * 1: Compute lower triangular part of covariance matrix K
+     * 2: Compute Cholesky factor L of K
+     * 3: Compute prediction hat(y):
+     *    - triangular solve L * beta = y
+     *    - triangular solve L^T * alpha = beta
+     *    - compute hat(y) = cross(K) * alpha
+     * 4: Compute full covariance matrix Sigma:
+     *    - triangular solve L * V = cross(K)^T
+     *    - compute W = V^T * V
+     *    - compute Sigma = prior(K) - W
+     * 5: Compute diag(Sigma)
+     */
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Cholesky
+
+    // Tiled covariance matrix K_NxN
+    auto K_tiles = make_tiled_dataset(
+        sched,
+        n_tiles * n_tiles,
+        [&](std::size_t tile_index)
+        { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); });
+    for (std::size_t row = 0; row < n_tiles; row++)
+    {
+        for (std::size_t col = 0; col <= row; col++)
+        {
+            K_tiles[row * n_tiles + col] = detail::named_make_tile(
+                sched,
+                schedule::covariance_tile(sched, n_tiles, row, col),
+                "assemble_tiled_K",
+                K_tiles[row * n_tiles + col],
+                row,
+                col,
+                n_tile_size,
+                n_regressors,
+                sek_params,
+                training_input);
+        }
+    }
+
+    // Launch asynchronous Cholesky decomposition: K = L * L^T
+    right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Prediction
+
+    // Tiled intermediate solution
+    auto alpha_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); });
+    for (std::size_t i = 0; i < n_tiles; i++)
+    {
+        alpha_tiles[i] = detail::named_make_tile(
+            sched,
+            schedule::alpha_tile(sched, n_tiles, i),
+            "assemble_tiled_alpha",
+            alpha_tiles[i],
+            i,
+            n_tile_size,
+            training_output);
+    }
+
+    // Tiled cross_covariance matrix K_MxN
+    auto cross_covariance_tiles =
make_tiled_dataset( + sched, + m_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + cross_covariance_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::cross_covariance_tile(sched, n_tiles, i, j), + "assemble_pred", + cross_covariance_tiles[i * n_tiles + j], + i, + j, + m_tile_size, + n_tile_size, + n_regressors, + sek_params, + test_input, + training_input); + } + } + + // Tiled solution + auto prediction_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, n_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + prediction_tiles[i] = detail::named_make_tile( + sched, schedule::prediction_tile(sched, m_tiles, i), "assemble_tiled", prediction_tiles[i], m_tile_size); + } + + // Launch asynchronous triangular solve L * (L^T * alpha) = y + forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles); + + // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha + matrix_vector_tiled( + sched, cross_covariance_tiles, alpha_tiles, prediction_tiles, m_tile_size, n_tile_size, n_tiles, m_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Uncertainty + + // Tiled transposed cross_covariance matrix K_MxN + auto t_cross_covariance_tiles = make_tiled_dataset( + sched, + n_tiles * m_tiles, + [&](std::size_t tile_index) + { return schedule::t_cross_covariance_tile(sched, m_tiles, tile_index / m_tiles, tile_index % m_tiles); }); + for (std::size_t j = 0; j < n_tiles; j++) + { + for (std::size_t i = 0; i < m_tiles; i++) + { + t_cross_covariance_tiles[j * m_tiles + i] = detail::named_make_tile( + sched, + schedule::t_cross_covariance_tile(sched, m_tiles, j, i), + "assemble_pred", + t_cross_covariance_tiles[j * m_tiles + i], + m_tile_size, + n_tile_size, + cross_covariance_tiles[i * n_tiles + j]); + } + } + + // Tiled prior covariance matrix K_MxM + auto prior_K_tiles = make_tiled_dataset( + sched, + m_tiles * m_tiles, + [&](std::size_t tile_index) + { return schedule::prior_K_tile(sched, n_tiles, tile_index / m_tiles, tile_index % m_tiles); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + for (std::size_t j = 0; j <= i; j++) + { + prior_K_tiles[i * m_tiles + j] = detail::named_make_tile( + sched, + schedule::prior_K_tile(sched, m_tiles, i, j), + "assemble_prior_tiled", + prior_K_tiles[i * m_tiles + j], + i, + j, + m_tile_size, + n_regressors, + sek_params, + test_input); + + if (i != j) + { + prior_K_tiles[j * m_tiles + i] = detail::named_make_tile( + sched, + schedule::prior_K_tile(sched, m_tiles, j, i), + "assemble_prior_tiled", + prior_K_tiles[j * m_tiles + i], + m_tile_size, + m_tile_size, + prior_K_tiles[i * m_tiles + j]); + } + } + } + + // Tiled uncertainty solution + auto uncertainty_tiles = make_tiled_dataset( + sched, m_tiles, [&](std::size_t tile_index) { return schedule::uncertainty_tile(sched, m_tiles, tile_index); }); + for (std::size_t i = 0; i < m_tiles; i++) + { + uncertainty_tiles[i] = detail::named_make_tile( + sched, + schedule::uncertainty_tile(sched, m_tiles, i), + "assemble_prior_inter", + uncertainty_tiles[i], + m_tile_size); + } + + // Launch asynchronous triangular solve L * V = cross(K)^T + forward_solve_tiled_matrix(sched, K_tiles, 
t_cross_covariance_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Launch asynchronous computation of full covariance Sigma = prior(K) - V^T * V
+    symmetric_matrix_matrix_tiled(
+        sched, t_cross_covariance_tiles, prior_K_tiles, n_tile_size, m_tile_size, n_tiles, m_tiles);
+    ///////////////////////////////////////////////////////////////////////////
+    // Launch asynchronous computation of uncertainty diag(Sigma)
+    matrix_diagonal_tiled(sched, prior_K_tiles, uncertainty_tiles, m_tile_size, m_tiles);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Preallocate memory
+    std::vector<double> prediction_result;
+    std::vector<double> uncertainty_result;
+    prediction_result.reserve(test_input.size());
+    uncertainty_result.reserve(test_input.size());
+
+    // Synchronize prediction
+    for (std::size_t i = 0; i < m_tiles; i++)
+    {
+        mutable_tile_data tile = prediction_tiles[i].get();
+        std::copy_n(tile.begin(), tile.size(), std::back_inserter(prediction_result));
+    }
+
+    // Synchronize uncertainty
+    for (std::size_t i = 0; i < m_tiles; i++)
+    {
+        mutable_tile_data tile = uncertainty_tiles[i].get();
+        std::copy_n(tile.begin(), tile.size(), std::back_inserter(uncertainty_result));
+    }
+
+    return std::vector<std::vector<double>>{ std::move(prediction_result), std::move(uncertainty_result) };
+}
+
+///////////////////////////////////////////////////////////////////////////
+// OPTIMIZATION
+
+/**
+ * @brief Compute loss for given data and Gaussian process model
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ * @param sek_params The kernel hyperparameters
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param n_regressors The number of regressors
+ *
+ * @return The loss
+ */
+template <typename Scheduler>
+double calculate_loss(Scheduler &sched,
+                      const std::vector<double> &training_input,
+                      const std::vector<double> &training_output,
+                      const SEKParams &sek_params,
+                      std::size_t n_tiles,
+                      std::size_t n_tile_size,
+                      std::size_t n_regressors)
+{
+    /*
+     * Negative log likelihood loss:
+     * loss(theta) = 0.5 * ( log(det(K)) + y^T * K^-1 * y + N * log(2 * pi) )
+     * - Covariance matrix K(theta)_NxN
+     * - Training output y_N
+     * - Hyperparameters theta = { v, l, v_n }
+     *
+     * Algorithm:
+     * 1: Compute lower triangular part of covariance matrix K
+     * 2: Compute Cholesky factor L of K
+     * 3: Compute alpha = K^-1 * y:
+     *    - triangular solve L * beta = y
+     *    - triangular solve L^T * alpha = beta
+     * 4: Compute negative log likelihood loss
+     *    - Calculate sum_i^N log(L_ii^2)
+     *    - Calculate y^T * alpha
+     *    - Add constant N * log (2 * pi)
+     */
+
+    // Tiled covariance matrix K_NxN
+    auto K_tiles = make_tiled_dataset(
+        sched,
+        n_tiles * n_tiles,
+        [&](std::size_t tile_index)
+        { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); });
+    for (std::size_t row = 0; row < n_tiles; row++)
+    {
+        for (std::size_t col = 0; col <= row; col++)
+        {
+            K_tiles[row * n_tiles + col] = detail::named_make_tile(
+                sched,
+                schedule::covariance_tile(sched, n_tiles, row, col),
+                "assemble_tiled_K",
+                K_tiles[row * n_tiles + col],
+                row,
+                col,
+                n_tile_size,
+                n_regressors,
+                sek_params,
+                training_input);
+        }
+    }
+
+    // Tiled intermediate solution
+    auto alpha_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); });
+    for (std::size_t i = 0; i < n_tiles; i++)
+    {
+        alpha_tiles[i] = detail::named_make_tile(
+            sched,
+            schedule::alpha_tile(sched, n_tiles, i),
+            "assemble_tiled_alpha",
+            alpha_tiles[i],
+            i,
+            n_tile_size,
+            training_output);
+    }
+
+    // Tiled output
+    auto y_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, n_tiles, tile_index); });
+    for (std::size_t i = 0; i < n_tiles; i++)
+    {
+        y_tiles[i] = detail::named_make_tile(
+            sched,
+            schedule::prediction_tile(sched, n_tiles, i),
+            "assemble_tiled_alpha",
+            y_tiles[i],
+            i,
+            n_tile_size,
+            training_output);
+    }
+
+    // Launch asynchronous Cholesky decomposition: K = L * L^T
+    right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles);
+
+    // Launch asynchronous triangular solve L * (L^T * alpha) = y
+    forward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles);
+    backward_solve_tiled(sched, K_tiles, alpha_tiles, n_tile_size, n_tiles);
+
+    // Launch asynchronous loss computation
+    return compute_loss_tiled(sched, K_tiles, alpha_tiles, y_tiles, n_tile_size, n_tiles).get();
+}
+
+/**
+ * @brief Perform optimization for a given number of iterations
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ *
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param n_regressors The number of regressors
+ *
+ * @param adam_params The Adam optimizer hyperparameters
+ * @param sek_params The kernel hyperparameters
+ * @param trainable_params The vector containing a bool whether to train a hyperparameter
+ *
+ * @return A vector containing the loss values of each iteration
+ */
+template <typename Scheduler>
+std::vector<double>
+optimize(Scheduler &sched,
+         const std::vector<double> &training_input,
+         const std::vector<double> &training_output,
+         std::size_t n_tiles,
+         std::size_t n_tile_size,
+         std::size_t n_regressors,
+         const AdamParams &adam_params,
+         SEKParams &sek_params,
+         std::vector<bool> trainable_params,
+         std::size_t start_iter = 0)
+{
+    /*
+     * - Hyperparameters theta = { v, l, v_n }
+     * - Covariance matrix K(theta)
+     * - Training output y
+     *
+     * Algorithm:
+     * for opt_iter:
+     * 1: Compute distance for entries of covariance matrix K
+     * 2: Compute lower triangular part of K with distance
+     * 3: Compute lower triangular gradients for delta(K)/delta(v), and delta(K)/delta(l) with distance
+     *
+     * 4: Compute Cholesky factor L of K
+     * 5: Compute K^-1:
+     *    - triangular solve L * {} = I
+     *    - triangular solve L^T * K^-1 = {}
+     * 6: Compute beta = K^-1 * y
+     *
+     * 7: Compute negative log likelihood loss
+     *    - Calculate 0.5 sum_i^N log(L_ii^2)
+     *    - Calculate 0.5 y^T * beta
+     *    - Add constant N / 2 * log (2 * pi)
+     *
+     * 8: Compute delta(loss)/delta(param_i)
+     *    - Compute trace(K^-1 * delta(K)/delta(theta_i))
+     *    - Compute beta^T * delta(K)/delta(theta_i) * beta
+     * 9: Update hyperparameters theta with Adam optimizer
+     *    - m_T = beta1 * m_T-1 + (1 - beta1) * g_T
+     *    - w_T = beta2 * w_T-1 + (1 - beta2) * g_T^2
+     *    - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T)
+     *    - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon)
+     * endfor
+     */
+
+    // data holder for computed loss values
+    std::vector<double> losses;
+    losses.reserve(static_cast<std::size_t>(adam_params.opt_iter));
+
+    // Tiled output
+    auto y_tiles = make_tiled_dataset(
+        sched, n_tiles, [&](std::size_t tile_index) { return schedule::prediction_tile(sched, n_tiles, tile_index); });
+    // Launch asynchronous assembly of output y
+    for (std::size_t i = 0; i < n_tiles; i++)
+ { + y_tiles[i] = detail::named_make_tile( + sched, + schedule::prediction_tile(sched, n_tiles, i), + "assemble_y", + y_tiles[i], + i, + n_tile_size, + training_output); + } + + ////////////////////////////////////////////////////////////////////////////// + // per-loop tiles + + // Tiled covariance matrix K_NxN + auto K_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::covariance_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + // Tiled inverse covariance matrix K^-1_NxN + auto K_inv_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::K_inv_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + // Tiled intermediate solution + auto alpha_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::alpha_tile(sched, n_tiles, tile_index); }); + + // Tiled future data structures for gradients + + // Tiled covariance with gradient v + auto grad_v_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::K_grad_v_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + // Tiled covariance with gradient l + auto grad_l_tiles = make_tiled_dataset( + sched, + n_tiles * n_tiles, + [&](std::size_t tile_index) + { return schedule::K_grad_l_tile(sched, n_tiles, tile_index / n_tiles, tile_index % n_tiles); }); + + auto inter_alpha = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::inter_alpha_tile(sched, n_tiles, tile_index); }); + + auto diag_tiles = make_tiled_dataset( + sched, n_tiles, [&](std::size_t tile_index) { return schedule::diag_tile(sched, n_tiles, tile_index); }); + + ////////////////////////////////////////////////////////////////////////////// + // Perform optimization + for (std::size_t iter = start_iter; iter < static_cast(adam_params.opt_iter); iter++) + { + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous assembly of tiled covariance matrix, derivative of covariance matrix + // vector w.r.t. to vertical lengthscale and derivative of covariance + // matrix vector w.r.t. 
to lengthscale + for (std::size_t i = 0; i < n_tiles; i++) + { + for (std::size_t j = 0; j <= i; j++) + { + // Compute the distance (z_i - z_j) of K entries to reuse + hpx::shared_future> cov_dists = detail::named_async( + "assemble_cov_dist", i, j, n_tile_size, n_regressors, sek_params, training_input); + + K_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::covariance_tile(sched, n_tiles, i, j), + "assemble_K", + K_tiles[i * n_tiles + j], + i, + j, + n_tile_size, + sek_params, + cov_dists); + if (trainable_params[0]) + { + grad_l_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_grad_l_tile(sched, n_tiles, i, j), + "assemble_gradl", + grad_l_tiles[i * n_tiles + j], + n_tile_size, + sek_params, + cov_dists); + if (i != j) + { + grad_l_tiles[j * n_tiles + i] = detail::named_make_tile( + sched, + schedule::K_grad_l_tile(sched, n_tiles, j, i), + "assemble_gradl_t", + grad_l_tiles[j * n_tiles + i], + n_tile_size, + n_tile_size, + grad_l_tiles[i * n_tiles + j]); + } + } + + if (trainable_params[1]) + { + grad_v_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_grad_v_tile(sched, n_tiles, i, j), + "assemble_gradv", + grad_v_tiles[i * n_tiles + j], + n_tile_size, + sek_params, + cov_dists); + if (i != j) + { + grad_v_tiles[j * n_tiles + i] = detail::named_make_tile( + sched, + schedule::K_grad_v_tile(sched, n_tiles, j, i), + "assemble_gradv_t", + grad_v_tiles[j * n_tiles + i], + n_tile_size, + n_tile_size, + grad_v_tiles[i * n_tiles + j]); + } + } + } + } + + // Assembly with reallocation -> optimize to only set existing values + for (std::size_t i = 0; i < n_tiles; i++) + { + alpha_tiles[i] = detail::named_make_tile( + sched, schedule::alpha_tile(sched, n_tiles, i), "assemble_tiled_alpha", alpha_tiles[i], n_tile_size); + } + + for (std::size_t i = 0; i < n_tiles; i++) + { + for (std::size_t j = 0; j < n_tiles; j++) + { + if (i == j) + { + K_inv_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_inv_tile(sched, n_tiles, i, j), + "assemble_identity_matrix", + K_inv_tiles[i * n_tiles + j], + n_tile_size); + } + else + { + K_inv_tiles[i * n_tiles + j] = detail::named_make_tile( + sched, + schedule::K_inv_tile(sched, n_tiles, i, j), + "assemble_identity_matrix", + K_inv_tiles[i * n_tiles + j], + n_tile_size * n_tile_size); + } + } + } + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous Cholesky decomposition: K = L * L^T + right_looking_cholesky_tiled(sched, K_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous compute K^-1 through L* (L^T * X) = I + forward_solve_tiled_matrix(sched, K_tiles, K_inv_tiles, n_tile_size, n_tile_size, n_tiles, n_tiles); + backward_solve_tiled_matrix(sched, K_tiles, K_inv_tiles, n_tile_size, n_tile_size, n_tiles, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous compute beta = inv(K) * y + matrix_vector_tiled(sched, K_inv_tiles, y_tiles, alpha_tiles, n_tile_size, n_tile_size, n_tiles, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch asynchronous loss computation where + // loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) + auto loss_value = compute_loss_tiled(sched, K_tiles, alpha_tiles, y_tiles, n_tile_size, n_tiles); + + /////////////////////////////////////////////////////////////////////////// + // Launch 
asynchronous update of the hyperparameters
+        if (trainable_params[0])
+        { // lengthscale
+            update_hyperparameter_tiled_lengthscale(
+                sched,
+                K_inv_tiles,
+                grad_l_tiles,
+                alpha_tiles,
+                adam_params,
+                diag_tiles,
+                inter_alpha,
+                sek_params,
+                n_tile_size,
+                n_tiles,
+                iter,
+                0);
+        }
+        if (trainable_params[1])
+        { // vertical_lengthscale
+            update_hyperparameter_tiled_lengthscale(
+                sched,
+                K_inv_tiles,
+                grad_v_tiles,
+                alpha_tiles,
+                adam_params,
+                diag_tiles,
+                inter_alpha,
+                sek_params,
+                n_tile_size,
+                n_tiles,
+                iter,
+                1);
+        }
+        if (trainable_params[2])
+        { // noise_variance
+            update_hyperparameter_tiled_noise_variance(
+                sched, K_inv_tiles, alpha_tiles, adam_params, sek_params, n_tile_size, n_tiles, iter, 2);
+        }
+        // Synchronize after iteration
+        losses.push_back(loss_value.get());
+    }
+    return losses;
+}
+
+/**
+ * @brief Perform a single optimization step
+ *
+ * @param training_input The training input data
+ * @param training_output The training output data
+ *
+ * @param n_tiles The number of training tiles
+ * @param n_tile_size The size of each training tile
+ * @param n_regressors The number of regressors
+ *
+ * @param adam_params The Adam optimizer hyperparameters
+ * @param sek_params The kernel hyperparameters
+ * @param trainable_params The vector containing a bool whether to train a hyperparameter
+ *
+ * @param iter The current optimization iteration
+ *
+ * @return The loss value
+ */
+template <typename Scheduler>
+double optimize_step(Scheduler &sched,
+                     const std::vector<double> &training_input,
+                     const std::vector<double> &training_output,
+                     std::size_t n_tiles,
+                     std::size_t n_tile_size,
+                     std::size_t n_regressors,
+                     AdamParams &adam_params,
+                     SEKParams &sek_params,
+                     std::vector<bool> trainable_params,
+                     std::size_t iter)
+{
+    // No point in copy&pasting everything for this function
+    const auto old_opt_iter = adam_params.opt_iter;
+    adam_params.opt_iter = iter + 1;
+    const auto r = optimize(
+        sched,
+        training_input,
+        training_output,
+        n_tiles,
+        n_tile_size,
+        n_regressors,
+        adam_params,
+        sek_params,
+        trainable_params,
+        iter);
+    adam_params.opt_iter = old_opt_iter;
+    return r[0];
+}
+
+} // end of namespace cpu
+
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/cpu/gp_optimizer.hpp b/core/include/gprat/cpu/gp_optimizer.hpp
similarity index 80%
rename from core/include/cpu/gp_optimizer.hpp
rename to core/include/gprat/cpu/gp_optimizer.hpp
index c632e87b..1712597d 100644
--- a/core/include/cpu/gp_optimizer.hpp
+++ b/core/include/gprat/cpu/gp_optimizer.hpp
@@ -1,10 +1,17 @@
-#ifndef CPU_GP_OPTIMIZER_H
-#define CPU_GP_OPTIMIZER_H
+#ifndef GPRAT_CPU_GP_OPTIMIZER_H
+#define GPRAT_CPU_GP_OPTIMIZER_H
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+#include "gprat/hyperparameters.hpp"
+#include "gprat/kernels.hpp"
+#include "gprat/tile_data.hpp"
 
-#include "gp_hyperparameters.hpp"
-#include "gp_kernels.hpp"
 #include <vector>
 
+GPRAT_NS_BEGIN
+
 namespace cpu
 {
 
@@ -54,7 +61,7 @@ double compute_sigmoid(double parameter);
 double compute_covariance_distance(std::size_t i_global,
                                    std::size_t j_global,
                                    std::size_t n_regressors,
-                                   const gprat_hyper::SEKParams &sek_params,
+                                   const SEKParams &sek_params,
                                    const std::vector<double> &i_input,
                                    const std::vector<double> &j_input);
 
@@ -70,12 +77,12 @@ double compute_covariance_distance(std::size_t i_global,
  *
  * @return A quadratic tile containing the distance between the features of size N x N
  */
-std::vector<double> gen_tile_distance(
+mutable_tile_data gen_tile_distance(
     std::size_t row,
     std::size_t col,
     std::size_t N,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
+ const SEKParams &sek_params, const std::vector &input); /** @@ -89,12 +96,12 @@ std::vector gen_tile_distance( * * @return A quadratic tile of the covariance matrix of size N x N */ -std::vector gen_tile_covariance_with_distance( +mutable_tile_data gen_tile_covariance_with_distance( std::size_t row, std::size_t col, std::size_t N, - const gprat_hyper::SEKParams &sek_params, - const std::vector &distance); + const SEKParams &sek_params, + const const_tile_data &distance); /** * @brief Generate a derivative tile w.r.t. vertical_lengthscale v @@ -105,8 +112,8 @@ std::vector gen_tile_covariance_with_distance( * * @return A quadratic tile of the derivative of v of size N x N */ -std::vector -gen_tile_grad_v(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance); +mutable_tile_data +gen_tile_grad_v(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance); /** * @brief Generate a derivative tile w.r.t. lengthscale l @@ -117,8 +124,8 @@ gen_tile_grad_v(std::size_t N, const gprat_hyper::SEKParams &sek_params, const s * * @return A quadratic tile of the derivative of l of size N x N */ -std::vector -gen_tile_grad_l(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance); +mutable_tile_data +gen_tile_grad_l(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance); /** * @brief Update biased first raw moment estimate: m_T+1 = beta_1 * m_T + (1 - beta_1) * g_T. @@ -153,11 +160,8 @@ double update_second_moment(double gradient, double v_T, double beta_2); * * @return The updated hyperparameter */ -double adam_step(const double unconstrained_hyperparam, - const gprat_hyper::AdamParams &adam_params, - double m_T, - double v_T, - std::size_t iter); +double +adam_step(double unconstrained_hyperparam, const AdamParams &adam_params, double m_T, double v_T, std::size_t iter); /** * @brief Compute negative-log likelihood on one tile. @@ -168,9 +172,9 @@ double adam_step(const double unconstrained_hyperparam, * * @return Return l = y^T * alpha + \sum_i^N log(L_ii^2) */ -double compute_loss(const std::vector &K_diag_tile, - const std::vector &alpha_tile, - const std::vector &y_tile, +double compute_loss(std::span K_diag_tile, + std::span alpha_tile, + std::span y_tile, std::size_t N); /** @@ -182,7 +186,7 @@ double compute_loss(const std::vector &K_diag_tile, * * @return The added up loss plus the constant factor */ -double add_losses(const std::vector &losses, std::size_t N, std::size_t n); +double add_losses(std::span losses, std::size_t N, std::size_t n); /** * @brief Compute the loss gradient. @@ -204,7 +208,7 @@ double compute_gradient(double trace, double dot, std::size_t N, std::size_t n_t * * @return The updated global trace */ -double compute_trace(const std::vector &diagonal, double trace); +double compute_trace(std::span diagonal, double trace); /** * @brief Add the dot product of a vector to a global result. 
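Aside: the moment-update and step helpers declared above compose into one Adam update per hyperparameter. The following is an illustrative, hedged sketch of those documented formulas only; `adam_update` and its signature are hypothetical and are not part of the GPRat API.

// Illustrative composition of the Adam recurrences documented above.
// Hypothetical helper, not library code.
#include <cmath>
#include <cstddef>

double adam_update(double theta, double gradient, double &m_T, double &w_T,
                   double learning_rate, double beta1, double beta2,
                   double epsilon, std::size_t iter)
{
    m_T = beta1 * m_T + (1.0 - beta1) * gradient;             // first raw moment
    w_T = beta2 * w_T + (1.0 - beta2) * gradient * gradient;  // second raw moment
    const double beta1_T = std::pow(beta1, static_cast<double>(iter + 1));
    const double beta2_T = std::pow(beta2, static_cast<double>(iter + 1));
    const double nu_T = learning_rate * std::sqrt(1.0 - beta2_T) / (1.0 - beta1_T);
    return theta - nu_T * m_T / (std::sqrt(w_T) + epsilon);   // updated parameter
}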
@@ -215,7 +219,7 @@ double compute_trace(const std::vector<double> &diagonal, double trace);
  *
  * @return The updated global result
  */
-double compute_dot(const std::vector<double> &vector_T, const std::vector<double> &vector, double result);
+double compute_dot(std::span<const double> vector_T, std::span<const double> vector, double result);
 
 /**
  * @brief Add the local trace of a matrix tile to the global trace
@@ -226,8 +230,10 @@ double compute_dot(const std::vector<double> &vector_T, const std::vector<double
-double compute_trace_diag(const std::vector<double> &tile, double trace, std::size_t N);
+double compute_trace_diag(std::span<const double> tile, double trace, std::size_t N);
 
 } // end of namespace cpu
 
-#endif // end of CPU_GP_OPTIMIZER_H
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/gprat/cpu/gp_uncertainty.hpp b/core/include/gprat/cpu/gp_uncertainty.hpp
new file mode 100644
index 00000000..cb402119
--- /dev/null
+++ b/core/include/gprat/cpu/gp_uncertainty.hpp
@@ -0,0 +1,28 @@
+#ifndef GPRAT_CPU_GP_UNCERTAINTY_HPP
+#define GPRAT_CPU_GP_UNCERTAINTY_HPP
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+#include "gprat/tile_data.hpp"
+
+GPRAT_NS_BEGIN
+
+namespace cpu
+{
+
+/**
+ * @brief Extract diagonal elements of the matrix A.
+ *
+ * @param A The matrix
+ * @param M The number of rows in the matrix
+ *
+ * @return Diagonal element vector of the matrix A of size M
+ */
+mutable_tile_data get_matrix_diagonal(const const_tile_data &A, std::size_t M);
+
+} // end of namespace cpu
+
+GPRAT_NS_END
+
+#endif // end of GPRAT_CPU_GP_UNCERTAINTY_HPP
diff --git a/core/include/gprat/cpu/tiled_algorithms.hpp b/core/include/gprat/cpu/tiled_algorithms.hpp
new file mode 100644
index 00000000..718e4d5b
--- /dev/null
+++ b/core/include/gprat/cpu/tiled_algorithms.hpp
@@ -0,0 +1,657 @@
+#ifndef GPRAT_CPU_TILED_ALGORITHMS_H
+#define GPRAT_CPU_TILED_ALGORITHMS_H
+
+#pragma once
+
+#include "gprat/cpu/adapter_cblas_fp64.hpp"
+#include "gprat/cpu/gp_algorithms.hpp"
+#include "gprat/cpu/gp_optimizer.hpp"
+#include "gprat/cpu/gp_uncertainty.hpp"
+#include "gprat/detail/async_helpers.hpp"
+#include "gprat/detail/config.hpp"
+#include "gprat/hyperparameters.hpp"
+#include "gprat/kernels.hpp"
+#include "gprat/scheduler.hpp"
+
+#include
+
+GPRAT_NS_BEGIN
+
+namespace cpu
+{
+
+namespace impl
+{
+void update_parameters(
+    const AdamParams &adam_params,
+    SEKParams &sek_params,
+    std::size_t N,
+    std::size_t n_tiles,
+    std::size_t iter,
+    std::size_t param_idx,
+    double trace,
+    double dot,
+    bool jitter,
+    double factor);
+}
+
+// Tiled Cholesky Algorithm
+
+/**
+ * @brief Perform right-looking tiled Cholesky decomposition.
+ *
+ * @param tiles Tiled matrix represented as a vector of futurized tiles, containing the
+ *              covariance matrix, afterwards the Cholesky decomposition.
+ * @param N Tile size per dimension.
+ * @param n_tiles Number of tiles per dimension.
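+ *
+ * Per iteration k, a POTRF factorizes the diagonal tile, TRSMs solve for the
+ * tiles below it, and SYRK/GEMM update the trailing submatrix; each BLAS
+ * operation is launched as an asynchronous task on the scheduler.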
+ */ +template +void right_looking_cholesky_tiled(Scheduler &sched, Tiles &tiles, std::size_t N, std::size_t n_tiles) +{ + for (std::size_t k = 0; k < n_tiles; k++) + { + // POTRF: Compute Cholesky factor L + tiles[k * n_tiles + k] = detail::named_dataflow( + sched, schedule::cholesky_potrf(sched, n_tiles, k), "cholesky_tiled", tiles[k * n_tiles + k], N); + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // TRSM: Solve X * L^T = A + tiles[m * n_tiles + k] = detail::named_dataflow( + sched, + schedule::cholesky_trsm(sched, n_tiles, k, m), + "cholesky_tiled", + tiles[k * n_tiles + k], + tiles[m * n_tiles + k], + N, + N, + Blas_trans, + Blas_right); + } + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // SYRK: A = A - B * B^T + tiles[m * n_tiles + m] = detail::named_dataflow( + sched, + schedule::cholesky_syrk(sched, n_tiles, m), + "cholesky_tiled", + tiles[m * n_tiles + m], + tiles[m * n_tiles + k], + N); + for (std::size_t n = k + 1; n < m; n++) + { + // GEMM: C = C - A * B^T + tiles[m * n_tiles + n] = detail::named_dataflow( + sched, + schedule::cholesky_gemm(sched, n_tiles, k, m, n), + "cholesky_tiled", + tiles[m * n_tiles + k], + tiles[n * n_tiles + k], + tiles[m * n_tiles + n], + N, + N, + N, + Blas_no_trans, + Blas_trans); + } + } + } +} + +// Tiled Triangular Solve Algorithms + +/** + * @brief Perform tiled forward triangular matrix-vector solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. + */ +template +void forward_solve_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_rhs, std::size_t N, std::size_t n_tiles) +{ + for (std::size_t k = 0; k < n_tiles; k++) + { + // TRSM: Solve L * x = a + ft_rhs[k] = detail::named_dataflow( + sched, + schedule::solve_trsv(sched, n_tiles, k), + "triangular_solve_tiled", + ft_tiles[k * n_tiles + k], + ft_rhs[k], + N, + Blas_no_trans); + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // GEMV: b = b - A * a + ft_rhs[m] = detail::named_dataflow( + sched, + schedule::solve_gemv(sched, n_tiles, k, m), + "triangular_solve_tiled", + ft_tiles[m * n_tiles + k], + ft_rhs[k], + ft_rhs[m], + N, + N, + Blas_substract, + Blas_no_trans); + } + } +} + +/** + * @brief Perform tiled backward triangular matrix-vector solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side vector, afterwards containing the tiled solution vector + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. 
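+ *
+ * Note: traverses the tiles bottom-up (k = n_tiles - 1 .. 0) and applies the
+ * transposed Cholesky factor, i.e. it solves L^T * x = a.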
+ */ +template +void backward_solve_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_rhs, std::size_t N, std::size_t n_tiles) +{ + for (int k_ = static_cast(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison + { + std::size_t k = static_cast(k_); + // TRSM: Solve L^T * x = a + ft_rhs[k] = detail::named_dataflow( + sched, + schedule::solve_trsm(sched, n_tiles, k), + "triangular_solve_tiled", + ft_tiles[k * n_tiles + k], + ft_rhs[k], + N, + Blas_trans); + for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison + { + std::size_t m = static_cast(m_); + // GEMV:b = b - A^T * a + ft_rhs[m] = detail::named_dataflow( + sched, + schedule::solve_gemv(sched, n_tiles, k, m), + "triangular_solve_tiled", + ft_tiles[k * n_tiles + m], + ft_rhs[k], + ft_rhs[m], + N, + N, + Blas_substract, + Blas_trans); + } + } +} + +/** + * @brief Perform tiled forward triangular matrix-matrix solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. + * @param N Tile size of first dimension. + * @param M Tile size of second dimension. + * @param n_tiles Number of tiles in first dimension. + * @param m_tiles Number of tiles in second dimension. + */ +template +void forward_solve_tiled_matrix( + Scheduler &sched, + Tiles &ft_tiles, + Tiles &ft_rhs, + std::size_t N, + std::size_t M, + std::size_t n_tiles, + std::size_t m_tiles) +{ + for (std::size_t c = 0; c < m_tiles; c++) + { + for (std::size_t k = 0; k < n_tiles; k++) + { + // TRSM: solve L * X = A + ft_rhs[k * m_tiles + c] = detail::named_dataflow( + sched, + schedule::solve_matrix_trsm(sched, m_tiles, c, k), + "triangular_solve_tiled_matrix", + ft_tiles[k * n_tiles + k], + ft_rhs[k * m_tiles + c], + N, + M, + Blas_no_trans, + Blas_left); + for (std::size_t m = k + 1; m < n_tiles; m++) + { + // GEMM: C = C - A * B + ft_rhs[m * m_tiles + c] = detail::named_dataflow( + sched, + schedule::solve_matrix_gemm(sched, m_tiles, c, k, m), + "triangular_solve_tiled_matrix", + ft_tiles[m * n_tiles + k], + ft_rhs[k * m_tiles + c], + ft_rhs[m * m_tiles + c], + N, + M, + N, + Blas_no_trans, + Blas_no_trans); + } + } + } +} + +/** + * @brief Perform tiled backward triangular matrix-matrix solve. + * + * @param ft_tiles Tiled triangular matrix represented as a vector of futurized tiles. + * @param ft_rhs Tiled right-hand side matrix, afterwards containing the tiled solution matrix. + * @param N Tile size of first dimension. + * @param M Tile size of second dimension. + * @param n_tiles Number of tiles in first dimension. + * @param m_tiles Number of tiles in second dimension. 
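+ *
+ * Note: column-wise variant of the backward vector solve above; each of the
+ * m_tiles right-hand-side columns is solved independently with L^T.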
+ */
+template <typename Scheduler, typename Tiles>
+void backward_solve_tiled_matrix(
+    Scheduler &sched,
+    Tiles &ft_tiles,
+    Tiles &ft_rhs,
+    std::size_t N,
+    std::size_t M,
+    std::size_t n_tiles,
+    std::size_t m_tiles)
+{
+    for (std::size_t c = 0; c < m_tiles; c++)
+    {
+        for (int k_ = static_cast<int>(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison
+        {
+            std::size_t k = static_cast<std::size_t>(k_);
+            // TRSM: solve L^T * X = A
+            ft_rhs[k * m_tiles + c] = detail::named_dataflow(
+                sched,
+                schedule::solve_matrix_trsm(sched, m_tiles, c, k),
+                "triangular_solve_tiled_matrix",
+                ft_tiles[k * n_tiles + k],
+                ft_rhs[k * m_tiles + c],
+                N,
+                M,
+                Blas_trans,
+                Blas_left);
+            for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison
+            {
+                std::size_t m = static_cast<std::size_t>(m_);
+                // GEMM: C = C - A^T * B
+                ft_rhs[m * m_tiles + c] = detail::named_dataflow(
+                    sched,
+                    schedule::solve_matrix_gemm(sched, m_tiles, c, k, m),
+                    "triangular_solve_tiled_matrix",
+                    ft_tiles[k * n_tiles + m],
+                    ft_rhs[k * m_tiles + c],
+                    ft_rhs[m * m_tiles + c],
+                    N,
+                    M,
+                    N,
+                    Blas_trans,
+                    Blas_no_trans);
+            }
+        }
+    }
+}
+
+/**
+ * @brief Perform tiled matrix-vector multiplication
+ *
+ * @param ft_tiles Tiled matrix represented as a vector of futurized tiles.
+ * @param ft_vector Tiled vector represented as a vector of futurized tiles.
+ * @param ft_rhs Tiled solution represented as a vector of futurized tiles.
+ * @param N_row Tile size of first dimension.
+ * @param N_col Tile size of second dimension.
+ * @param n_tiles Number of tiles in first dimension.
+ * @param m_tiles Number of tiles in second dimension.
+ */
+template <typename Scheduler, typename Tiles>
+void matrix_vector_tiled(Scheduler &sched,
+                         Tiles &ft_tiles,
+                         Tiles &ft_vector,
+                         Tiles &ft_rhs,
+                         std::size_t N_row,
+                         std::size_t N_col,
+                         std::size_t n_tiles,
+                         std::size_t m_tiles)
+{
+    for (std::size_t k = 0; k < m_tiles; k++)
+    {
+        for (std::size_t m = 0; m < n_tiles; m++)
+        {
+            ft_rhs[k] = detail::named_dataflow(
+                sched,
+                schedule::multiply_gemv(sched, n_tiles, k, m),
+                "prediction_tiled",
+                ft_tiles[k * n_tiles + m],
+                ft_vector[m],
+                ft_rhs[k],
+                N_row,
+                N_col,
+                Blas_add,
+                Blas_no_trans);
+        }
+    }
+}
+
+/**
+ * @brief Perform tiled symmetric k-rank update on diagonal tiles
+ *
+ * @param ft_tiles Tiled matrix represented as a vector of futurized tiles.
+ * @param ft_vector Tiled vector holding the diagonal tile results
+ * @param N Tile size of first dimension.
+ * @param M Tile size of second dimension.
+ * @param n_tiles Number of tiles in first dimension.
+ * @param m_tiles Number of tiles in second dimension.
+ */
+template <typename Scheduler, typename Tiles>
+void symmetric_matrix_matrix_diagonal_tiled(
+    Scheduler &sched,
+    Tiles &ft_tiles,
+    Tiles &ft_vector,
+    std::size_t N,
+    std::size_t M,
+    std::size_t n_tiles,
+    std::size_t m_tiles)
+{
+    for (std::size_t i = 0; i < m_tiles; ++i)
+    {
+        for (std::size_t n = 0; n < n_tiles; ++n)
+        {
+            // Compute inner product to obtain diagonal elements of
+            // V^T * V <=> cross(K) * K^-1 * cross(K)^T
+            ft_vector[i] = detail::named_dataflow(
+                sched,
+                schedule::k_rank_dot_diag_syrk(sched, m_tiles, i),
+                "posterior_tiled",
+                ft_tiles[n * m_tiles + i],
+                ft_vector[i],
+                N,
+                M);
+        }
+    }
+}
+
+/**
+ * @brief Perform tiled symmetric k-rank update (ft_tiles^T * ft_tiles)
+ *
+ * @param ft_tiles Tiled matrix represented as a vector of futurized tiles.
+ * @param ft_result Tiled matrix holding the result of the computation.
+ * @param N Tile size of first dimension.
+ * @param M Tile size of second dimension.
+ * @param n_tiles Number of tiles in first dimension.
+ * @param m_tiles Number of tiles in second dimension. + */ +template +void symmetric_matrix_matrix_tiled( + Scheduler &sched, + Tiles &ft_tiles, + Tiles &ft_result, + std::size_t N, + std::size_t M, + std::size_t n_tiles, + std::size_t m_tiles) +{ + for (std::size_t c = 0; c < m_tiles; c++) + { + for (std::size_t k = 0; k < m_tiles; k++) + { + for (std::size_t m = 0; m < n_tiles; m++) + { + // (SYRK for (c == k) possible) + // GEMM: C = C - A^T * B + ft_result[c * m_tiles + k] = detail::named_dataflow( + sched, + schedule::k_rank_gemm(sched, m_tiles, c, k, m), + "triangular_solve_tiled_matrix", + ft_tiles[m * m_tiles + c], + ft_tiles[m * m_tiles + k], + ft_result[c * m_tiles + k], + N, + M, + M, + Blas_trans, + Blas_no_trans); + } + } + } +} + +/** + * @brief Compute the difference between two tiled vectors + * @param ft_minuend Tiled vector that is being subtracted from. + * @param ft_subtrahend Tiled vector that is being subtracted. + * @param M Tile size dimension. + * @param m_tiles Number of tiles. + */ +template +void vector_difference_tiled( + Scheduler &sched, Tiles &ft_minuend, Tiles &ft_subtrahend, std::size_t M, std::size_t m_tiles) +{ + for (std::size_t i = 0; i < m_tiles; i++) + { + ft_subtrahend[i] = detail::named_dataflow( + sched, schedule::vector_axpy(sched, m_tiles, i), "uncertainty_tiled", ft_minuend[i], ft_subtrahend[i], M); + } +} + +/** + * @brief Extract the tiled diagonals of a tiled matrix + * @param ft_tiles Tiled matrix represented as a vector of futurized tiles. + * @param ft_vector Tiled vector containing the diagonals of the matrix tiles + * @param M Tile size per dimension. + * @param m_tiles Number of tiles per dimension. + */ +template +void matrix_diagonal_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_vector, std::size_t M, std::size_t m_tiles) +{ + for (std::size_t i = 0; i < m_tiles; i++) + { + ft_vector[i] = detail::named_dataflow( + sched, schedule::get_diagonal(sched, m_tiles, i), "uncertainty_tiled", ft_tiles[i * m_tiles + i], M); + } +} + +/** + * @brief Compute the negative log likelihood loss with a tiled covariance matrix K. + * + * Computes l = 0.5 * ( log(det(K)) + y^T * K^-1 * y) + const.) + * + * @param ft_tiles Tiled Cholesky factor matrix represented as a vector of futurized tiles. + * @param ft_alpha Tiled vector containing the solution of K^-1 * y + * @param ft_y Tiled vector containing the training output y + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. + * @return The loss value to be computed + */ +template +hpx::future +compute_loss_tiled(Scheduler &sched, Tiles &ft_tiles, Tiles &ft_alpha, Tiles &ft_y, std::size_t N, std::size_t n_tiles) +{ + std::vector> loss_tiled; + loss_tiled.reserve(n_tiles); + for (std::size_t k = 0; k < n_tiles; k++) + { + loss_tiled.push_back(detail::named_dataflow( + sched, + schedule::compute_loss(sched, n_tiles, k), + "loss_tiled", + ft_tiles[k * n_tiles + k], + ft_alpha[k], + ft_y[k], + N)); + } + return detail::named_dataflow("loss_tiled", loss_tiled, N, n_tiles); +} + +/** + * @brief Updates a hyperparameter of the SEK kernel using Adam + * + * @param ft_invK Tiled inverse of the covariance matrix K represented as a vector of futurized tiles. + * @param ft_gradK_param Tiled covariance matrix gradient w.r.t. a hyperparameter. + * @param ft_alpha Tiled vector containing the precomputed inv(K) * y where y is the training output. 
+ * @param adam_params Hyperparameter of the Adam optimizer + * @param sek_params Hyperparameters of the SEK kernel + * @param N Tile size per dimension. + * @param n_tiles Number of tiles per dimension. + * @param iter Current iteration. + * @param param_idx Index of the hyperparameter to optimize. + */ +template +void update_hyperparameter_tiled_lengthscale( + Scheduler &sched, + const Tiles &ft_invK, + const Tiles &ft_gradK_param, + const Tiles &ft_alpha, + const AdamParams &adam_params, + Tiles &diag_tiles, // Diagonal tiles + Tiles &inter_alpha, // Intermediate result + SEKParams &sek_params, + std::size_t N, + std::size_t n_tiles, + std::size_t iter, + std::size_t param_idx) +{ + /* + * PART 1: + * Compute gradient = 0.5 * ( trace(inv(K) * grad(K)_param) + y^T * inv(K) * grad(K)_param * inv(K) * y ) + * + * 1: Compute trace(inv(K) * grad(K)_param) + * 2: Compute y^T * inv(K) * grad(K)_param * inv(K) * y + * + * Update parameter: + * 3: Update moments + * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T + * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 + * 4: Adam step: + * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) + * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) + */ + hpx::shared_future trace = hpx::make_ready_future(0.0); + hpx::shared_future dot = hpx::make_ready_future(0.0); + bool jitter = false; + double factor = 1.0; + + // Reset our helper tiles + for (std::size_t d = 0; d < n_tiles; d++) + { + diag_tiles[d] = detail::named_make_tile( + sched, schedule::diag_tile(sched, n_tiles, d), "assemble", diag_tiles[d], N); + inter_alpha[d] = detail::named_make_tile( + sched, schedule::inter_alpha_tile(sched, n_tiles, d), "assemble", inter_alpha[d], N); + } + + //////////////////////////////////// + // PART 1: Compute gradient + // Step 1: Compute trace(inv(K)*grad_K_param) + // Compute diagonal tiles of inv(K) * grad(K)_param + for (std::size_t i = 0; i < n_tiles; ++i) + { + for (std::size_t j = 0; j < n_tiles; ++j) + { + diag_tiles[i] = detail::named_dataflow( + sched, + schedule::diag_tile(sched, n_tiles, i), + "trace", + ft_invK[i * n_tiles + j], + ft_gradK_param[j * n_tiles + i], + diag_tiles[i], + N, + N); + } + } + // Compute the trace of the diagonal tiles + for (std::size_t j = 0; j < n_tiles; ++j) + { + trace = detail::named_dataflow( + sched, schedule::diag_tile(sched, n_tiles, j), "trace", diag_tiles[j], trace); + } + // Not sure if can be done this way + // Step 2: Compute alpha^T * grad(K)_param * alpha (with alpha = inv(K) * y) + // Compute inter_alpha = grad(K)_param * alpha + for (std::size_t k = 0; k < n_tiles; k++) + { + for (std::size_t m = 0; m < n_tiles; m++) + { + inter_alpha[k] = detail::named_dataflow( + sched, + schedule::inter_alpha_tile(sched, n_tiles, k), + "gemv", + ft_gradK_param[k * n_tiles + m], + ft_alpha[m], + inter_alpha[k], + N, + N, + Blas_add, + Blas_no_trans); + } + } + // Compute alpha^T * inter_alpha + for (std::size_t j = 0; j < n_tiles; ++j) + { + dot = detail::named_dataflow( + sched, schedule::inter_alpha_tile(sched, n_tiles, j), "grad_right_tiled", inter_alpha[j], ft_alpha[j], dot); + } + + impl::update_parameters( + adam_params, sek_params, N, n_tiles, iter, param_idx, trace.get(), dot.get(), jitter, factor); +} + +template +void update_hyperparameter_tiled_noise_variance( + Scheduler &sched, + const Tiles &ft_invK, + const Tiles &ft_alpha, + const AdamParams &adam_params, + SEKParams &sek_params, + std::size_t N, + std::size_t n_tiles, + std::size_t iter, + std::size_t param_idx) +{ + /* + * PART 1: + * Compute gradient = 0.5 * 
( trace(inv(K) * grad(K)_param) + y^T * inv(K) * grad(K)_param * inv(K) * y ) + * + * 1: Compute trace(inv(K) * grad(K)_param) + * 2: Compute y^T * inv(K) * grad(K)_param * inv(K) * y + * + * Update parameter: + * 3: Update moments + * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T + * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 + * 4: Adam step: + * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) + * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) + */ + hpx::shared_future trace = hpx::make_ready_future(0.0); + hpx::shared_future dot = hpx::make_ready_future(0.0); + bool jitter = true; + double factor = 1.0; + + //////////////////////////////////// + // PART 1: Compute gradient + // Step 1: Compute the trace of inv(K) * noise_variance + for (std::size_t j = 0; j < n_tiles; ++j) + { + trace = detail::named_dataflow( + sched, schedule::K_inv_tile(sched, n_tiles, j, j), "grad_left_tiled", ft_invK[j * n_tiles + j], trace, N); + } + //////////////////////////////////// + // Step 2: Compute the alpha^T * alpha * noise_variance + for (std::size_t j = 0; j < n_tiles; ++j) + { + dot = detail::named_dataflow( + sched, schedule::alpha_tile(sched, n_tiles, j), "grad_right_tiled", ft_alpha[j], ft_alpha[j], dot); + } + + factor = compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); + + impl::update_parameters( + adam_params, sek_params, N, n_tiles, iter, param_idx, trace.get(), dot.get(), jitter, factor); +} + +} // end of namespace cpu + +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/detail/async_helpers.hpp b/core/include/gprat/detail/async_helpers.hpp new file mode 100644 index 00000000..05a24a91 --- /dev/null +++ b/core/include/gprat/detail/async_helpers.hpp @@ -0,0 +1,74 @@ +#ifndef GPRAT_DETAIL_DATAFLOW_HELPERS_HPP +#define GPRAT_DETAIL_DATAFLOW_HELPERS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include +#include +#include +#include + +GPRAT_NS_BEGIN + +/// @brief Empty type representing local scheduling (always on this locality) +struct basic_local_scheduler +{ }; + +namespace detail +{ + +// Functions prefixed with named_* allow the user to specify a custom name for this entry in the +// execution graph. Much like wrapping your function with hpx::annotated_function would. + +// ============================================================= +// non-scheduler aware + +template +decltype(auto) named_dataflow(const char *name, Args &&...args) +{ + return hpx::dataflow(hpx::annotated_function(hpx::unwrapping(F), name), std::forward(args)...); +} + +template +decltype(auto) named_async(const char *name, Args &&...args) +{ + return hpx::async(hpx::annotated_function(F, name), std::forward(args)...); +} + +// ============================================================= +// local shared-memory scheduling +// (no-op, same as above) + +template +decltype(auto) named_make_tile(const basic_local_scheduler & /*sched*/, + std::size_t /*on*/, + const char *name, + TileReference & /*target*/, + Args &&...args) +{ + // This method basically ignores the reference to the target tile as the non-action factories don't need it. 
+ // (They always create the tile_data locally and return that - only the HPX action wrappers need a reference) + return hpx::dataflow(hpx::annotated_function(hpx::unwrapping(F), name), std::forward(args)...); +} + +template +decltype(auto) +named_dataflow(const basic_local_scheduler & /*sched*/, std::size_t /*on*/, const char *name, Args &&...args) +{ + return hpx::dataflow(hpx::annotated_function(hpx::unwrapping(F), name), std::forward(args)...); +} + +template +decltype(auto) +named_async(const basic_local_scheduler & /*sched*/, std::size_t /*on*/, const char *name, Args &&...args) +{ + return hpx::async(hpx::annotated_function(F, name), std::forward(args)...); +} + +} // namespace detail + +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/detail/config.hpp b/core/include/gprat/detail/config.hpp new file mode 100644 index 00000000..e47a2de7 --- /dev/null +++ b/core/include/gprat/detail/config.hpp @@ -0,0 +1,26 @@ +#ifndef GPRAT_DETAIL_CONFIG_HPP +#define GPRAT_DETAIL_CONFIG_HPP + +#pragma once + +// clang-format off +#define GPRAT_NS gprat::v1 +#define GPRAT_NS_BEGIN namespace gprat { inline namespace v1 { +#define GPRAT_NS_END } } +// clang-format on + +#if defined(_MSC_VER) || defined(__BORLANDC__) || defined(__CODEGEARC__) +#if defined(GPRAT_DYN_LINK) +#if defined(GPRAT_SOURCE) +#define GPRAT_DECL __declspec(dllexport) +#else +#define GPRAT_DECL __declspec(dllimport) +#endif +#endif +#endif + +#if !defined(GPRAT_DECL) +#define GPRAT_DECL +#endif + +#endif diff --git a/core/include/gprat_c.hpp b/core/include/gprat/gprat.hpp similarity index 82% rename from core/include/gprat_c.hpp rename to core/include/gprat/gprat.hpp index 6781d286..88c6972f 100644 --- a/core/include/gprat_c.hpp +++ b/core/include/gprat/gprat.hpp @@ -1,16 +1,19 @@ -#ifndef GPRAT_C_H -#define GPRAT_C_H +#ifndef GPRAT_C_HPP +#define GPRAT_C_HPP -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" + +#include "tile_data.hpp" #include #include #include -// namespace for GPRat library entities -namespace gprat -{ +GPRAT_NS_BEGIN /** * @brief Data structure for Gaussian Process data @@ -24,10 +27,10 @@ struct GP_data std::string file_path; /** @brief Number of samples in the data */ - int n_samples; + std::size_t n_samples; /** @brief Number of GP regressors */ - int n_regressors; + std::size_t n_regressors; /** @brief Vector containing the data */ std::vector data; @@ -38,10 +41,10 @@ struct GP_data * * The file specified by `f_path` must contain `n` samples. 
 *
- * @param f_path Path to the file
+ * @param file_path Path to the file
  * @param n Number of samples
  */
-    GP_data(const std::string &file_path, int n, int n_reg);
+    GP_data(const std::string &file_path, std::size_t n, std::size_t n_reg);
 };
 
 /**
@@ -61,10 +64,10 @@ class GP
     std::vector<double> training_output_;
 
     /** @brief Number of tiles */
-    int n_tiles_;
+    std::size_t n_tiles_;
 
     /** @brief Size of each tile in each dimension */
-    int n_tile_size_;
+    std::size_t n_tile_size_;
 
     /**
      * @brief List of bools indicating trainable parameters: lengthscale,
@@ -79,12 +82,12 @@ class GP
   public:
     /** @brief Number of regressors */
-    int n_reg;
+    std::size_t n_reg;
 
     /**
      * @brief Hyperparameters of the squared exponential kernel
      */
-    gprat_hyper::SEKParams kernel_params;
+    SEKParams kernel_params;
 
     /**
      * @brief Constructs a Gaussian Process (GP)
@@ -102,10 +105,10 @@ class GP
      */
     GP(std::vector<double> input,
        std::vector<double> output,
-       int n_tiles,
-       int n_tile_size,
-       int n_regressors,
-       std::vector<double> kernel_hyperparams,
+       std::size_t n_tiles,
+       std::size_t n_tile_size,
+       std::size_t n_regressors,
+       const std::vector<double> &kernel_hyperparams,
        std::vector<bool> trainable_bool,
        std::shared_ptr target);
 
@@ -124,10 +127,10 @@ class GP
      */
     GP(std::vector<double> input,
        std::vector<double> output,
-       int n_tiles,
-       int n_tile_size,
-       int n_regressors,
-       std::vector<double> kernel_hyperparams,
+       std::size_t n_tiles,
+       std::size_t n_tile_size,
+       std::size_t n_regressors,
+       const std::vector<double> &kernel_hyperparams,
        std::vector<bool> trainable_bool);
 
     /**
@@ -147,10 +150,10 @@ class GP
      */
     GP(std::vector<double> input,
        std::vector<double> output,
-       int n_tiles,
-       int n_tile_size,
-       int n_regressors,
-       std::vector<double> kernel_hyperparams,
+       std::size_t n_tiles,
+       std::size_t n_tile_size,
+       std::size_t n_regressors,
+       const std::vector<double> &kernel_hyperparams,
        std::vector<bool> trainable_bool,
        int gpu_id,
        int n_streams);
 
@@ -173,14 +176,14 @@ class GP
     /**
      * @brief Predict output for test input
      */
-    std::vector<double> predict(const std::vector<double> &test_data, int m_tiles, int m_tile_size);
+    std::vector<double> predict(const std::vector<double> &test_data, std::size_t m_tiles, std::size_t m_tile_size);
 
     /**
      * @brief Predict output for test input and additionally provide
      * uncertainty for the predictions.
 */
     std::vector<std::vector<double>>
-    predict_with_uncertainty(const std::vector<double> &test_data, int m_tiles, int m_tile_size);
+    predict_with_uncertainty(const std::vector<double> &test_data, std::size_t m_tiles, std::size_t m_tile_size);
 
     /**
      * @brief Predict output for test input and additionally compute full
@@ -193,7 +196,7 @@ class GP
      * @return Full covariance matrix
      */
     std::vector<std::vector<double>>
-    predict_with_full_cov(const std::vector<double> &test_data, int m_tiles, int m_tile_size);
+    predict_with_full_cov(const std::vector<double> &test_data, std::size_t m_tiles, std::size_t m_tile_size);
 
     /**
      * @brief Optimize hyperparameters
@@ -203,7 +206,7 @@ class GP
      *
      * @return losses
      */
-    std::vector<double> optimize(const gprat_hyper::AdamParams &adam_params);
+    std::vector<double> optimize(const AdamParams &adam_params);
 
     /**
      * @brief Perform a single optimization step
@@ -214,7 +217,7 @@ class GP
      *
      * @return loss
      */
-    double optimize_step(gprat_hyper::AdamParams &adam_params, int iter);
+    double optimize_step(AdamParams &adam_params, std::size_t iter);
 
     /**
      * @brief Calculate loss for given data and Gaussian process model
@@ -224,8 +227,9 @@ class GP
     /**
      * @brief Computes & returns cholesky decomposition
      */
-    std::vector> cholesky();
+    std::vector> cholesky();
 };
 
-} // namespace gprat
-#endif // end of GPRAT_C_H
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/gpu/adapter_cublas.cuh b/core/include/gprat/gpu/adapter_cublas.cuh
similarity index 97%
rename from core/include/gpu/adapter_cublas.cuh
rename to core/include/gprat/gpu/adapter_cublas.cuh
index 1a69cb58..05972b36 100644
--- a/core/include/gpu/adapter_cublas.cuh
+++ b/core/include/gprat/gpu/adapter_cublas.cuh
@@ -1,10 +1,18 @@
-#ifndef ADAPTER_CUBLAS_H
-#define ADAPTER_CUBLAS_H
+#ifndef GPRAT_GPU_ADAPTER_CUBLAS_HPP
+#define GPRAT_GPU_ADAPTER_CUBLAS_HPP
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+
+#include "gprat/target.hpp"
 
-#include
 #include
 #include
-#include
+
+#include
+
+GPRAT_NS_BEGIN
 
 // Constants, compatible with cuBLAS
 
@@ -262,4 +270,6 @@ inline cublasSideMode_t opposite(cublasSideMode_t side)
     return (side == CUBLAS_SIDE_LEFT) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
 }
 
-#endif // end of ADAPTER_CUBLAS_H
+GPRAT_NS_END
+
+#endif
diff --git a/core/include/gpu/cuda_kernels.cuh b/core/include/gprat/gpu/cuda_kernels.cuh
similarity index 71%
rename from core/include/gpu/cuda_kernels.cuh
rename to core/include/gprat/gpu/cuda_kernels.cuh
index 4daef473..69a48d8f 100644
--- a/core/include/gpu/cuda_kernels.cuh
+++ b/core/include/gprat/gpu/cuda_kernels.cuh
@@ -1,5 +1,11 @@
-#ifndef CUDA_KERNELS_H
-#define CUDA_KERNELS_H
+#ifndef GPRAT_CUDA_KERNELS_HPP
+#define GPRAT_CUDA_KERNELS_HPP
+
+#pragma once
+
+#include "gprat/detail/config.hpp"
+
+GPRAT_NS_BEGIN
 
 /**
  * @brief Kernel to transpose a matrix.
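Aside: for intuition, the index mapping implemented by the device kernel declared in the next hunk can be written on the host as follows. This is a hedged reference sketch only, assuming `original` is a row-major width x height... wait, height-rows-by-width-columns matrix; `transpose_reference` and the layout assumption are not part of the library.

// Host-side reference of the transpose index mapping (illustrative only).
// Assumes `original` has `height` rows and `width` columns, row-major;
// `transposed` then has `width` rows and `height` columns.
#include <cstddef>

void transpose_reference(double *transposed, const double *original,
                         std::size_t width, std::size_t height)
{
    for (std::size_t row = 0; row < height; ++row)
        for (std::size_t col = 0; col < width; ++col)
            transposed[col * height + row] = original[row * width + col];
}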
@@ -11,4 +17,6 @@ */ __global__ void transpose(double *transposed, double *original, std::size_t width, std::size_t height); -#endif // CUDA_KERNELS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/cuda_utils.cuh b/core/include/gprat/gpu/cuda_utils.cuh similarity index 89% rename from core/include/gpu/cuda_utils.cuh rename to core/include/gprat/gpu/cuda_utils.cuh index 0c51ea76..128c6e22 100644 --- a/core/include/gpu/cuda_utils.cuh +++ b/core/include/gprat/gpu/cuda_utils.cuh @@ -1,14 +1,20 @@ -#ifndef CUDA_UTILS_H -#define CUDA_UTILS_H +#ifndef GPRAT_CUDA_UTILS_HPP +#define GPRAT_CUDA_UTILS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/target.hpp" #include #include #include #include #include -#include #include +GPRAT_NS_BEGIN + #define BLOCK_SIZE 16 using hpx::cuda::experimental::check_cuda_error; @@ -25,7 +31,7 @@ using hpx::cuda::experimental::check_cuda_error; * * @return A pointer to the copied vector on the device */ -inline double *copy_to_device(const std::vector &h_vector, gprat::CUDA_GPU &gpu) +inline double *copy_to_device(const std::vector &h_vector, CUDA_GPU &gpu) { double *d_vector; check_cuda_error(cudaMalloc(&d_vector, h_vector.size() * sizeof(double))); @@ -66,4 +72,6 @@ inline void free(std::vector> &vector) } } -#endif // end of CUDA_UTILS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/gp_algorithms.cuh b/core/include/gprat/gpu/gp_algorithms.cuh similarity index 89% rename from core/include/gpu/gp_algorithms.cuh rename to core/include/gprat/gpu/gp_algorithms.cuh index 51cbc355..8da8a956 100644 --- a/core/include/gpu/gp_algorithms.cuh +++ b/core/include/gprat/gpu/gp_algorithms.cuh @@ -1,11 +1,18 @@ -#ifndef GPU_GP_ALGORITHMS_H -#define GPU_GP_ALGORITHMS_H +#ifndef GPRAT_GPU_GP_ALGORITHMS_HPP +#define GPRAT_GPU_GP_ALGORITHMS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -28,8 +35,8 @@ double *gen_tile_covariance(const double *d_input, const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Generate the diagonal of a diagonal tile in the prior covariance matrix @@ -51,8 +58,8 @@ double *gen_tile_prior_covariance( const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Generate a tile of the cross-covariance matrix @@ -77,8 +84,8 @@ double *gen_tile_cross_covariance( const std::size_t n_row_tile_size, const std::size_t n_column_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Transpose a tile of size n_row_tile_size x n_column_tile_size @@ -92,7 +99,7 @@ double *gen_tile_cross_covariance( hpx::shared_future gen_tile_transpose(std::size_t n_row_tile_size, std::size_t n_column_tile_size, const hpx::shared_future f_tile, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Generate a tile of the output data @@ -104,7 +111,7 @@ hpx::shared_future gen_tile_transpose(std::size_t n_row_tile_size, * @return A tile of the output data of size n_tile_size */ double * 
-gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, gprat::CUDA_GPU &gpu); +gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, CUDA_GPU &gpu); /** * @brief Compute the L2-error norm over all tiles and elements @@ -126,7 +133,7 @@ double compute_error_norm(const std::size_t n_tiles, * * @return A tile filled with zeros of size N */ -double *gen_tile_zeros(std::size_t n_tile_size, gprat::CUDA_GPU &gpu); +double *gen_tile_zeros(std::size_t n_tile_size, CUDA_GPU &gpu); /** * @brief Allocates the tiled covariance matrix on the device given the training @@ -144,8 +151,8 @@ std::vector> assemble_tiled_covariance_matrix( const std::size_t n_tiles, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates the tiled alpha vector on the device given the training @@ -159,7 +166,7 @@ std::vector> assemble_tiled_covariance_matrix( * @return A tiled alpha vector of size n_tiles x n_tile_size */ std::vector> assemble_alpha_tiles( - const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu); + const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu); /** * @brief Allocates the tiled cross covariance matrix on the device given the @@ -185,8 +192,8 @@ std::vector> assemble_cross_covariance_tiles( const std::size_t m_tile_size, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates a tiled vector on the device and initializes it with zeros. @@ -198,7 +205,7 @@ std::vector> assemble_cross_covariance_tiles( * @return A tiled vector of size n_tiles x n_tile_size with zeros */ std::vector> -assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, gprat::CUDA_GPU &gpu); +assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, CUDA_GPU &gpu); /** * @brief Allocates the tiled prior covariance matrix on the device given the @@ -218,8 +225,8 @@ std::vector> assemble_prior_K_tiles( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates the posterior covariance matrix. 
@@ -238,8 +245,8 @@ std::vector> assemble_prior_K_tiles_full( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu); + const SEKParams sek_params, + CUDA_GPU &gpu); /** * @brief Allocates the tiled transpose cross covariance matrix on the device @@ -261,7 +268,7 @@ std::vector> assemble_t_cross_covariance_tiles( const std::size_t m_tiles, const std::size_t n_tile_size, const std::size_t m_tile_size, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Allocates the output vector on the device given the training output @@ -272,7 +279,7 @@ std::vector> assemble_t_cross_covariance_tiles( * @param gpu GPU target for computations */ std::vector> assemble_y_tiles( - const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu); + const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu); /** * @brief Allocates the tiled covariance matrix on the device given the training @@ -286,7 +293,7 @@ std::vector> assemble_y_tiles( std::vector copy_tiled_vector_to_host_vector(std::vector> &d_tiles, std::size_t n_tile_size, std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Moves lower triangular tiles of the covariance matrix to the host. @@ -298,11 +305,11 @@ std::vector copy_tiled_vector_to_host_vector(std::vector> move_lower_tiled_matrix_to_host( +std::vector> move_lower_tiled_matrix_to_host( const std::vector> &d_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Frees the device memory of the lower triangular tiles of the covariance matrix. @@ -314,4 +321,6 @@ void free_lower_tiled_matrix(const std::vector> &d_ } // end of namespace gpu -#endif // end of GPU_GP_ALGORITHMS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/gp_functions.cuh b/core/include/gprat/gpu/gp_functions.cuh similarity index 87% rename from core/include/gpu/gp_functions.cuh rename to core/include/gprat/gpu/gp_functions.cuh index 6ea5bd0a..d8746d33 100644 --- a/core/include/gpu/gp_functions.cuh +++ b/core/include/gprat/gpu/gp_functions.cuh @@ -1,9 +1,15 @@ #ifndef GPU_GP_FUNCTIONS_H #define GPU_GP_FUNCTIONS_H -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" +#pragma once + +#include "gprat/detail/config.hpp" +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" + +GPRAT_NS_BEGIN namespace gpu { @@ -28,13 +34,13 @@ std::vector predict(const std::vector &training_input, const std::vector &training_output, const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute the predictions with uncertainties. @@ -56,13 +62,13 @@ std::vector> predict_with_uncertainty( const std::vector &training_input, const std::vector &training_output, const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute the predictions with full covariance matrix. 
@@ -84,13 +90,13 @@ std::vector> predict_with_full_cov( const std::vector &training_input, const std::vector &training_output, const std::vector &test_data, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute loss for given data and Gaussian process model @@ -107,11 +113,11 @@ std::vector> predict_with_full_cov( */ double compute_loss(const std::vector &training_input, const std::vector &training_output, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform optimization for a given number of iterations @@ -137,10 +143,10 @@ optimize(const std::vector &training_input, int n_tiles, int n_tile_size, int n_regressors, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, + const AdamParams &adam_params, + SEKParams &sek_params, std::vector trainable_params, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform a single optimization step @@ -166,11 +172,11 @@ double optimize_step(const std::vector &training_input, int n_tiles, int n_tile_size, int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, + AdamParams &adam_params, + SEKParams &sek_params, std::vector trainable_params, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform Cholesky decomposition (+ Assembly) @@ -186,14 +192,16 @@ double optimize_step(const std::vector &training_input, * * @return The tiled Cholesky factor */ -std::vector> +std::vector> cholesky(const std::vector &training_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); } // end of namespace gpu +GPRAT_NS_END + #endif diff --git a/core/include/gpu/gp_optimizer.cuh b/core/include/gprat/gpu/gp_optimizer.cuh similarity index 93% rename from core/include/gpu/gp_optimizer.cuh rename to core/include/gprat/gpu/gp_optimizer.cuh index d0c5dd3a..61495de0 100644 --- a/core/include/gpu/gp_optimizer.cuh +++ b/core/include/gprat/gpu/gp_optimizer.cuh @@ -1,12 +1,19 @@ -#ifndef GPU_GP_OPTIMIZER_H -#define GPU_GP_OPTIMIZER_H +#ifndef GPRAT_GPU_GP_OPTIMIZER_HPP +#define GPRAT_GPU_GP_OPTIMIZER_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include "gprat/hyperparameters.hpp" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" -#include "gp_hyperparameters.hpp" -#include "gp_kernels.hpp" -#include "target.hpp" #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -56,7 +63,7 @@ double compute_sigmoid(const double parameter); double compute_covariance_distance(std::size_t i_global, std::size_t j_global, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &i_input, const std::vector &j_input); @@ -77,7 +84,7 @@ std::vector gen_tile_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &input); /** @@ -96,7 +103,7 @@ std::vector gen_tile_covariance_with_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists); /** @@ -116,7 +123,7 @@ gen_tile_grad_v(std::size_t row, std::size_t col, std::size_t N, std::size_t
n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists); /** @@ -136,7 +143,7 @@ gen_tile_grad_l(std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists); /** @@ -159,7 +166,7 @@ std::vector gen_tile_grad_v_trans(std::size_t N, const std::vector -gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, gprat::CUDA_GPU &gpu); +gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, CUDA_GPU &gpu); /** * @brief Compute hyper-parameter beta_1 or beta_2 to power t. @@ -187,7 +194,7 @@ compute_loss(const hpx::shared_future &K_diag_tile, const hpx::shared_future &alpha_tile, const hpx::shared_future &y_tile, std::size_t N, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Add up negative-log likelihood loss for all tiles. @@ -260,8 +267,8 @@ double update_second_moment(const double &gradient, double v_T, const double &be */ hpx::shared_future update_param(const double unconstrained_hyperparam, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, double m_T, double v_T, const std::vector beta1_T, @@ -319,7 +326,7 @@ sum_gradright(const std::vector &inter_alpha, const std::vector */ double sum_noise_gradleft(const std::vector &ft_invK, double grad, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, std::size_t N, std::size_t n_tiles); @@ -334,8 +341,10 @@ double sum_noise_gradleft(const std::vector &ft_invK, * @return The sum of the noise gradient */ double -sum_noise_gradright(const std::vector &alpha, double grad, gprat_hyper::SEKParams sek_params, std::size_t N); +sum_noise_gradright(const std::vector &alpha, double grad, SEKParams sek_params, std::size_t N); } // end of namespace gpu -#endif // end of GPU_GP_OPTIMIZER_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/gp_uncertainty.cuh b/core/include/gprat/gpu/gp_uncertainty.cuh similarity index 71% rename from core/include/gpu/gp_uncertainty.cuh rename to core/include/gprat/gpu/gp_uncertainty.cuh index 8c2dce18..4a93eccb 100644 --- a/core/include/gpu/gp_uncertainty.cuh +++ b/core/include/gprat/gpu/gp_uncertainty.cuh @@ -1,7 +1,13 @@ -#ifndef GPU_GP_UNCERTAINTY_H -#define GPU_GP_UNCERTAINTY_H +#ifndef GPRAT_GPU_GP_UNCERTAINTY_HPP +#define GPRAT_GPU_GP_UNCERTAINTY_HPP -#include "target.hpp" +#pragma once + +#include "gprat/detail/config.hpp" + +#include "gprat/target.hpp" + +GPRAT_NS_BEGIN namespace gpu { @@ -16,7 +22,7 @@ namespace gpu * @return Diagonal elements of posterior covariance matrix */ hpx::shared_future diag_posterior( - const hpx::shared_future A, const hpx::shared_future B, std::size_t M, gprat::CUDA_GPU &gpu); + const hpx::shared_future A, const hpx::shared_future B, std::size_t M, CUDA_GPU &gpu); /** * @brief Retrieve diagonal elements of posterior covariance matrix. 
@@ -26,8 +32,10 @@ hpx::shared_future diag_posterior( * * @return Diagonal elements of posterior covariance matrix */ -hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, gprat::CUDA_GPU &gpu); +hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, CUDA_GPU &gpu); } // end of namespace gpu -#endif // end of GPU_GP_UNCERTAINTY_H +GPRAT_NS_END + +#endif diff --git a/core/include/gpu/tiled_algorithms.cuh b/core/include/gprat/gpu/tiled_algorithms.cuh similarity index 92% rename from core/include/gpu/tiled_algorithms.cuh rename to core/include/gprat/gpu/tiled_algorithms.cuh index 78c6f5cb..38875e1e 100644 --- a/core/include/gpu/tiled_algorithms.cuh +++ b/core/include/gprat/gpu/tiled_algorithms.cuh @@ -1,12 +1,19 @@ -#ifndef GPU_TILED_ALGORITHMS_H -#define GPU_TILED_ALGORITHMS_H +#ifndef GPRAT_GPU_TILED_ALGORITHMS_HPP +#define GPRAT_GPU_TILED_ALGORITHMS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include "gprat/hyperparameters.hpp" +#include "gprat/target.hpp" +#include "gprat/kernels.hpp" -#include "gp_hyperparameters.hpp" -#include "target.hpp" #include -#include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -26,7 +33,7 @@ namespace gpu void right_looking_cholesky_tiled(std::vector> &ft_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu, + CUDA_GPU &gpu, const cusolverDnHandle_t &cusolver); // Tiled Triangular Solve Algorithms @@ -44,7 +51,7 @@ void forward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled backward triangular matrix-vector solve. @@ -59,7 +66,7 @@ void backward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled forward triangular matrix-matrix solve. @@ -79,7 +86,7 @@ void forward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled backward triangular matrix-matrix solve. 
@@ -99,7 +106,7 @@ void backward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled matrix-vector multiplication @@ -120,7 +127,7 @@ void matrix_vector_tiled(std::vector> &ft_tiles, const std::size_t N_col, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Perform tiled symmetric k-rank update on diagonal tiles @@ -140,14 +147,14 @@ void symmetric_matrix_matrix_diagonal_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); void compute_gemm_of_invK_y(std::vector> &ft_invK, std::vector> &ft_y, std::vector> &ft_alpha, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Tiled Loss hpx::shared_future compute_loss_tiled( @@ -156,7 +163,7 @@ hpx::shared_future compute_loss_tiled( std::vector> &ft_y, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Tiled Diagonal of Posterior Covariance Matrix void symmetric_matrix_matrix_tiled( @@ -166,7 +173,7 @@ void symmetric_matrix_matrix_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Compute the difference between two tiled vectors @@ -183,14 +190,14 @@ void vector_difference_tiled(std::vector> &ft_prior std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Tiled Prediction Uncertainty void matrix_diagonal_tiled(std::vector> &ft_priorK, std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); // Compute I-y*y^T*inv(K) void update_grad_K_tiled_mkl(std::vector> &ft_tiles, @@ -198,7 +205,7 @@ void update_grad_K_tiled_mkl(std::vector> &ft_tiles const std::vector> &ft_v2, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Updates the lengthscale hyperparameter of the SEK kernel using Adam. 
@@ -223,8 +230,8 @@ double update_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -232,7 +239,7 @@ double update_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Updates the vertical lengthscale hyperparameter of the SEK kernel @@ -258,8 +265,8 @@ double update_vertical_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -267,7 +274,7 @@ double update_vertical_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); /** * @brief Updates a hyperparameter of the SEK kernel using Adam @@ -290,8 +297,8 @@ double update_vertical_lengthscale( double update_noise_variance( const std::vector> &ft_invK, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -299,8 +306,10 @@ double update_noise_variance( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu); + CUDA_GPU &gpu); } // end of namespace gpu -#endif // end of GPU_TILED_ALGORITHMS_H +GPRAT_NS_END + +#endif diff --git a/core/include/gp_hyperparameters.hpp b/core/include/gprat/hyperparameters.hpp similarity index 56% rename from core/include/gp_hyperparameters.hpp rename to core/include/gprat/hyperparameters.hpp index cd9cf5a8..dae073dc 100644 --- a/core/include/gp_hyperparameters.hpp +++ b/core/include/gprat/hyperparameters.hpp @@ -1,10 +1,14 @@ -#ifndef GP_HYPERPARAMETERS_H -#define GP_HYPERPARAMETERS_H +#ifndef GPRAT_GPHYPERPARAMETERS_HPP +#define GPRAT_GPHYPERPARAMETERS_HPP +#pragma once + +#include "gprat/detail/config.hpp" + +#include #include -namespace gprat_hyper -{ +GPRAT_NS_BEGIN /** * @brief Hyperparameters for the Adam optimizer @@ -34,7 +38,7 @@ struct AdamParams /** * @brief Number of optimization iterations */ - int opt_iter; + std::size_t opt_iter; /** * @brief Initialize hyperparameters @@ -44,10 +48,8 @@ struct AdamParams * @param b1 beta1 * @param b2 beta2 * @param eps epsilon * @param opt_i number of optimization iterations - * @param M_T_init initial values for first moment vector - * @param V_T_init initial values for second moment vector */ - AdamParams(double lr = 0.001, double b1 = 0.9, double b2 = 0.999, double eps = 1e-8, int opt_i = 0); + AdamParams(double lr = 0.001, double b1 = 0.9, double b2 = 0.999, double eps = 1e-8, std::size_t opt_i = 0); /** * @brief Returns a string representation of the hyperparameters @@ -55,6 +57,30 @@ struct AdamParams std::string repr() const; }; -} // namespace gprat_hyper +template +void save_construct_data(Archive &ar, const AdamParams *v, const unsigned int) +{ + ar << v->learning_rate; + ar << v->beta1; + ar << v->beta2; + ar << v->epsilon; + ar << v->opt_iter; +} + +template +void load_construct_data(Archive &ar, AdamParams *v, const unsigned int) +{ + double learning_rate, beta1, beta2, epsilon; + std::size_t
opt_iter; + ar >> learning_rate; + ar >> beta1; + ar >> beta2; + ar >> epsilon; + ar >> opt_iter; + + new (v) AdamParams(learning_rate, beta1, beta2, epsilon, opt_iter); +} + +GPRAT_NS_END -#endif // GP_HYPERPARAMETERS_H +#endif diff --git a/core/include/gp_kernels.hpp b/core/include/gprat/kernels.hpp similarity index 55% rename from core/include/gp_kernels.hpp rename to core/include/gprat/kernels.hpp index c1346f32..daa7798b 100644 --- a/core/include/gp_kernels.hpp +++ b/core/include/gprat/kernels.hpp @@ -1,12 +1,15 @@ -#ifndef GP_KERNELS_H -#define GP_KERNELS_H +#ifndef GPRAT_GPKERNELS_HPP +#define GPRAT_GPKERNELS_HPP -#include +#pragma once -// #include +#include "gprat/detail/config.hpp" -namespace gprat_hyper -{ +#include +#include +#include + +GPRAT_NS_BEGIN /** * @brief Squared Exponential Kernel Parameters @@ -41,12 +44,12 @@ struct SEKParams /** * @brief Construct a new SEKParams object * - * @param lengthscale Lengthscale: variance of training output - * @param vertical_lengthscale Vertical Lengthscale: standard deviation + * @param in_lengthscale Lengthscale: variance of training output + * @param in_vertical_lengthscale Vertical Lengthscale: standard deviation * of training input - * @param noise_variance Noise Variance: small value + * @param in_noise_variance Noise Variance: small value */ - SEKParams(double lengthscale_, double vertical_lengthscale_, double noise_variance_); + SEKParams(double in_lengthscale, double in_vertical_lengthscale, double in_noise_variance); /** * @brief Return the number of parameters @@ -77,6 +80,31 @@ struct SEKParams const double &get_param(std::size_t index) const; }; -} // namespace gprat_hyper +template +void save_construct_data(Archive &ar, const SEKParams *v, const unsigned int) +{ + ar << v->lengthscale; + ar << v->vertical_lengthscale; + ar << v->noise_variance; +} + +template +void load_construct_data(Archive &ar, SEKParams *v, const unsigned int) +{ + double lengthscale, vertical_lengthscale, noise_variance; + ar >> lengthscale; + ar >> vertical_lengthscale; + ar >> noise_variance; + + new (v) SEKParams(lengthscale, vertical_lengthscale, noise_variance); +} + +template +void serialize(Archive &ar, SEKParams &pt, const unsigned int) +{ + ar & pt.m_T & pt.w_T; +} + +GPRAT_NS_END -#endif // end of GP_KERNELS_H +#endif diff --git a/core/include/gprat/performance_counters.hpp b/core/include/gprat/performance_counters.hpp new file mode 100644 index 00000000..13054735 --- /dev/null +++ b/core/include/gprat/performance_counters.hpp @@ -0,0 +1,102 @@ +#ifndef GPRAT_PERFORMANCE_COUNTERS_HPP +#define GPRAT_PERFORMANCE_COUNTERS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include +#include +#include +#include +#include +#include +#include + +GPRAT_NS_BEGIN + +/// The following is a very simple way of defining per-function metrics by using the function itself as a template +/// parameter ensuring that each function receives exactly one instantiation. +template +struct function_performance_metrics +{ + /// Number of times the function was called + static std::atomic num_calls; + + /// Total wall-clock time elapsed inside the function + static std::atomic elapsed_ns; +}; + +template +/*static*/ std::atomic function_performance_metrics::num_calls(0); +template +/*static*/ std::atomic function_performance_metrics::elapsed_ns(0); + +/// @brief This RAII helper allows us to time a function's total wall-clock execution time with minimal code. 
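+/// Construction increments the function's call counter and starts an HPX high-resolution timer; destruction adds
+/// the elapsed nanoseconds to the function's running total. It is normally used through the GPRAT_TIME_FUNCTION
+/// macro defined below; a minimal sketch (`my_blas_op` is a hypothetical caller, not part of this patch):
+///
+///   double my_blas_op() { GPRAT_TIME_FUNCTION(&my_blas_op); /* ... timed work ... */ return 0.0; }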
+struct scoped_function_timer +{ + explicit scoped_function_timer(std::atomic &num_calls, std::atomic &in_total) : + total(in_total) + { + ++num_calls; + } + + ~scoped_function_timer() + { + const auto elapsed = timer.elapsed_nanoseconds(); + HPX_ASSERT(elapsed >= 0); + if (elapsed > 0) + { + total += static_cast(elapsed); + } + } + + std::atomic &total; + hpx::chrono::high_resolution_timer timer; +}; + +/// @brief Time the execution of the enclosing function from the current point to its end. +/// @param local_function The function key that we're collecting performance information for. Usually the enclosing +/// function. +#define GPRAT_TIME_FUNCTION(local_function) \ + scoped_function_timer _gprat_fn_timer(function_performance_metrics::num_calls, \ + function_performance_metrics::elapsed_ns) + +template +std::uint64_t get_and_reset_function_elapsed(bool reset) +{ + return hpx::util::get_and_reset_value(function_performance_metrics::elapsed_ns, reset); +} + +template +std::uint64_t get_and_reset_function_calls(bool reset) +{ + return hpx::util::get_and_reset_value(function_performance_metrics::num_calls, reset); +} + +void track_tile_data_allocation(std::size_t size); +void track_tile_data_deallocation(std::size_t size); + +void register_performance_counters(); + +void force_evict_memory(const void *start, std::size_t size); + +template +void force_evict_memory(std::span data) +{ + force_evict_memory(data.data(), data.size_bytes()); +} + +#ifdef GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS +/// @brief Force-evict a memory span from the cache for benchmarking purposes. +/// @param data The memory region to evict +#define GPRAT_BENCHMARK_FORCE_EVICT(data) force_evict_memory(data) +#else +/// @brief Force-evict a memory span from the cache for benchmarking purposes. +/// @param data The memory region to evict +#define GPRAT_BENCHMARK_FORCE_EVICT(data) (void) data +#endif + +GPRAT_NS_END + +#endif diff --git a/core/include/gprat/scheduler.hpp b/core/include/gprat/scheduler.hpp new file mode 100644 index 00000000..2da7ccd7 --- /dev/null +++ b/core/include/gprat/scheduler.hpp @@ -0,0 +1,183 @@ +#ifndef GPRAT_CPU_SCHEDULER_HPP +#define GPRAT_CPU_SCHEDULER_HPP + +#pragma once + +#include "gprat/detail/async_helpers.hpp" + +// TODO: move to separate header +#include "gprat/tile_data.hpp" + +#include +#include + +GPRAT_NS_BEGIN + +using tiled_scheduler_local = basic_local_scheduler; + +template +using tiled_dataset_local = std::vector>>; + +template +struct tile_dataset_type; + +template +struct tile_dataset_type +{ + using type = tiled_dataset_local; +}; + +template +tiled_dataset_local make_tiled_dataset(const tiled_scheduler_local &, std::size_t num_tiles, Mapper &&) +{ + return std::vector>>{ num_tiles }; +} + +/// @brief This namespace contains the operation placement functions for all schedulers. 
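+/// Each placement function maps one tiled operation, identified by the tile count and the operation's tile indices,
+/// to the index of the partition that should execute it. For the purely local scheduler below, every operation is
+/// placed at index 0; a distributed scheduler could instead derive the placement from the tile coordinates, e.g.
+/// (hypothetical sketch, `dist_scheduler` is illustrative and not part of this patch):
+///
+///   std::size_t covariance_tile(const dist_scheduler &s, std::size_t n_tiles, std::size_t row, std::size_t col)
+///   {
+///       return (row * n_tiles + col) % s.num_partitions();
+///   }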
+namespace schedule +{ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + +// ============================================================= +// local scheduler + +constexpr std::size_t +covariance_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +cross_covariance_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t alpha_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) { return 0; } + +constexpr std::size_t prediction_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) +{ + return 0; +} + +constexpr std::size_t +t_cross_covariance_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +prior_K_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +K_inv_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +K_grad_v_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t +K_grad_l_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t row, std::size_t col) +{ + return 0; +} + +constexpr std::size_t uncertainty_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) +{ + return 0; +} + +constexpr std::size_t inter_alpha_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) +{ + return 0; +} + +constexpr std::size_t diag_tile(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t i) { return 0; } + +constexpr std::size_t cholesky_potrf(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) +{ + return 0; +} + +constexpr std::size_t cholesky_syrk(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +cholesky_trsm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +cholesky_gemm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m, std::size_t n) +{ + return 0; +} + +constexpr std::size_t solve_trsv(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t solve_trsm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t solve_gemv(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +solve_matrix_trsm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t c, std::size_t k) +{ + return 0; +} + +constexpr std::size_t +solve_matrix_gemm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t c, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t +multiply_gemv(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t k_rank_dot_diag_syrk(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) +{ + return 0; +} + +constexpr std::size_t +k_rank_gemm(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t c, 
std::size_t k, std::size_t m) +{ + return 0; +} + +constexpr std::size_t vector_axpy(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t get_diagonal(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +constexpr std::size_t compute_loss(const tiled_scheduler_local &sched, std::size_t n_tiles, std::size_t k) { return 0; } + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace schedule + +GPRAT_NS_END + +#endif diff --git a/core/include/target.hpp b/core/include/gprat/target.hpp similarity index 97% rename from core/include/target.hpp rename to core/include/gprat/target.hpp index 8b66cb0b..13487114 100644 --- a/core/include/target.hpp +++ b/core/include/gprat/target.hpp @@ -1,5 +1,9 @@ -#ifndef TARGET_H -#define TARGET_H +#ifndef GPRAT_TARGET_H +#define GPRAT_TARGET_H + +#pragma once + +#include "gprat/detail/config.hpp" #include @@ -8,8 +12,7 @@ #include #endif -namespace gprat -{ +GPRAT_NS_BEGIN /** * @brief This class represents the target on which to perform the Gaussian @@ -203,6 +206,6 @@ void print_available_gpus(); */ int gpu_count(); -} // namespace gprat +GPRAT_NS_END -#endif // end of TARGET_H +#endif diff --git a/core/include/gprat/tile_data.hpp b/core/include/gprat/tile_data.hpp new file mode 100644 index 00000000..a2615ad8 --- /dev/null +++ b/core/include/gprat/tile_data.hpp @@ -0,0 +1,170 @@ +#ifndef GPRAT_TILE_DATA_HPP +#define GPRAT_TILE_DATA_HPP + +#pragma once + +#include "gprat/detail/config.hpp" + +#include +#include + +GPRAT_NS_BEGIN + +namespace detail +{ +void *allocate_tile_data(std::size_t num_bytes); +void deallocate_tile_data(void *p, std::size_t num_bytes); + +template +struct tile_data_allocator +{ + typedef T value_type; + + tile_data_allocator() = default; + + template + constexpr tile_data_allocator(const tile_data_allocator &) noexcept + { } + + [[nodiscard]] T *allocate(std::size_t n) + { + if (n > (std::numeric_limits::max)() / sizeof(T)) + { + throw std::bad_array_new_length(); + } + + if (auto p = static_cast(allocate_tile_data(n * sizeof(T)))) + { + return p; + } + + throw std::bad_alloc(); + } + + void deallocate(T *p, std::size_t n) noexcept { deallocate_tile_data(p, n * sizeof(T)); } +}; + +template +bool operator==(const tile_data_allocator &, const tile_data_allocator &) +{ + return true; +} + +template +bool operator!=(const tile_data_allocator &, const tile_data_allocator &) +{ + return false; +} +} // namespace detail + +/** + * @brief Non-mutable reference-counted dynamic array of a given type T. + * This class represents a simple reference-counted non-resizeable buffer with elements of type T. + * It can be serialized by HPX and thus be used as a parameter for HPX actions. + * This type is intended to be used for parameters and attributes that do not require mutable data (i.e., only read + * access) + * + * @tparam T Element type of the tile. Usually some numeric type like double or float. This class currently only + * requires T to be serializable by HPX. + */ +template +class const_tile_data +{ + protected: + typedef hpx::serialization::serialize_buffer> cpu_buffer_type; + + struct hold_reference + { + explicit hold_reference(const cpu_buffer_type &data) : + data_(data) + { } + + void operator()(const T *) const { } // no deletion necessary + + cpu_buffer_type data_; + }; + + public: + const_tile_data() = default; + + // Create a new (uninitialized) tile_data of the given size. 
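+    // The buffer is allocated but its contents are left indeterminate; callers are expected to fill the tile
+    // (for example through the mutable_tile_data subclass below) before reading from it.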
+ explicit const_tile_data(std::size_t size) : + cpu_data_(size) + { } + + // Create a tile_data which acts as a proxy to a part of the embedded array. + // The proxy is assumed to refer to either the left or the right boundary + // element. + const_tile_data(const const_tile_data &base, std::size_t offset, std::size_t size) : + cpu_data_(base.cpu_data_.data() + offset, + size, + cpu_buffer_type::reference, + hold_reference(base.cpu_data_)) // keep referenced tile_data alive + { } + + [[nodiscard]] const T *data() const noexcept { return cpu_data_.data(); } + + [[nodiscard]] std::size_t size() const noexcept { return cpu_data_.size(); } + + [[nodiscard]] const T *begin() const noexcept { return cpu_data_.data(); } + + [[nodiscard]] const T *end() const noexcept { return cpu_data_.data() + cpu_data_.size(); } + + [[nodiscard]] const T &operator[](std::size_t idx) const { return cpu_data_[idx]; } + + [[nodiscard]] std::span as_span() const noexcept { return { cpu_data_.data(), cpu_data_.size() }; } + + // ReSharper disable once CppNonExplicitConversionOperator + operator std::span() const noexcept // NOLINT(*-explicit-constructor) + { + return { cpu_data_.data(), cpu_data_.size() }; + } + + friend bool operator==(const const_tile_data &a, const const_tile_data &b) noexcept + { + return a.cpu_data_ == b.cpu_data_; + } + + protected: + friend class hpx::serialization::access; + + template + void serialize(Archive &ar, const unsigned int) + { + // clang-format off + ar & cpu_data_; + // clang-format on + } + + cpu_buffer_type cpu_data_; +}; + +/** + * A mutable version of const_tile_data. + * + * @tparam T Element type of the tile. See @ref const_tile_data + */ +template +class mutable_tile_data : public const_tile_data +{ + public: + using const_tile_data::const_tile_data; + + [[nodiscard]] T *data() const noexcept { return const_cast(this->cpu_data_.data()); } + + [[nodiscard]] T *begin() const noexcept { return const_cast(this->cpu_data_.data()); } + + [[nodiscard]] T *end() const noexcept { return const_cast(this->cpu_data_.data()) + this->cpu_data_.size(); } + + [[nodiscard]] T &operator[](std::size_t idx) const { return this->cpu_data_[idx]; } + + // ReSharper disable once CppNonExplicitConversionOperator + operator std::span() noexcept // NOLINT(*-explicit-constructor) + { + return { this->cpu_data_.data(), this->cpu_data_.size() }; + } +}; + +GPRAT_NS_END + +#endif diff --git a/core/include/utils_c.hpp b/core/include/gprat/utils.hpp similarity index 75% rename from core/include/utils_c.hpp rename to core/include/gprat/utils.hpp index 591bb7ee..86a4ddd2 100644 --- a/core/include/utils_c.hpp +++ b/core/include/gprat/utils.hpp @@ -1,5 +1,9 @@ -#ifndef UTILS_C_H -#define UTILS_C_H +#ifndef GPRAT_UTILS_HPP +#define GPRAT_UTILS_HPP + +#pragma once + +#include "gprat/detail/config.hpp" #include #include @@ -7,8 +11,8 @@ #include #include -namespace utils -{ +GPRAT_NS_BEGIN + /** * @brief Compute the number of tiles for training data, given the number of * samples and the size of each tile. @@ -16,16 +20,16 @@ namespace utils * @param n_samples Number of samples * @param n_tile_size Size of each tile */ -int compute_train_tiles(int n_samples, int n_tile_size); +std::size_t compute_train_tiles(std::size_t n_samples, std::size_t n_tile_size); /** * @brief Compute the number of tiles for training data, given the number of * samples and the size of each tile. 
* * @param n_samples Number of samples - * @param n_tile_size Size of each tile + * @param n_tiles Number of tiles */ -int compute_train_tile_size(int n_samples, int n_tiles); +std::size_t compute_train_tile_size(std::size_t n_samples, std::size_t n_tiles); /** * @brief Compute the number of test tiles and the size of a test tile. @@ -37,7 +41,8 @@ int compute_train_tile_size(int n_samples, int n_tiles); * @param n_tiles Number of tiles * @param n_tile_size Size of each tile */ -std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size); +std::pair +compute_test_tiles(std::size_t n_test, std::size_t n_tiles, std::size_t n_tile_size); /** * @brief Load data from file @@ -45,7 +50,7 @@ std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size) * @param file_path Path to the file * @param n_samples Number of samples to load */ -std::vector load_data(const std::string &file_path, int n_samples, int offset); +std::vector load_data(const std::string &file_path, std::size_t n_samples, std::size_t offset); /** * @brief Print a vector @@ -85,6 +90,6 @@ void stop_hpx_runtime(); */ bool compiled_with_cuda(); -} // namespace utils +GPRAT_NS_END #endif diff --git a/core/src/cpu/adapter_cblas_fp32.cpp b/core/src/cpu/adapter_cblas_fp32.cpp index d91a3867..4cfbea51 100644 --- a/core/src/cpu/adapter_cblas_fp32.cpp +++ b/core/src/cpu/adapter_cblas_fp32.cpp @@ -1,4 +1,10 @@ -#include "cpu/adapter_cblas_fp32.hpp" +#include "gprat/cpu/adapter_cblas_fp32.hpp" + +#include "gprat/performance_counters.hpp" + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +#include +#endif #ifdef GPRAT_ENABLE_MKL // MKL CBLAS and LAPACKE @@ -9,28 +15,32 @@ #include "lapacke.h" #endif +GPRAT_NS_BEGIN + // BLAS level 3 operations -vector_future potrf(vector_future f_A, const int N) +mutable_tile_data potrf(const mutable_tile_data &A, const int N) { - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&potrf); // POTRF: in-place Cholesky decomposition of A // use spotrf2 recursive version for better stability LAPACKE_spotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); // return factorized matrix L - return hpx::make_ready_future(A); + return A; } -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L) - +mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + const int N, + const int M, + const BLAS_TRANSPOSE transpose_L, + const BLAS_SIDE side_L) { - auto L = f_L.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&trsm); // TRSM constants const float alpha = 1.0; // TRSM: in-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular @@ -47,36 +57,37 @@ vector_future trsm(vector_future f_L, N, A.data(), M); - // return vector - return hpx::make_ready_future(A); + return A; } -vector_future syrk(vector_future f_A, vector_future f_B, const int N) +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, const int N) { - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_TIME_FUNCTION(&syrk); // SYRK constants const float alpha = -1.0; const float beta = 1.0; // SYRK:A = A - B * B^T cblas_ssyrk(CblasRowMajor, CblasLower, CblasNoTrans, N, N, alpha, B.data(), N, beta, A.data(), N); // return updated matrix A - return hpx::make_ready_future(A); + return A; }
-vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, const int N, const int M, const int K, const BLAS_TRANSPOSE transpose_A, const BLAS_TRANSPOSE transpose_B) { - auto C = f_C.get(); - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(C.as_span()); + GPRAT_TIME_FUNCTION(&gemm); // GEMM constants const float alpha = -1.0; const float beta = 1.0; @@ -97,15 +108,17 @@ gemm(vector_future f_A, C.data(), M); // return updated matrix C - return hpx::make_ready_future(C); + return C; } // BLAS level 2 operations -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L) +mutable_tile_data +trsv(const const_tile_data &L, const mutable_tile_data &a, const int N, const BLAS_TRANSPOSE transpose_L) { - auto L = f_L.get(); - auto a = f_a.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_TIME_FUNCTION(&trsv); // TRSV: In-place solve L(^T) * x = a where L lower triangular cblas_strsv(CblasRowMajor, CblasLower, @@ -117,20 +130,22 @@ vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS a.data(), 1); // return solution vector x - return hpx::make_ready_future(a); + return a; } -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A) +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + const int N, + const int M, + const BLAS_ALPHA alpha, + const BLAS_TRANSPOSE transpose_A) { - auto A = f_A.get(); - auto a = f_a.get(); - auto b = f_b.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(b.as_span()); + GPRAT_TIME_FUNCTION(&gemv); // GEMV constants // const float alpha = -1.0; const float beta = 1.0; @@ -149,47 +164,102 @@ vector_future gemv(vector_future f_A, b.data(), 1); // return updated vector b - return hpx::make_ready_future(b); + return b; } -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const int N, const int M) +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, const int N, const int M) { - auto A = f_A.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_syrk); + auto r_p = r.data(); + auto A_p = A.data(); // r = r + diag(A^T * A) for (std::size_t j = 0; j < static_cast(M); ++j) { // Extract the j-th column and compute the dot product with itself - r[j] += cblas_sdot(N, &A[j], M, &A[j], M); + r_p[j] += cblas_sdot(N, &A_p[j], M, &A_p[j], M); } - return hpx::make_ready_future(r); + return r; } -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M) +mutable_tile_data +dot_diag_gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &r, + const int N, + const int M) { - auto A = f_A.get(); - auto B = f_B.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_gemm); + auto r_p = r.data(); + auto A_p = A.data(); + auto B_p = 
B.data(); // r = r + diag(A * B) for (std::size_t i = 0; i < static_cast(N); ++i) { - r[i] += cblas_sdot(M, &A[i * static_cast(M)], 1, &B[i], N); + r_p[i] += cblas_sdot(M, &A_p[i * static_cast(M)], 1, &B_p[i], N); } - return hpx::make_ready_future(r); + return r; } // BLAS level 1 operations -vector_future axpy(vector_future f_y, vector_future f_x, const int N) +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, const int N) { - auto y = f_y.get(); - auto x = f_x.get(); + GPRAT_BENCHMARK_FORCE_EVICT(y.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(x.as_span()); + GPRAT_TIME_FUNCTION(&axpy); cblas_saxpy(N, -1.0, x.data(), 1, y.data(), 1); - return hpx::make_ready_future(y); + return y; } -float dot(std::vector a, std::vector b, const int N) +float dot(std::span a, std::span b, const int N) { + GPRAT_BENCHMARK_FORCE_EVICT(a); + GPRAT_BENCHMARK_FORCE_EVICT(b); + GPRAT_TIME_FUNCTION(&dot); // DOT: a * b return cblas_sdot(N, a.data(), 1, b.data(), 1); } + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +namespace detail +{ +void register_fp32_performance_counters() +{ + // XXX: you can do this with templates, but it's quite a bit more complicated +#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, fn_expr) \ + hpx::performance_counters::install_counter_type( \ + name "/time", \ + get_and_reset_function_elapsed, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing); \ + hpx::performance_counters::install_counter_type( \ + name "/calls", \ + get_and_reset_function_calls, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing) + + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/potrf32", &potrf); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsm32", &trsm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/syrk32", &syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemm32", &gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsv32", &trsv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemv32", &gemv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_syrk32", &dot_diag_syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_gemm32", &dot_diag_gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/axpy32", &axpy); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot32", &dot); + +#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR +} +} // namespace detail +#endif + +GPRAT_NS_END diff --git a/core/src/cpu/adapter_cblas_fp64.cpp b/core/src/cpu/adapter_cblas_fp64.cpp index 0c38b3c2..64c94c78 100644 --- a/core/src/cpu/adapter_cblas_fp64.cpp +++ b/core/src/cpu/adapter_cblas_fp64.cpp @@ -1,4 +1,10 @@ -#include "cpu/adapter_cblas_fp64.hpp" +#include "gprat/cpu/adapter_cblas_fp64.hpp" + +#include "gprat/performance_counters.hpp" + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +#include +#endif #ifdef GPRAT_ENABLE_MKL // MKL CBLAS and LAPACKE @@ -9,28 +15,32 @@ #include "lapacke.h" #endif +GPRAT_NS_BEGIN + // BLAS level 3 operations -vector_future potrf(vector_future f_A, const int N) +mutable_tile_data potrf(const mutable_tile_data &A, const int N) { - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&potrf); // POTRF: in-place Cholesky decomposition of A // use dpotrf2 recursive version for better stability LAPACKE_dpotrf2(LAPACK_ROW_MAJOR, 'L', N, A.data(), N); // return factorized matrix L - return hpx::make_ready_future(A); + return A; } -vector_future trsm(vector_future f_L, - vector_future f_A, - const int N, - const int M, - const BLAS_TRANSPOSE transpose_L, - const BLAS_SIDE side_L) - 
+mutable_tile_data +trsm(const const_tile_data &L, + const mutable_tile_data &A, + const int N, + const int M, + const BLAS_TRANSPOSE transpose_L, + const BLAS_SIDE side_L) { - auto L = f_L.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_TIME_FUNCTION(&trsm); // TRSM constants const double alpha = 1.0; // TRSM: in-place solve L(^T) * X = A or X * L(^T) = A where L lower triangular @@ -48,35 +58,37 @@ vector_future trsm(vector_future f_L, A.data(), M); // return vector - return hpx::make_ready_future(A); + return A; } -vector_future syrk(vector_future f_A, vector_future f_B, const int N) +mutable_tile_data syrk(const mutable_tile_data &A, const const_tile_data &B, const int N) { - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_TIME_FUNCTION(&syrk); // SYRK constants const double alpha = -1.0; const double beta = 1.0; // SYRK:A = A - B * B^T cblas_dsyrk(CblasRowMajor, CblasLower, CblasNoTrans, N, N, alpha, B.data(), N, beta, A.data(), N); // return updated matrix A - return hpx::make_ready_future(A); + return A; } -vector_future -gemm(vector_future f_A, - vector_future f_B, - vector_future f_C, +mutable_tile_data +gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &C, const int N, const int M, const int K, const BLAS_TRANSPOSE transpose_A, const BLAS_TRANSPOSE transpose_B) { - auto C = f_C.get(); - auto B = f_B.get(); - auto A = f_A.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(C.as_span()); + GPRAT_TIME_FUNCTION(&gemm); // GEMM constants const double alpha = -1.0; const double beta = 1.0; @@ -97,15 +109,17 @@ gemm(vector_future f_A, C.data(), M); // return updated matrix C - return hpx::make_ready_future(C); + return C; } // BLAS level 2 operations -vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS_TRANSPOSE transpose_L) +mutable_tile_data trsv( + const const_tile_data &L, const mutable_tile_data &a, const int N, const BLAS_TRANSPOSE transpose_L) { - auto L = f_L.get(); - auto a = f_a.get(); + GPRAT_BENCHMARK_FORCE_EVICT(L.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_TIME_FUNCTION(&trsv); // TRSV: In-place solve L(^T) * x = a where L lower triangular cblas_dtrsv(CblasRowMajor, CblasLower, @@ -117,20 +131,22 @@ vector_future trsv(vector_future f_L, vector_future f_a, const int N, const BLAS a.data(), 1); // return solution vector x - return hpx::make_ready_future(a); + return a; } -vector_future gemv(vector_future f_A, - vector_future f_a, - vector_future f_b, - const int N, - const int M, - const BLAS_ALPHA alpha, - const BLAS_TRANSPOSE transpose_A) +mutable_tile_data +gemv(const const_tile_data &A, + const const_tile_data &a, + const mutable_tile_data &b, + const int N, + const int M, + const BLAS_ALPHA alpha, + const BLAS_TRANSPOSE transpose_A) { - auto A = f_A.get(); - auto a = f_a.get(); - auto b = f_b.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(a.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(b.as_span()); + GPRAT_TIME_FUNCTION(&gemv); // GEMV constants // const double alpha = -1.0; const double beta = 1.0; @@ -149,47 +165,102 @@ vector_future gemv(vector_future f_A, b.data(), 1); // return updated vector b - return hpx::make_ready_future(b); + return b; } -vector_future dot_diag_syrk(vector_future f_A, vector_future f_r, const 
int N, const int M) +mutable_tile_data +dot_diag_syrk(const const_tile_data &A, const mutable_tile_data &r, const int N, const int M) { - auto A = f_A.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_syrk); + auto r_p = r.data(); + auto A_p = A.data(); // r = r + diag(A^T * A) for (std::size_t j = 0; j < static_cast(M); ++j) { // Extract the j-th column and compute the dot product with itself - r[j] += cblas_ddot(N, &A[j], M, &A[j], M); + r_p[j] += cblas_ddot(N, &A_p[j], M, &A_p[j], M); } - return hpx::make_ready_future(r); + return r; } -vector_future dot_diag_gemm(vector_future f_A, vector_future f_B, vector_future f_r, const int N, const int M) +mutable_tile_data +dot_diag_gemm(const const_tile_data &A, + const const_tile_data &B, + const mutable_tile_data &r, + const int N, + const int M) { - auto A = f_A.get(); - auto B = f_B.get(); - auto r = f_r.get(); + GPRAT_BENCHMARK_FORCE_EVICT(A.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(B.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(r.as_span()); + GPRAT_TIME_FUNCTION(&dot_diag_gemm); + auto r_p = r.data(); + auto A_p = A.data(); + auto B_p = B.data(); // r = r + diag(A * B) for (std::size_t i = 0; i < static_cast(N); ++i) { - r[i] += cblas_ddot(M, &A[i * static_cast(M)], 1, &B[i], N); + r_p[i] += cblas_ddot(M, &A_p[i * static_cast(M)], 1, &B_p[i], N); } - return hpx::make_ready_future(r); + return r; } // BLAS level 1 operations -vector_future axpy(vector_future f_y, vector_future f_x, const int N) +mutable_tile_data axpy(const mutable_tile_data &y, const const_tile_data &x, const int N) { - auto y = f_y.get(); - auto x = f_x.get(); + GPRAT_BENCHMARK_FORCE_EVICT(y.as_span()); + GPRAT_BENCHMARK_FORCE_EVICT(x.as_span()); + GPRAT_TIME_FUNCTION(&axpy); cblas_daxpy(N, -1.0, x.data(), 1, y.data(), 1); - return hpx::make_ready_future(y); + return y; } -double dot(std::vector a, std::vector b, const int N) +double dot(std::span a, std::span b, const int N) { + GPRAT_BENCHMARK_FORCE_EVICT(a); + GPRAT_BENCHMARK_FORCE_EVICT(b); + GPRAT_TIME_FUNCTION(&dot); // DOT: a * b return cblas_ddot(N, a.data(), 1, b.data(), 1); } + +#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS +namespace detail +{ +void register_fp64_performance_counters() +{ + // XXX: you can do this with templates, but it's quite a bit more complicated +#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, fn_expr) \ + hpx::performance_counters::install_counter_type( \ + name "/time", \ + get_and_reset_function_elapsed, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing); \ + hpx::performance_counters::install_counter_type( \ + name "/calls", \ + get_and_reset_function_calls, \ + #fn_expr, \ + "", \ + hpx::performance_counters::counter_type::monotonically_increasing) + + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/potrf64", &potrf); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsm64", &trsm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/syrk64", &syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemm64", &gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/trsv64", &trsv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/gemv64", &gemv); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_syrk64", &dot_diag_syrk); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot_diag_gemm64", &dot_diag_gemm); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/axpy64", &axpy); + GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/dot64", &dot); + +#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR +} +} // 
namespace detail +#endif + +GPRAT_NS_END diff --git a/core/src/cpu/gp_algorithms.cpp b/core/src/cpu/gp_algorithms.cpp index 95eb2e2f..ab3ed77b 100644 --- a/core/src/cpu/gp_algorithms.cpp +++ b/core/src/cpu/gp_algorithms.cpp @@ -1,182 +1,182 @@ -#include "cpu/gp_algorithms.hpp" +#include "gprat/cpu/gp_algorithms.hpp" + +#include "gprat/performance_counters.hpp" +#include "gprat/tile_data.hpp" #include +GPRAT_NS_BEGIN + namespace cpu { // Tile generation -double compute_covariance_function(std::size_t i_global, - std::size_t j_global, - std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &i_input, - const std::vector &j_input) +double compute_covariance_function(std::size_t n_regressors, + const SEKParams &sek_params, + std::span i_input, + std::span j_input) { + GPRAT_TIME_FUNCTION(&compute_covariance_function); // k(z_i,z_j) = vertical_lengthscale * exp(-0.5 / lengthscale^2 * (z_i - z_j)^2) double distance = 0.0; - double z_ik_minus_z_jk; - for (std::size_t k = 0; k < n_regressors; k++) { - z_ik_minus_z_jk = i_input[i_global + k] - j_input[j_global + k]; + const double z_ik_minus_z_jk = i_input[k] - j_input[k]; distance += z_ik_minus_z_jk * z_ik_minus_z_jk; } + return sek_params.vertical_lengthscale * exp(-0.5 / (sek_params.lengthscale * sek_params.lengthscale) * distance); } -std::vector gen_tile_covariance( +mutable_tile_data gen_tile_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input) + const SEKParams &sek_params, + std::span input) { - std::size_t i_global, j_global; - double covariance_function; - // Preallocate required memory - std::vector tile; - tile.reserve(N * N); - // Compute entries + GPRAT_TIME_FUNCTION(&gen_tile_covariance); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; + // compute covariance function - covariance_function = - compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input); + auto covariance_function = compute_covariance_function( + n_regressors, sek_params, input.subspan(i_global, n_regressors), input.subspan(j_global, n_regressors)); if (i_global == j_global) { // noise variance on diagonal covariance_function += sek_params.noise_variance; } - tile.push_back(covariance_function); + + tile.data()[i * N + j] = covariance_function; } } return tile; } -std::vector gen_tile_full_prior_covariance( +mutable_tile_data gen_tile_full_prior_covariance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, - const std::vector &input) + const SEKParams &sek_params, + std::span input) { - std::size_t i_global, j_global; - // Preallocate required memory - std::vector tile; - tile.reserve(N * N); - // Compute entries + GPRAT_TIME_FUNCTION(&gen_tile_full_prior_covariance); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; // compute covariance function - tile.push_back(compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input)); + tile.data()[i * N + j] = compute_covariance_function( + n_regressors, sek_params, 
-std::vector<double> gen_tile_full_prior_covariance(
+mutable_tile_data gen_tile_full_prior_covariance(
     std::size_t row,
     std::size_t col,
     std::size_t N,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
-    const std::vector<double> &input)
+    const SEKParams &sek_params,
+    std::span<const double> input)
 {
-    std::size_t i_global, j_global;
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N * N);
-    // Compute entries
+    GPRAT_TIME_FUNCTION(&gen_tile_full_prior_covariance);
+    mutable_tile_data tile(N * N);
     for (std::size_t i = 0; i < N; i++)
     {
-        i_global = N * row + i;
+        const std::size_t i_global = N * row + i;
         for (std::size_t j = 0; j < N; j++)
         {
-            j_global = N * col + j;
+            const std::size_t j_global = N * col + j;
             // compute covariance function
-            tile.push_back(compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input));
+            tile.data()[i * N + j] = compute_covariance_function(
+                n_regressors, sek_params, input.subspan(i_global, n_regressors), input.subspan(j_global, n_regressors));
        }
     }
     return tile;
 }

-std::vector<double> gen_tile_prior_covariance(
+mutable_tile_data gen_tile_prior_covariance(
     std::size_t row,
     std::size_t col,
     std::size_t N,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
-    const std::vector<double> &input)
+    const SEKParams &sek_params,
+    std::span<const double> input)
 {
-    std::size_t i_global, j_global;
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N);
-    // Compute entries
+    GPRAT_TIME_FUNCTION(&gen_tile_prior_covariance);
+    mutable_tile_data tile(N);
     for (std::size_t i = 0; i < N; i++)
     {
-        i_global = N * row + i;
-        j_global = N * col + i;
+        const std::size_t i_global = N * row + i;
+        const std::size_t j_global = N * col + i;
         // compute covariance function
-        tile.push_back(compute_covariance_function(i_global, j_global, n_regressors, sek_params, input, input));
+        tile.data()[i] = compute_covariance_function(
+            n_regressors, sek_params, input.subspan(i_global, n_regressors), input.subspan(j_global, n_regressors));
     }
     return tile;
 }

-std::vector<double> gen_tile_cross_covariance(
+mutable_tile_data gen_tile_cross_covariance(
     std::size_t row,
     std::size_t col,
     std::size_t N_row,
     std::size_t N_col,
     std::size_t n_regressors,
-    const gprat_hyper::SEKParams &sek_params,
-    const std::vector<double> &row_input,
-    const std::vector<double> &col_input)
+    const SEKParams &sek_params,
+    std::span<const double> row_input,
+    std::span<const double> col_input)
 {
-    std::size_t i_global, j_global;
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N_row * N_col);
-    // Compute entries
+    GPRAT_TIME_FUNCTION(&gen_tile_cross_covariance);
+    mutable_tile_data tile(N_row * N_col);
     for (std::size_t i = 0; i < N_row; i++)
     {
-        i_global = N_row * row + i;
+        std::size_t i_global = N_row * row + i;
         for (std::size_t j = 0; j < N_col; j++)
         {
-            j_global = N_col * col + j;
+            std::size_t j_global = N_col * col + j;
             // compute covariance function
-            tile.push_back(
-                compute_covariance_function(i_global, j_global, n_regressors, sek_params, row_input, col_input));
+            tile.data()[i * N_col + j] = compute_covariance_function(
+                n_regressors,
+                sek_params,
+                row_input.subspan(i_global, n_regressors),
+                col_input.subspan(j_global, n_regressors));
        }
     }
     return tile;
 }

-std::vector<double> gen_tile_transpose(std::size_t N_row, std::size_t N_col, const std::vector<double> &tile)
+mutable_tile_data gen_tile_transpose(std::size_t N_row, std::size_t N_col, std::span<const double> tile)
 {
-    // Preallocate required memory
-    std::vector<double> transposed;
-    transposed.reserve(N_row * N_col);
+    GPRAT_TIME_FUNCTION(&gen_tile_transpose);
+    mutable_tile_data transposed(N_row * N_col);
     // Transpose entries
     for (std::size_t j = 0; j < N_col; j++)
     {
         for (std::size_t i = 0; i < N_row; ++i)
         {
             // Mapping (i, j) in the original tile to (j, i) in the transposed tile
-            transposed.push_back(tile[i * N_col + j]);
+            transposed.data()[j * N_row + i] = tile[i * N_col + j];
        }
     }
     return transposed;
 }

-std::vector<double> gen_tile_output(std::size_t row, std::size_t N, const std::vector<double> &output)
+mutable_tile_data gen_tile_output(std::size_t row, std::size_t N, std::span<const double> output)
 {
-    // Preallocate required memory
-    std::vector<double> tile;
-    tile.reserve(N);
-    // Copy entries
-    std::copy(output.begin() + static_cast<std::ptrdiff_t>(N * row),
-              output.begin() + static_cast<std::ptrdiff_t>(N * (row + 1)),
-              std::back_inserter(tile));
+    GPRAT_TIME_FUNCTION(&gen_tile_output);
+    mutable_tile_data tile(N);
+    std::copy(output.data() + (N * row), output.data() + (N * (row + 1)), tile.data());
     return tile;
 }

-std::vector<double> gen_tile_zeros(std::size_t N) { return std::vector<double>(N, 0.0); }
+mutable_tile_data gen_tile_zeros(std::size_t N)
+{
+    GPRAT_TIME_FUNCTION(&gen_tile_zeros);
+    mutable_tile_data tile(N);
+    std::fill_n(tile.data(), N, 0.0);
+    return tile;
+}

-std::vector<double> gen_tile_identity(std::size_t N)
+mutable_tile_data gen_tile_identity(std::size_t N)
 {
+    GPRAT_TIME_FUNCTION(&gen_tile_identity);
+    mutable_tile_data tile(N * N);
     // Initialize zero tile
-    std::vector<double> tile(N * N, 0.0);
+    std::fill_n(tile.data(), N * N, 0.0);
     // Fill diagonal with ones
     for (std::size_t i = 0; i < N; i++)
     {
-        tile[i * N + i] = 1.0;
+        tile.data()[i * N + i] = 1.0;
     }
     return tile;
 }
@@ -188,6 +188,7 @@ double compute_error_norm(std::size_t n_tiles,
                           const std::vector<double> &b,
                           const std::vector<std::vector<double>> &tiles)
 {
+    GPRAT_TIME_FUNCTION(&compute_error_norm);
     double error = 0.0;
     for (std::size_t k = 0; k < n_tiles; k++)
     {
@@ -203,3 +204,5 @@ double compute_error_norm(std::size_t n_tiles,
 }

 } // end of namespace cpu
+
+GPRAT_NS_END
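The `mutable_tile_data` / `const_tile_data` types returned by the tile generators above are defined in `gprat/tile_data.hpp`, which is not included in this diff. The sketch below is an assumption inferred purely from the call sites (construction from an element count, `data()`, `as_span()`, `size()`, and passing mutable tiles where `const const_tile_data &` is expected); the real definitions may well differ.

```cpp
#include <cstddef>
#include <memory>
#include <span>

// Sketch: a tile is a shared, fixed-size buffer of doubles. Shared ownership
// lets HPX tasks return tiles by value without copying the underlying storage.
class mutable_tile_data
{
  public:
    explicit mutable_tile_data(std::size_t n) : buf_(new double[n]), size_(n) { }

    double *data() const { return buf_.get(); }  // writable view of the shared buffer
    std::span<double> as_span() const { return { buf_.get(), size_ }; }
    std::size_t size() const { return size_; }

    std::shared_ptr<double[]> storage() const { return buf_; }  // hypothetical hook for const views

  private:
    std::shared_ptr<double[]> buf_;
    std::size_t size_;
};

// Sketch: read-only view that shares ownership, so functions can take
// `const const_tile_data &` for inputs while outputs stay mutable.
class const_tile_data
{
  public:
    const_tile_data(const mutable_tile_data &t) : buf_(t.storage()), size_(t.size()) { }

    const double *data() const { return buf_.get(); }
    std::span<const double> as_span() const { return { buf_.get(), size_ }; }
    std::size_t size() const { return size_; }

  private:
    std::shared_ptr<const double[]> buf_;  // shares ownership, read-only access
    std::size_t size_;
};
```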
diff --git a/core/src/cpu/gp_functions.cpp b/core/src/cpu/gp_functions.cpp
index 92caa275..097f4867 100644
--- a/core/src/cpu/gp_functions.cpp
+++ b/core/src/cpu/gp_functions.cpp
@@ -1,1161 +1,20 @@
-#include "cpu/gp_functions.hpp"
+#include "gprat/cpu/gp_functions.hpp"
+
+#include "gprat/cpu/gp_algorithms.hpp"
+#include "gprat/cpu/gp_optimizer.hpp"
+#include "gprat/cpu/tiled_algorithms.hpp"
+#include "gprat/detail/async_helpers.hpp"

-#include "cpu/gp_algorithms.hpp"
-#include "cpu/gp_optimizer.hpp"
-#include "cpu/tiled_algorithms.hpp"
 #include

-using Tiled_matrix = std::vector<hpx::shared_future<std::vector<double>>>;
-using Tiled_vector = std::vector<hpx::shared_future<std::vector<double>>>;
+GPRAT_NS_BEGIN

 namespace cpu
 {

 ///////////////////////////////////////////////////////////////////////////
 // PREDICT
-std::vector<std::vector<double>>
-cholesky(const std::vector<double> &training_input,
-         const gprat_hyper::SEKParams &sek_params,
-         int n_tiles,
-         int n_tile_size,
-         int n_regressors)
-{
-    std::vector<std::vector<double>> result;
-    // Tiled future data structures
-    Tiled_matrix K_tiles;  // Tiled covariance matrix
-
-    // Preallocate memory
-    result.resize(static_cast<std::size_t>(n_tiles * n_tiles));
-    K_tiles.resize(static_cast<std::size_t>(n_tiles * n_tiles));  // No reserve because of triangular structure
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Launch asynchronous assembly
-    for (std::size_t i = 0; i < static_cast<std::size_t>(n_tiles); i++)
-    {
-        for (std::size_t j = 0; j <= i; j++)
-        {
-            K_tiles[i * static_cast<std::size_t>(n_tiles) + j] = hpx::async(
-                hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"),
-                i,
-                j,
-                n_tile_size,
-                n_regressors,
-                sek_params,
-                training_input);
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Launch asynchronous Cholesky decomposition: K = L * L^T
-    right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast<std::size_t>(n_tiles));
-
-    ///////////////////////////////////////////////////////////////////////////
-    // Synchronize
-    for (std::size_t i = 0; i < static_cast<std::size_t>(n_tiles); i++)
-    {
-        for (std::size_t j = 0; j <= i; j++)
-        {
-            result[i * static_cast<std::size_t>(n_tiles) + j] =
-                K_tiles[i * static_cast<std::size_t>(n_tiles) + j].get();
-        }
-    }
-    return result;
-}
-
-std::vector<double>
-predict(const std::vector<double> &training_input,
-        const std::vector<double> &training_output,
-        const std::vector<double> &test_input,
-        const gprat_hyper::SEKParams &sek_params,
-        int n_tiles,
-        int n_tile_size,
-        int m_tiles,
-        int m_tile_size,
-        int n_regressors)
-{
-    /*
-     * Prediction: hat(y)_M = cross(K)_MxN * K^-1_NxN * y_N
-     * - Covariance matrix K_NxN
-     * - Cross-covariance cross(K)_MxN
-     * - Training output y_N
-     * - Prediction output hat(y)_M
-     *
-     * Algorithm:
- * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction hat(y): - * - triangular solve L * beta = y - * - triangular solve L^T * alpha = beta - * - compute hat(y) = cross(K) * alpha - */ - - std::vector prediction_result; - // Tiled future data structures - Tiled_matrix K_tiles; // Tiled covariance matrix - Tiled_matrix cross_covariance_tiles; // Tiled cross_covariance matrix - Tiled_vector prediction_tiles; // Tiled solution - Tiled_vector alpha_tiles; // Tiled intermediate solution - - // Preallocate memory - prediction_result.reserve(test_input.size()); - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - alpha_tiles.reserve(static_cast(n_tiles)); - cross_covariance_tiles.reserve(static_cast(m_tiles) * static_cast(n_tiles)); - prediction_tiles.reserve(static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - cross_covariance_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_cross_covariance, "assemble_pred"), - i, - j, - m_tile_size, - n_tile_size, - n_regressors, - sek_params, - test_input, - training_input)); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prediction_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous prediction computation solve: \hat{y} = K_cross_cov * alpha - matrix_vector_tiled( - cross_covariance_tiles, - alpha_tiles, - prediction_tiles, - m_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Synchronize prediction - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = prediction_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(prediction_result)); - } - return prediction_result; -} - -std::vector> predict_with_uncertainty( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors) -{ - /* - * Prediction: hat(y) = 
cross(K) * K^-1 * y - * Uncertainty: diag(Sigma) = diag(prior(K)) * diag(cross(K)^T * K^-1 * cross(K)) - * - Covariance matrix K_NxN - * - Cross-covariance cross(K)_MxN - * - Prior covariance prior(K)_MxM - * - Training ouput y_N - * - Prediction output hat(y)_M - * - Posterior covariance matrix Sigma_MxM - * - * Algorithm: - * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction hat(y): - * - triangular solve L * beta = y - * - triangular solve L^T * alpha = beta - * - compute hat(y) = cross(K) * alpha - * 4: Compute uncertainty diag(Sigma): - * - triangular solve L * V = cross(K)^T - * - compute diag(W) = diag(V^T * V) - * - compute diag(Sigma) = diag(prior(K)) - diag(W) - */ - - std::vector prediction_result; - std::vector uncertainty_result; - // Tiled future data structures for prediction - Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_matrix cross_covariance_tiles; // Tiled cross_covariance matrix K_NxM - Tiled_vector prediction_tiles; // Tiled solution - Tiled_vector alpha_tiles; // Tiled intermediate solution - // Tiled future data structures for uncertainty - Tiled_matrix t_cross_covariance_tiles; // Tiled transposed cross_covariance matrix K_MxN - Tiled_vector prior_K_tiles; // Tiled prior covariance matrix diagonal diag(K_MxM) - Tiled_vector uncertainty_tiles; // Tiled uncertainty solution - - // Preallocate memory - prediction_result.reserve(test_input.size()); - uncertainty_result.reserve(test_input.size()); - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - cross_covariance_tiles.reserve(static_cast(m_tiles) * static_cast(n_tiles)); - prediction_tiles.reserve(static_cast(m_tiles)); - alpha_tiles.reserve(static_cast(n_tiles)); - - t_cross_covariance_tiles.reserve(static_cast(n_tiles) * static_cast(m_tiles)); - prior_K_tiles.reserve(static_cast(m_tiles)); - uncertainty_tiles.reserve(static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - cross_covariance_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_cross_covariance, "assemble_pred"), - i, - j, - m_tile_size, - n_tile_size, - n_regressors, - sek_params, - test_input, - training_input)); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prediction_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prior_K_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_prior_covariance, "assemble_tiled"), - i, - i, - m_tile_size, - n_regressors, - sek_params, - test_input)); - } - - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - 
t_cross_covariance_tiles.push_back(hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_pred"), - m_tile_size, - n_tile_size, - cross_covariance_tiles[i * static_cast(n_tiles) + j])); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - uncertainty_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_prior_inter"), m_tile_size)); - } - - // Prediction - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous prediction computation solve: hat(y) = cross(K) * alpha - matrix_vector_tiled( - cross_covariance_tiles, - alpha_tiles, - prediction_tiles, - m_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * V = cross(K)^T - forward_solve_tiled_matrix( - K_tiles, - t_cross_covariance_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation diag(W) = diag(V^T * V) - symmetric_matrix_matrix_diagonal_tiled( - t_cross_covariance_tiles, - uncertainty_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation diag(Sigma) = diag(prior(K)) - diag(W) - vector_difference_tiled(prior_K_tiles, uncertainty_tiles, m_tile_size, static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Synchronize prediction - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = prediction_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(prediction_result)); - } - - // Synchronize uncertainty - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = uncertainty_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(uncertainty_result)); - } - - return std::vector>{ std::move(prediction_result), std::move(uncertainty_result) }; -} - -std::vector> predict_with_full_cov( - const std::vector &training_input, - const std::vector &training_output, - const std::vector &test_input, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int m_tiles, - int m_tile_size, - int n_regressors) -{ - /* - * Prediction: hat(y)_M = cross(K) * K^-1 * y - * Full covariance: Sigma = prior(K) - cross(K)^T * K^-1 * cross(K) - * - Covariance matrix K_NxN - * - Cross-covariance cross(K)_MxN - * - Prior covariance prior(K)_MxM - * - Training ouput y_N - * - Prediction output hat(y)_M - * - Posterior covariance matrix Sigma_MxM - * - * Algorithm: - * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction hat(y): - * - triangular solve L * beta = y - * - triangular solve L^T 
* alpha = beta - * - compute hat(y) = cross(K) * alpha - * 4: Compute full covariance matrix Sigma: - * - triangular solve L * V = cross(K)^T - * - compute W = V^T * V - * - compute Sigma = prior(K) - W - * 5: Compute diag(Sigma) - */ - - std::vector prediction_result; - std::vector uncertainty_result; - // Tiled future data structures for prediction - Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_matrix cross_covariance_tiles; // Tiled cross_covariance matrix K_NxM - Tiled_vector prediction_tiles; // Tiled solution - Tiled_vector alpha_tiles; // Tiled intermediate solution - // Tiled future data structures for uncertainty - Tiled_matrix t_cross_covariance_tiles; // Tiled transposed cross_covariance matrix K_MxN - Tiled_matrix prior_K_tiles; // Tiled prior covariance matrix K_MxM - Tiled_vector uncertainty_tiles; // Tiled uncertainty solution - - // Preallocate memory - prediction_result.reserve(test_input.size()); - uncertainty_result.reserve(test_input.size()); - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - cross_covariance_tiles.reserve(static_cast(m_tiles) * static_cast(n_tiles)); - prediction_tiles.reserve(static_cast(m_tiles)); - alpha_tiles.reserve(static_cast(n_tiles)); - - t_cross_covariance_tiles.reserve(static_cast(n_tiles) * static_cast(m_tiles)); - prior_K_tiles.resize(static_cast(m_tiles * m_tiles)); - uncertainty_tiles.reserve(static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - cross_covariance_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_cross_covariance, "assemble_pred"), - i, - j, - m_tile_size, - n_tile_size, - n_regressors, - sek_params, - test_input, - training_input)); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - prediction_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - // Assemble prior covariance matrix vector - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - prior_K_tiles[i * static_cast(m_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_full_prior_covariance, "assemble_prior_tiled"), - i, - j, - m_tile_size, - n_regressors, - sek_params, - test_input); - - if (i != j) - { - prior_K_tiles[j * static_cast(m_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_prior_tiled"), - m_tile_size, - m_tile_size, - prior_K_tiles[i * static_cast(m_tiles) + j]); - } - } - } - - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - t_cross_covariance_tiles.push_back(hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_pred"), - m_tile_size, - n_tile_size, - 
cross_covariance_tiles[i * static_cast(n_tiles) + j])); - } - } - - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - uncertainty_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), m_tile_size)); - } - - // Prediction - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous prediction computation solve: hat(y) = K_cross_cov * alpha - matrix_vector_tiled( - cross_covariance_tiles, - alpha_tiles, - prediction_tiles, - m_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * V = cross(K)^T - forward_solve_tiled_matrix( - K_tiles, - t_cross_covariance_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation of full covariance Sigma = prior(K) - V^T * V - symmetric_matrix_matrix_tiled( - t_cross_covariance_tiles, - prior_K_tiles, - n_tile_size, - m_tile_size, - static_cast(n_tiles), - static_cast(m_tiles)); - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous computation of uncertainty diag(Sigma) - matrix_diagonal_tiled(prior_K_tiles, uncertainty_tiles, m_tile_size, static_cast(m_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Synchronize prediction - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = prediction_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(prediction_result)); - } - - // Synchronize uncertainty - for (std::size_t i = 0; i < static_cast(m_tiles); i++) - { - auto tile = uncertainty_tiles[i].get(); - std::copy(tile.begin(), tile.end(), std::back_inserter(uncertainty_result)); - } - - return std::vector>{ std::move(prediction_result), std::move(uncertainty_result) }; -} - -/////////////////////////////////////////////////////////////////////////// -// OPTIMIZATION -double compute_loss(const std::vector &training_input, - const std::vector &training_output, - const gprat_hyper::SEKParams &sek_params, - int n_tiles, - int n_tile_size, - int n_regressors) -{ - /* - * Negative log likelihood loss: - * loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) - * - Covariance matrix K(theta)_NxN - * - Training ouput y_N - * - Hyperparameters theta ={ v, l, v_n } - * - * Algorithm: - * 1: Compute lower triangular part of covariance matrix K - * 2: Compute Cholesky factor L of K - * 3: Compute prediction alpha = K^-1 * y: - * - triangular solve L * beta = y - * - triangular solve L^T * alpha = beta - * 5: Compute beta = K^-1 * y - * 6: Compute negative log likelihood loss - * - Calculate sum_i^N log(L_ii^2) - * - Calculate y^T * beta - * - Add constant N * log (2 * pi) - */ - - hpx::shared_future loss_value; - // Tiled future data structures - 
Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_vector y_tiles; // Tiled output - Tiled_vector alpha_tiles; // Tiled intermediate solution - - // Preallocate memory - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - y_tiles.reserve(static_cast(n_tiles)); - alpha_tiles.reserve(static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - K_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_covariance, "assemble_tiled_K"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - } - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - y_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_output, "assemble_tiled_y"), i, n_tile_size, training_output)); - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles.push_back(hpx::async( - hpx::annotated_function(gen_tile_output, "assemble_tiled_alpha"), i, n_tile_size, training_output)); - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous triangular solve L * (L^T * alpha) = y - forward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - backward_solve_tiled(K_tiles, alpha_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous loss computation - compute_loss_tiled(K_tiles, alpha_tiles, y_tiles, loss_value, n_tile_size, static_cast(n_tiles)); - - return loss_value.get(); -} - -std::vector -optimize(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params) -{ - /* - * - Hyperparameters theta={v, l, v_n} - * - Covariance matrix K(theta) - * - Training ouput y - * - * Algorithm: - * for opt_iter: - * 1: Compute distance for entries of covariance matrix K - * 2: Compute lower triangular part of K with distance - * 3: Compute lower triangular gradients for delta(K)/delta(v), and delta(K)/delta(l) with distance - * - * 4: Compute Cholesky factor L of K - * 5: Compute K^-1: - * - triangular solve L * {} = I - * - triangular solve L^T * K^-1 = {} - * 6: Compute beta = K^-1 * y - * - * 7: Compute negative log likelihood loss - * - Calculate 0.5 sum_i^N log(L_ii^2) - * - Calculate 0.5 y^T * beta - * - Add constant N / 2 * log (2 * pi) - * - * 8: Compute delta(loss)/delta(param_i) - * - Compute trace(K^-1 * delta(K)/delta(theta_i)) - * - Compute beta^T * delta(K)/delta(theta_i) * beta - * 9: Update hyperparameters theta with Adam optimizer - * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T - * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 - * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) - * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) - * endfor - */ - - // data holder for loss - hpx::shared_future loss_value; - // data holder for computed loss values - std::vector losses; - - // Tiled future data structures - Tiled_matrix K_tiles; // Tiled covariance 
matrix K_NxN - Tiled_vector y_tiles; // Tiled output - Tiled_vector alpha_tiles; // Tiled intermediate solution - Tiled_matrix K_inv_tiles; // Tiled inversed covariance matrix K^-1_NxN - // Tiled future data structures for gradients - Tiled_matrix grad_v_tiles; // Tiled covariance with gradient v - Tiled_matrix grad_l_tiles; // Tiled covariance with gradient l - - // Preallocate memory - losses.reserve(static_cast(adam_params.opt_iter)); - y_tiles.reserve(static_cast(n_tiles)); - - alpha_tiles.resize(static_cast(n_tiles)); // for now resize since reset in loop - K_inv_tiles.resize(static_cast(n_tiles * n_tiles)); // for now resize since reset in loop - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_v_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_l_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of output y - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - y_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_output, "assemble_y"), i, n_tile_size, training_output)); - } - - ////////////////////////////////////////////////////////////////////////////// - // Perform optimization - for (std::size_t iter = 0; iter < static_cast(adam_params.opt_iter); iter++) - { - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of tiled covariance matrix, derivative of covariance matrix - // vector w.r.t. to vertical lengthscale and derivative of covariance - // matrix vector w.r.t. to lengthscale - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - // Compute the distance (z_i - z_j) of K entries to reuse - hpx::shared_future> cov_dists = hpx::async( - hpx::annotated_function(gen_tile_distance, "assemble_cov_dist"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - - K_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_covariance_with_distance), "assemble_K"), - i, - j, - n_tile_size, - sek_params, - cov_dists); - if (trainable_params[0]) - { - grad_l_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_l), "assemble_gradl"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_l_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradl_t"), - n_tile_size, - n_tile_size, - grad_l_tiles[i * static_cast(n_tiles) + j]); - } - } - - if (trainable_params[1]) - { - grad_v_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_v), "assemble_gradv"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_v_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradv_t"), - n_tile_size, - n_tile_size, - grad_v_tiles[i * static_cast(n_tiles) + j]); - } - } - } - } - - // Assembly with reallocation -> optimize to only set existing values - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles[i] = hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), n_tile_size); - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - 
{ - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - if (i == j) - { - K_inv_tiles[i * static_cast(n_tiles) + j] = - hpx::async(hpx::annotated_function(gen_tile_identity, "assemble_identity_matrix"), n_tile_size); - } - else - { - K_inv_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_zeros, "assemble_identity_matrix"), n_tile_size * n_tile_size); - } - } - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute K^-1 through L* (L^T * X) = I - forward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - backward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute beta = inv(K) * y - matrix_vector_tiled( - K_inv_tiles, - y_tiles, - alpha_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous loss computation where - // loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) - compute_loss_tiled(K_tiles, alpha_tiles, y_tiles, loss_value, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous update of the hyperparameters - if (trainable_params[0]) - { // lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_l_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - iter, - 0); - } - if (trainable_params[1]) - { // vertical_lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_v_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - iter, - 1); - } - if (trainable_params[2]) - { // noise_variance - update_hyperparameter_tiled( - K_inv_tiles, - Tiled_matrix{}, // no tiled gradient matrix required - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - iter, - 2); - } - // Synchronize after iteration - losses.push_back(loss_value.get()); - } - // Return losses - return losses; -} - -double optimize_step(const std::vector &training_input, - const std::vector &training_output, - int n_tiles, - int n_tile_size, - int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - std::vector trainable_params, - int iter) -{ - /* - * - Hyperparameters theta={v, l, v_n} - * - Covariance matrix K(theta) - * - Training ouput y - * - * Algorithm: - * 1: Compute distance for entries of covariance matrix K - * 2: Compute lower triangular part of K with distance - * 3: Compute lower triangular gradients for delta(K)/delta(v), and delta(K)/delta(l) with distance - * - * 4: Compute Cholesky factor L of K - * 5: Compute K^-1: - * - triangular solve L * {} = I - * - triangular solve L^T * K^-1 = {} - * 6: Compute beta = K^-1 * y - * - * 7: Compute negative log likelihood loss - * - Calculate 0.5 sum_i^N log(L_ii^2) - * - Calculate 0.5 y^T * beta - * - Add constant N / 2 * log (2 * pi) - * - * 8: Compute delta(loss)/delta(param_i) - * - Compute trace(K^-1 
* delta(K)/delta(theta_i)) - * - Compute beta^T * delta(K)/delta(theta_i) * beta - * 9: Update hyperparameters theta with Adam optimizer - * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T - * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 - * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) - * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) - */ - - // data holder for loss - hpx::shared_future loss_value; - - // Tiled future data structures - Tiled_matrix K_tiles; // Tiled covariance matrix K_NxN - Tiled_vector y_tiles; // Tiled output - Tiled_vector alpha_tiles; // Tiled intermediate solution - Tiled_matrix K_inv_tiles; // Tiled inversed covariance matrix K^-1_NxN - // Tiled future data structures for gradients - Tiled_matrix grad_v_tiles; // Tiled covariance with gradient v - Tiled_matrix grad_l_tiles; // Tiled covariance with gradient l - - // Preallocate memory - y_tiles.reserve(static_cast(n_tiles)); - - alpha_tiles.resize(static_cast(n_tiles)); // for now resize since reset in loop - K_inv_tiles.resize(static_cast(n_tiles * n_tiles)); // for now resize since reset in loop - - K_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_v_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - grad_l_tiles.resize(static_cast(n_tiles * n_tiles)); // No reserve because of triangular structure - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of output y - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - y_tiles.push_back( - hpx::async(hpx::annotated_function(gen_tile_output, "assemble_y"), i, n_tile_size, training_output)); - } - - ////////////////////////////////////////////////////////////////////////////// - // Perform one optimization step - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous assembly of tiled covariance matrix, derivative of covariance matrix - // vector w.r.t. to vertical lengthscale and derivative of covariance - // matrix vector w.r.t. 
to lengthscale - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j <= i; j++) - { - // Compute the distance (z_i - z_j) of K entries to reuse - hpx::shared_future> cov_dists = hpx::async( - hpx::annotated_function(gen_tile_distance, "assemble_cov_dist"), - i, - j, - n_tile_size, - n_regressors, - sek_params, - training_input); - - K_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_covariance_with_distance), "assemble_K"), - i, - j, - n_tile_size, - sek_params, - cov_dists); - - if (trainable_params[0]) - { - grad_l_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_l), "assemble_gradl"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_l_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradl_t"), - n_tile_size, - n_tile_size, - grad_l_tiles[i * static_cast(n_tiles) + j]); - } - } - - if (trainable_params[1]) - { - grad_v_tiles[i * static_cast(n_tiles) + j] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_grad_v), "assemble_gradv"), - n_tile_size, - sek_params, - cov_dists); - if (i != j) - { - grad_v_tiles[j * static_cast(n_tiles) + i] = hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&gen_tile_transpose), "assemble_gradv_t"), - n_tile_size, - n_tile_size, - grad_v_tiles[i * static_cast(n_tiles) + j]); - } - } - } - } - - // Assembly with reallocation -> optimize to only set existing values - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - alpha_tiles[i] = hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble_tiled"), n_tile_size); - } - - for (std::size_t i = 0; i < static_cast(n_tiles); i++) - { - for (std::size_t j = 0; j < static_cast(n_tiles); j++) - { - if (i == j) - { - K_inv_tiles[i * static_cast(n_tiles) + j] = - hpx::async(hpx::annotated_function(gen_tile_identity, "assemble_identity_matrix"), n_tile_size); - } - else - { - K_inv_tiles[i * static_cast(n_tiles) + j] = hpx::async( - hpx::annotated_function(gen_tile_zeros, "assemble_identity_matrix"), n_tile_size * n_tile_size); - } - } - } - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous Cholesky decomposition: K = L * L^T - right_looking_cholesky_tiled(K_tiles, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute K^-1 through L* (L^T * X) = I - forward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - backward_solve_tiled_matrix( - K_tiles, - K_inv_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous compute beta = inv(K) * y - matrix_vector_tiled( - K_inv_tiles, - y_tiles, - alpha_tiles, - n_tile_size, - n_tile_size, - static_cast(n_tiles), - static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous loss computation where - // loss(theta) = 0.5 * ( log(det(K)) - y^T * K^-1 * y - N * log(2 * pi) ) - compute_loss_tiled(K_tiles, alpha_tiles, y_tiles, loss_value, n_tile_size, static_cast(n_tiles)); - - /////////////////////////////////////////////////////////////////////////// - // Launch asynchronous update of 
the hyperparameters - if (trainable_params[0]) - { // lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_l_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - static_cast(iter), - 0); - } - if (trainable_params[1]) - { // vertical_lengthscale - update_hyperparameter_tiled( - K_inv_tiles, - grad_v_tiles, - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - static_cast(iter), - 1); - } - if (trainable_params[2]) - { // noise_variance - update_hyperparameter_tiled( - K_inv_tiles, - Tiled_matrix{}, // no tiled gradient matrix required - alpha_tiles, - adam_params, - sek_params, - n_tile_size, - static_cast(n_tiles), - static_cast(iter), - 2); - } - return loss_value.get(); -} } // end of namespace cpu + +GPRAT_NS_END diff --git a/core/src/cpu/gp_optimizer.cpp b/core/src/cpu/gp_optimizer.cpp index f9c5d500..7c1c76f7 100644 --- a/core/src/cpu/gp_optimizer.cpp +++ b/core/src/cpu/gp_optimizer.cpp @@ -1,8 +1,13 @@ -#include "cpu/gp_optimizer.hpp" +#include "gprat/cpu/gp_optimizer.hpp" -#include "cpu/adapter_cblas_fp64.hpp" +#include "gprat/cpu/adapter_cblas_fp64.hpp" + +#include +#include #include +GPRAT_NS_BEGIN + namespace cpu { @@ -39,110 +44,103 @@ double compute_sigmoid(double parameter) { return 1.0 / (1.0 + exp(-parameter)); double compute_covariance_distance(std::size_t i_global, std::size_t j_global, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, const std::vector &i_input, const std::vector &j_input) { // -0.5*lengthscale^2*(z_i-z_j)^2 double distance = 0.0; - double z_ik_minus_z_jk; - for (std::size_t k = 0; k < n_regressors; k++) { - z_ik_minus_z_jk = i_input[i_global + k] - j_input[j_global + k]; + const double z_ik_minus_z_jk = i_input[i_global + k] - j_input[j_global + k]; distance += z_ik_minus_z_jk * z_ik_minus_z_jk; } return -0.5 / (sek_params.lengthscale * sek_params.lengthscale) * distance; } -std::vector gen_tile_distance( +mutable_tile_data gen_tile_distance( std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, const std::vector &input) { - std::size_t i_global, j_global; // Preallocate memory - std::vector tile; - tile.reserve(N * N); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; // compute covariance function - tile.push_back(compute_covariance_distance(i_global, j_global, n_regressors, sek_params, input, input)); + tile.data()[i * N + j] = + compute_covariance_distance(i_global, j_global, n_regressors, sek_params, input, input); } } return tile; } -std::vector gen_tile_covariance_with_distance( +mutable_tile_data gen_tile_covariance_with_distance( std::size_t row, std::size_t col, std::size_t N, - const gprat_hyper::SEKParams &sek_params, - const std::vector &distance) + const SEKParams &sek_params, + const const_tile_data &distance) { - std::size_t i_global, j_global; - double covariance; // Preallocate required memory - std::vector tile; - tile.reserve(N * N); + mutable_tile_data tile(N * N); for (std::size_t i = 0; i < N; i++) { - i_global = N * row + i; + const std::size_t i_global = N * row + i; for (std::size_t j = 0; j < N; j++) { - j_global = N * col + j; + const std::size_t j_global = N * col + j; // compute covariance function - 
covariance = sek_params.vertical_lengthscale * exp(distance[i * N + j]); + double covariance = sek_params.vertical_lengthscale * exp(distance.data()[i * N + j]); if (i_global == j_global) { // noise variance on diagonal covariance += sek_params.noise_variance; } - tile.push_back(covariance); + tile.data()[i * N + j] = covariance; } } return tile; } -std::vector -gen_tile_grad_v(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance) +mutable_tile_data +gen_tile_grad_v(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance) { // Preallocate required memory - std::vector tile; - tile.reserve(N * N); + mutable_tile_data tile(N * N); double hyperparam_der = compute_sigmoid(to_unconstrained(sek_params.vertical_lengthscale, false)); for (std::size_t i = 0; i < N; i++) { for (std::size_t j = 0; j < N; j++) { // compute derivative - tile.push_back(exp(distance[i * N + j]) * hyperparam_der); + tile.data()[i * N + j] = exp(distance.data()[i * N + j]) * hyperparam_der; } } return tile; } -std::vector -gen_tile_grad_l(std::size_t N, const gprat_hyper::SEKParams &sek_params, const std::vector &distance) +mutable_tile_data +gen_tile_grad_l(std::size_t N, const SEKParams &sek_params, const const_tile_data &distance) { // Preallocate required memory - std::vector tile; - tile.reserve(N * N); - double hyperparam_der = compute_sigmoid(to_unconstrained(sek_params.lengthscale, false)); - double factor = -2.0 * sek_params.vertical_lengthscale / sek_params.lengthscale; + mutable_tile_data tile(N * N); + const double hyperparam_der = compute_sigmoid(to_unconstrained(sek_params.lengthscale, false)); + const double factor = -2.0 * sek_params.vertical_lengthscale / sek_params.lengthscale; for (std::size_t i = 0; i < N; i++) { for (std::size_t j = 0; j < N; j++) { // compute derivative - tile.push_back(factor * distance[i * N + j] * exp(distance[i * N + j]) * hyperparam_der); + tile.data()[i * N + j] = + factor * distance.data()[i * N + j] * exp(distance.data()[i * N + j]) * hyperparam_der; } } return tile; @@ -160,11 +158,8 @@ double update_second_moment(double gradient, double v_T, double beta_2) return beta_2 * v_T + (1.0 - beta_2) * gradient * gradient; } -double adam_step(const double unconstrained_hyperparam, - const gprat_hyper::AdamParams &adam_params, - double m_T, - double v_T, - std::size_t iter) +double adam_step( + const double unconstrained_hyperparam, const AdamParams &adam_params, double m_T, double v_T, std::size_t iter) { // Compute decay rate double beta1_T = pow(adam_params.beta1, static_cast(iter + 1)); @@ -183,9 +178,9 @@ double adam_step(const double unconstrained_hyperparam, ///////////////////////////////////////////////////////////////////////// // Loss -double compute_loss(const std::vector &K_diag_tile, - const std::vector &alpha_tile, - const std::vector &y_tile, +double compute_loss(std::span K_diag_tile, + std::span alpha_tile, + std::span y_tile, std::size_t N) { // l = y^T * alpha + \sum_i^N log(L_ii^2) @@ -201,7 +196,7 @@ double compute_loss(const std::vector &K_diag_tile, return l; } -double add_losses(const std::vector &losses, std::size_t N, std::size_t n_tiles) +double add_losses(std::span losses, std::size_t N, std::size_t n_tiles) { // 0.5 * \sum losses + const double l = 0.0; @@ -212,7 +207,7 @@ double add_losses(const std::vector &losses, std::size_t N, std::size_t l += losses[i]; } - l += Nn * log(2.0 * M_PI); + l += Nn * log(2.0 * std::numbers::pi); return 0.5 * l / Nn; // why /Nn? 
 }
@@ -223,17 +218,17 @@ double compute_gradient(double trace, double dot, std::size_t N, std::size_t n_tiles)
     return 0.5 / static_cast<double>(N * n_tiles) * (trace - dot);
 }

-double compute_trace(const std::vector<double> &diagonal, double trace)
+double compute_trace(std::span<const double> diagonal, double trace)
 {
     return trace + std::reduce(diagonal.begin(), diagonal.end());
 }

-double compute_dot(const std::vector<double> &vector_T, const std::vector<double> &vector, double result)
+double compute_dot(std::span<const double> vector_T, std::span<const double> vector, double result)
 {
     return result + dot(vector_T, vector, static_cast<int>(vector.size()));
 }

-double compute_trace_diag(const std::vector<double> &tile, double trace, std::size_t N)
+double compute_trace_diag(std::span<const double> tile, double trace, std::size_t N)
 {
     double local_trace = 0.0;
     for (std::size_t i = 0; i < N; ++i)
@@ -244,3 +239,5 @@ double compute_trace_diag(std::span<const double> tile, double trace, std::size_t N)
 }

 } // end of namespace cpu
+
+GPRAT_NS_END
diff --git a/core/src/cpu/gp_uncertainty.cpp b/core/src/cpu/gp_uncertainty.cpp
index 3ea6a7a9..5f03366f 100644
--- a/core/src/cpu/gp_uncertainty.cpp
+++ b/core/src/cpu/gp_uncertainty.cpp
@@ -1,21 +1,22 @@
-#include "cpu/gp_uncertainty.hpp"
+#include "gprat/cpu/gp_uncertainty.hpp"
+
+#include "gprat/tile_data.hpp"
+
+GPRAT_NS_BEGIN

 namespace cpu
 {

-hpx::shared_future<std::vector<double>> get_matrix_diagonal(hpx::shared_future<std::vector<double>> f_A, std::size_t M)
+mutable_tile_data get_matrix_diagonal(const const_tile_data &A, std::size_t M)
 {
-    auto A = f_A.get();
-    // Preallocate memory
-    std::vector<double> tile;
-    tile.reserve(M);
-    // Add elements
+    mutable_tile_data tile(M);
     for (std::size_t i = 0; i < M; ++i)
     {
-        tile.push_back(A[i * M + i]);
+        tile.data()[i] = A.data()[i * M + i];
     }
-
-    return hpx::make_ready_future(std::move(tile));
+    return tile;
 }

 } // end of namespace cpu
+
+GPRAT_NS_END
diff --git a/core/src/cpu/tiled_algorithms.cpp b/core/src/cpu/tiled_algorithms.cpp
index 5c5b2573..d035b89d 100644
--- a/core/src/cpu/tiled_algorithms.cpp
+++ b/core/src/cpu/tiled_algorithms.cpp
@@ -1,429 +1,31 @@
-#include "cpu/tiled_algorithms.hpp"
+#include "gprat/cpu/tiled_algorithms.hpp"

-#include "cpu/adapter_cblas_fp64.hpp"
-#include "cpu/gp_algorithms.hpp"
-#include "cpu/gp_optimizer.hpp"
-#include "cpu/gp_uncertainty.hpp"
-#include
+#include "gprat/cpu/adapter_cblas_fp64.hpp"
+#include "gprat/cpu/gp_algorithms.hpp"
+#include "gprat/cpu/gp_optimizer.hpp"

-namespace cpu
-{
-
-// Tiled Cholesky Algorithm
-
-void right_looking_cholesky_tiled(Tiled_matrix &ft_tiles, int N, std::size_t n_tiles)
-{
-    for (std::size_t k = 0; k < n_tiles; k++)
-    {
-        // POTRF: Compute Cholesky factor L
-        ft_tiles[k * n_tiles + k] =
-            hpx::dataflow(hpx::annotated_function(potrf, "cholesky_tiled"), ft_tiles[k * n_tiles + k], N);
-        for (std::size_t m = k + 1; m < n_tiles; m++)
-        {
-            // TRSM: Solve X * L^T = A
-            ft_tiles[m * n_tiles + k] = hpx::dataflow(
-                hpx::annotated_function(trsm, "cholesky_tiled"),
-                ft_tiles[k * n_tiles + k],
-                ft_tiles[m * n_tiles + k],
-                N,
-                N,
-                Blas_trans,
-                Blas_right);
-        }
-        for (std::size_t m = k + 1; m < n_tiles; m++)
-        {
-            // SYRK: A = A - B * B^T
-            ft_tiles[m * n_tiles + m] = hpx::dataflow(
-                hpx::annotated_function(syrk, "cholesky_tiled"),
-                ft_tiles[m * n_tiles + m],
-                ft_tiles[m * n_tiles + k],
-                N);
-            for (std::size_t n = k + 1; n < m; n++)
-            {
-                // GEMM: C = C - A * B^T
-                ft_tiles[m * n_tiles + n] = hpx::dataflow(
-                    hpx::annotated_function(gemm, "cholesky_tiled"),
-                    ft_tiles[m * n_tiles + k],
-                    ft_tiles[n * n_tiles + k],
-                    ft_tiles[m * n_tiles + n],
-                    N,
-                    N,
-                    N,
-                    Blas_no_trans,
-                    Blas_trans);
-            }
-
} - } -} - -// Tiled Triangular Solve Algorithms - -void forward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles) -{ - for (std::size_t k = 0; k < n_tiles; k++) - { - // TRSM: Solve L * x = a - ft_rhs[k] = hpx::dataflow( - hpx::annotated_function(trsv, "triangular_solve_tiled"), - ft_tiles[k * n_tiles + k], - ft_rhs[k], - N, - Blas_no_trans); - for (std::size_t m = k + 1; m < n_tiles; m++) - { - // GEMV: b = b - A * a - ft_rhs[m] = hpx::dataflow( - hpx::annotated_function(gemv, "triangular_solve_tiled"), - ft_tiles[m * n_tiles + k], - ft_rhs[k], - ft_rhs[m], - N, - N, - Blas_substract, - Blas_no_trans); - } - } -} - -void backward_solve_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_rhs, int N, std::size_t n_tiles) -{ - for (int k_ = static_cast(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison - { - std::size_t k = static_cast(k_); - // TRSM: Solve L^T * x = a - ft_rhs[k] = hpx::dataflow( - hpx::annotated_function(trsv, "triangular_solve_tiled"), - ft_tiles[k * n_tiles + k], - ft_rhs[k], - N, - Blas_trans); - for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison - { - std::size_t m = static_cast(m_); - // GEMV:b = b - A^T * a - ft_rhs[m] = hpx::dataflow( - hpx::annotated_function(gemv, "triangular_solve_tiled"), - ft_tiles[k * n_tiles + m], - ft_rhs[k], - ft_rhs[m], - N, - N, - Blas_substract, - Blas_trans); - } - } -} - -void forward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t c = 0; c < m_tiles; c++) - { - for (std::size_t k = 0; k < n_tiles; k++) - { - // TRSM: solve L * X = A - ft_rhs[k * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(trsm, "triangular_solve_tiled_matrix"), - ft_tiles[k * n_tiles + k], - ft_rhs[k * m_tiles + c], - N, - M, - Blas_no_trans, - Blas_left); - for (std::size_t m = k + 1; m < n_tiles; m++) - { - // GEMM: C = C - A * B - ft_rhs[m * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(gemm, "triangular_solve_tiled_matrix"), - ft_tiles[m * n_tiles + k], - ft_rhs[k * m_tiles + c], - ft_rhs[m * m_tiles + c], - N, - M, - N, - Blas_no_trans, - Blas_no_trans); - } - } - } -} - -void backward_solve_tiled_matrix( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_rhs, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t c = 0; c < m_tiles; c++) - { - for (int k_ = static_cast(n_tiles) - 1; k_ >= 0; k_--) // int instead of std::size_t for last comparison - { - std::size_t k = static_cast(k_); - // TRSM: solve L^T * X = A - ft_rhs[k * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(trsm, "triangular_solve_tiled_matrix"), - ft_tiles[k * n_tiles + k], - ft_rhs[k * m_tiles + c], - N, - M, - Blas_trans, - Blas_left); - for (int m_ = k_ - 1; m_ >= 0; m_--) // int instead of std::size_t for last comparison - { - std::size_t m = static_cast(m_); - // GEMM: C = C - A^T * B - ft_rhs[m * m_tiles + c] = hpx::dataflow( - hpx::annotated_function(gemm, "triangular_solve_tiled_matrix"), - ft_tiles[k * n_tiles + m], - ft_rhs[k * m_tiles + c], - ft_rhs[m * m_tiles + c], - N, - M, - N, - Blas_trans, - Blas_no_trans); - } - } - } -} - -void matrix_vector_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_vector, - Tiled_vector &ft_rhs, - int N_row, - int N_col, - std::size_t n_tiles, - std::size_t m_tiles) -{ - for (std::size_t k = 0; k < m_tiles; k++) - { - for (std::size_t m = 0; m < n_tiles; m++) - { - ft_rhs[k] = hpx::dataflow( - 
hpx::annotated_function(gemv, "prediction_tiled"), - ft_tiles[k * n_tiles + m], - ft_vector[m], - ft_rhs[k], - N_row, - N_col, - Blas_add, - Blas_no_trans); - } - } -} - -void symmetric_matrix_matrix_diagonal_tiled( - Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t i = 0; i < m_tiles; ++i) - { - for (std::size_t n = 0; n < n_tiles; ++n) - { // Compute inner product to obtain diagonal elements of - // V^T * V <=> cross(K) * K^-1 * cross(K)^T - ft_vector[i] = hpx::dataflow( - hpx::annotated_function(dot_diag_syrk, "posterior_tiled"), - ft_tiles[n * m_tiles + i], - ft_vector[i], - N, - M); - } - } -} - -void symmetric_matrix_matrix_tiled( - Tiled_matrix &ft_tiles, Tiled_matrix &ft_result, int N, int M, std::size_t n_tiles, std::size_t m_tiles) -{ - for (std::size_t c = 0; c < m_tiles; c++) - { - for (std::size_t k = 0; k < m_tiles; k++) - { - for (std::size_t m = 0; m < n_tiles; m++) - { - // (SYRK for (c == k) possible) - // GEMM: C = C - A^T * B - ft_result[c * m_tiles + k] = hpx::dataflow( - hpx::annotated_function(&gemm, "triangular_solve_tiled_matrix"), - ft_tiles[m * m_tiles + c], - ft_tiles[m * m_tiles + k], - ft_result[c * m_tiles + k], - N, - M, - M, - Blas_trans, - Blas_no_trans); - } - } - } -} - -void vector_difference_tiled(Tiled_vector &ft_minuend, Tiled_vector &ft_subtrahend, int M, std::size_t m_tiles) -{ - for (std::size_t i = 0; i < m_tiles; i++) - { - ft_subtrahend[i] = - hpx::dataflow(hpx::annotated_function(&axpy, "uncertainty_tiled"), ft_minuend[i], ft_subtrahend[i], M); - } -} +GPRAT_NS_BEGIN -void matrix_diagonal_tiled(Tiled_matrix &ft_tiles, Tiled_vector &ft_vector, int M, std::size_t m_tiles) +namespace cpu { - for (std::size_t i = 0; i < m_tiles; i++) - { - ft_vector[i] = hpx::dataflow( - hpx::annotated_function(get_matrix_diagonal, "uncertainty_tiled"), ft_tiles[i * m_tiles + i], M); - } -} -void compute_loss_tiled(Tiled_matrix &ft_tiles, - Tiled_vector &ft_alpha, - Tiled_vector &ft_y, - hpx::shared_future &loss, - int N, - std::size_t n_tiles) +namespace impl { - std::vector> loss_tiled; - loss_tiled.reserve(n_tiles); - for (std::size_t k = 0; k < n_tiles; k++) - { - loss_tiled.push_back(hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&compute_loss), "loss_tiled"), - ft_tiles[k * n_tiles + k], - ft_alpha[k], - ft_y[k], - N)); - } - - loss = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&add_losses), "loss_tiled"), loss_tiled, N, n_tiles); -} -void update_hyperparameter_tiled( - const Tiled_matrix &ft_invK, - const Tiled_matrix &ft_gradK_param, - const Tiled_vector &ft_alpha, - const gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, - int N, +void update_parameters( + const AdamParams &adam_params, + SEKParams &sek_params, + std::size_t N, std::size_t n_tiles, std::size_t iter, - std::size_t param_idx) + std::size_t param_idx, + double trace, + double dot, + bool jitter, + double factor) { - /* - * PART 1: - * Compute gradient = 0.5 * ( trace(inv(K) * grad(K)_param) + y^T * inv(K) * grad(K)_param * inv(K) * y ) - * - * 1: Compute trace(inv(K) * grad(K)_param) - * 2: Compute y^T * inv(K) * grad(K)_param * inv(K) * y - * - * Update parameter: - * 3: Update moments - * - m_T = beta1 * m_T-1 + (1 - beta1) * g_T - * - w_T = beta2 + w_T-1 + (1 - beta2) * g_T^2 - * 4: Adam step: - * - nu_T = nu * sqrt(1 - beta2_T) / (1 - beta1_T) - * - theta_T = theta_T-1 - nu_T * m_T / (sqrt(w_T) + epsilon) - */ - hpx::shared_future trace = 
hpx::make_ready_future(0.0); - hpx::shared_future dot = hpx::make_ready_future(0.0); - bool jitter = false; - double factor = 1.0; - if (param_idx == 0 || param_idx == 1) // 0: lengthscale; 1: vertical_lengthscale - { - Tiled_vector diag_tiles; // Diagonal tiles - Tiled_vector inter_alpha; // Intermediate result - // Preallocate memory - inter_alpha.reserve(n_tiles); - diag_tiles.reserve(n_tiles); - // Asynchrnonous initialization - for (std::size_t d = 0; d < n_tiles; d++) - { - diag_tiles.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble"), N)); - inter_alpha.push_back(hpx::async(hpx::annotated_function(gen_tile_zeros, "assemble"), N)); - } - - //////////////////////////////////// - // PART 1: Compute gradient - // Step 1: Compute trace(inv(K)*grad_K_param) - // Compute diagonal tiles of inv(K) * grad(K)_param - for (std::size_t i = 0; i < n_tiles; ++i) - { - for (std::size_t j = 0; j < n_tiles; ++j) - { - diag_tiles[i] = hpx::dataflow( - hpx::annotated_function(dot_diag_gemm, "trace"), - ft_invK[i * n_tiles + j], - ft_gradK_param[j * n_tiles + i], - diag_tiles[i], - N, - N); - } - } - // Compute the trace of the diagonal tiles - for (std::size_t j = 0; j < n_tiles; ++j) - { - trace = - hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_trace), "trace"), diag_tiles[j], trace); - } - // Not sure if can be done this way - // Step 2: Compute alpha^T * grad(K)_param * alpha (with alpha = inv(K) * y) - // Compute inter_alpha = grad(K)_param * alpha - for (std::size_t k = 0; k < n_tiles; k++) - { - for (std::size_t m = 0; m < n_tiles; m++) - { - inter_alpha[k] = hpx::dataflow( - hpx::annotated_function(gemv, "gemv"), - ft_gradK_param[k * n_tiles + m], - ft_alpha[m], - inter_alpha[k], - N, - N, - Blas_add, - Blas_no_trans); - } - } - // Compute alpha^T * inter_alpha - for (std::size_t j = 0; j < n_tiles; ++j) - { - dot = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_dot), "grad_right_tiled"), - inter_alpha[j], - ft_alpha[j], - dot); - } - } - else if (param_idx == 2) // @2: noise_variance - { - jitter = true; - //////////////////////////////////// - // PART 1: Compute gradient - // Step 1: Compute the trace of inv(K) * noise_variance - for (std::size_t j = 0; j < n_tiles; ++j) - { - trace = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_trace_diag), "grad_left_tiled"), - ft_invK[j * n_tiles + j], - trace, - N); - } - //////////////////////////////////// - // Step 2: Compute the alpha^T * alpha * noise_variance - for (std::size_t j = 0; j < n_tiles; ++j) - { - dot = hpx::dataflow(hpx::annotated_function(hpx::unwrapping(&compute_dot), "grad_right_tiled"), - ft_alpha[j], - ft_alpha[j], - dot); - } - - factor = compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); - } - else - { - // Throw an exception for invalid param_idx - throw std::invalid_argument("Invalid param_idx"); - } - // Compute gradient = trace + dot - double gradient = - factor - * hpx::dataflow( - hpx::annotated_function(hpx::unwrapping(&compute_gradient), "update_hyperparam"), trace, dot, N, n_tiles) - .get(); + double gradient = factor * compute_gradient(trace, dot, N, n_tiles); //////////////////////////////////// // PART 2: Update parameter @@ -437,14 +39,14 @@ void update_hyperparameter_tiled( double unconstrained_param = to_unconstrained(sek_params.get_param(param_idx), jitter); // Adam step update with unconstrained parameter // compute beta_t inside - double updated_param = adam_step( - unconstrained_param, - adam_params, - 
sek_params.m_T[param_idx],
-        sek_params.w_T[param_idx],
-        static_cast(iter));
+    double updated_param =
+        adam_step(unconstrained_param, adam_params, sek_params.m_T[param_idx], sek_params.w_T[param_idx], iter);
     // Transform hyperparameter back to constrained form
     sek_params.set_param(param_idx, to_constrained(updated_param, jitter));
 }
+} // namespace impl
+
 } // end of namespace cpu
+
+GPRAT_NS_END
diff --git a/core/src/gprat.cpp b/core/src/gprat.cpp
new file mode 100644
index 00000000..969fdb9e
--- /dev/null
+++ b/core/src/gprat.cpp
@@ -0,0 +1,285 @@
+#include "gprat/gprat.hpp"
+
+#include "gprat/cpu/gp_functions.hpp"
+#include "gprat/utils.hpp"
+
+#if GPRAT_WITH_CUDA
+#include "gprat/gpu/gp_functions.cuh"
+#endif
+
+GPRAT_NS_BEGIN
+
+GP_data::GP_data(const std::string &f_path, std::size_t n, std::size_t n_reg) :
+    file_path(f_path),
+    n_samples(n),
+    n_regressors(n_reg)
+{
+    data = load_data(f_path, n, n_reg - 1);
+}
+
+GP::GP(std::vector<double> input,
+    std::vector<double> output,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t n_regressors,
+    const std::vector<double> &kernel_hyperparams,
+    std::vector<bool> trainable_bool,
+    std::shared_ptr target) :
+    training_input_(std::move(input)),
+    training_output_(std::move(output)),
+    n_tiles_(n_tiles),
+    n_tile_size_(n_tile_size),
+    trainable_params_(std::move(trainable_bool)),
+    target_(std::move(target)),
+    n_reg(n_regressors),
+    kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2])
+{ }
+
+GP::GP(std::vector<double> input,
+    std::vector<double> output,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t n_regressors,
+    const std::vector<double> &kernel_hyperparams,
+    std::vector<bool> trainable_bool) :
+    training_input_(std::move(input)),
+    training_output_(std::move(output)),
+    n_tiles_(n_tiles),
+    n_tile_size_(n_tile_size),
+    trainable_params_(std::move(trainable_bool)),
+    target_(std::make_shared<CPU>()),
+    n_reg(n_regressors),
+    kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2])
+{ }
+
+GP::GP(std::vector<double> input,
+    std::vector<double> output,
+    std::size_t n_tiles,
+    std::size_t n_tile_size,
+    std::size_t n_regressors,
+    const std::vector<double> &kernel_hyperparams,
+    std::vector<bool> trainable_bool,
+    int gpu_id,
+    int n_streams) :
+    training_input_(std::move(input)),
+    training_output_(std::move(output)),
+    n_tiles_(n_tiles),
+    n_tile_size_(n_tile_size),
+    trainable_params_(std::move(trainable_bool)),
+#if GPRAT_WITH_CUDA
+    target_(std::make_shared<CUDA_GPU>(CUDA_GPU(gpu_id, n_streams))),
+#else
+    target_(std::make_shared<CPU>()),
+#endif
+    n_reg(n_regressors),
+    kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2])
+{
+#if !GPRAT_WITH_CUDA
+    throw std::runtime_error(
+        "Cannot create GP object using CUDA for computation. "
+        "CUDA is not available because GPRat has been compiled without CUDA. "
+        "Remove arguments gpu_id (" + std::to_string(gpu_id) + ") and n_streams (" + std::to_string(n_streams)
+        + ") to perform computations on the CPU.");
+#endif
+}
+
+std::string GP::repr() const
+{
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(12);
+    oss << "Kernel_Params: [lengthscale=" << kernel_params.lengthscale << ", vertical_lengthscale="
+        << kernel_params.vertical_lengthscale << ", noise_variance=" << kernel_params.noise_variance
+        << ", n_regressors=" << n_reg << "], Trainable_Params: [trainable_params l=" << trainable_params_[0]
+        << ", trainable_params v=" << trainable_params_[1] << ", trainable_params n=" << trainable_params_[2]
+        << "], Target: [" << target_->repr() << "], n_tiles=" << n_tiles_ << ", n_tile_size=" << n_tile_size_;
+    return oss.str();
+}
+
+std::vector<double> GP::get_training_input() const { return training_input_; }
+
+std::vector<double> GP::get_training_output() const { return training_output_; }
+
+std::vector<double> GP::predict(const std::vector<double> &test_input, std::size_t m_tiles, std::size_t m_tile_size)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        return gpu::predict(
+            training_input_,
+            training_output_,
+            test_input,
+            kernel_params,
+            n_tiles_,
+            n_tile_size_,
+            m_tiles,
+            m_tile_size,
+            n_reg,
+            *std::dynamic_pointer_cast<CUDA_GPU>(target_));
+    }
+#endif
+
+    tiled_scheduler_local scheduler;
+    return cpu::predict(
+        scheduler,
+        training_input_,
+        training_output_,
+        test_input,
+        kernel_params,
+        n_tiles_,
+        n_tile_size_,
+        m_tiles,
+        m_tile_size,
+        n_reg);
+}
+
+std::vector<std::vector<double>>
+GP::predict_with_uncertainty(const std::vector<double> &test_input, std::size_t m_tiles, std::size_t m_tile_size)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        return gpu::predict_with_uncertainty(
+            training_input_,
+            training_output_,
+            test_input,
+            kernel_params,
+            n_tiles_,
+            n_tile_size_,
+            m_tiles,
+            m_tile_size,
+            n_reg,
+            *std::dynamic_pointer_cast<CUDA_GPU>(target_));
+    }
+#endif
+    tiled_scheduler_local scheduler;
+    return cpu::predict_with_uncertainty(
+        scheduler,
+        training_input_,
+        training_output_,
+        test_input,
+        kernel_params,
+        n_tiles_,
+        n_tile_size_,
+        m_tiles,
+        m_tile_size,
+        n_reg);
+}
+
+std::vector<std::vector<double>>
+GP::predict_with_full_cov(const std::vector<double> &test_input, std::size_t m_tiles, std::size_t m_tile_size)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        return gpu::predict_with_full_cov(
+            training_input_,
+            training_output_,
+            test_input,
+            kernel_params,
+            n_tiles_,
+            n_tile_size_,
+            m_tiles,
+            m_tile_size,
+            n_reg,
+            *std::dynamic_pointer_cast<CUDA_GPU>(target_));
+    }
+#endif
+    tiled_scheduler_local scheduler;
+    return cpu::predict_with_full_cov(
+        scheduler,
+        training_input_,
+        training_output_,
+        test_input,
+        kernel_params,
+        n_tiles_,
+        n_tile_size_,
+        m_tiles,
+        m_tile_size,
+        n_reg);
+}
+
+std::vector<double> GP::optimize(const AdamParams &adam_params)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        std::cerr << "GP::optimize has not been implemented for the GPU.\n"
+                  << "Instead, this operation executes the CPU implementation." << std::endl;
+    }
+#endif
+    tiled_scheduler_local scheduler;
+    return cpu::optimize(
+        scheduler,
+        training_input_,
+        training_output_,
+        n_tiles_,
+        n_tile_size_,
+        n_reg,
+        adam_params,
+        kernel_params,
+        trainable_params_);
+}
+
+double GP::optimize_step(AdamParams &adam_params, std::size_t iter)
+{
+#if GPRAT_WITH_CUDA
+    if (target_->is_gpu())
+    {
+        std::cerr << "GP::optimize_step has not been implemented for the GPU.\n"
+                  << "Instead, this operation executes the CPU implementation."
<< std::endl; + } +#endif + tiled_scheduler_local scheduler; + return cpu::optimize_step( + scheduler, + training_input_, + training_output_, + n_tiles_, + n_tile_size_, + n_reg, + adam_params, + kernel_params, + trainable_params_, + iter); +} + +double GP::calculate_loss() +{ +#if GPRAT_WITH_CUDA + if (target_->is_gpu()) + { + return gpu::compute_loss( + training_input_, + training_output_, + kernel_params, + n_tiles_, + n_tile_size_, + n_reg, + *std::dynamic_pointer_cast(target_)); + } +#endif + tiled_scheduler_local scheduler; + return cpu::calculate_loss( + scheduler, training_input_, training_output_, kernel_params, n_tiles_, n_tile_size_, n_reg); +} + +std::vector> GP::cholesky() +{ +#if GPRAT_WITH_CUDA + if (target_->is_gpu()) + { + return gpu::cholesky( + training_input_, + kernel_params, + n_tiles_, + n_tile_size_, + n_reg, + *std::dynamic_pointer_cast(target_)); + } +#endif + tiled_scheduler_local sched; + return cpu::cholesky(sched, training_input_, kernel_params, n_tiles_, n_tile_size_, n_reg); +} + +GPRAT_NS_END diff --git a/core/src/gprat_c.cpp b/core/src/gprat_c.cpp deleted file mode 100644 index c93e792c..00000000 --- a/core/src/gprat_c.cpp +++ /dev/null @@ -1,363 +0,0 @@ -#include "gprat_c.hpp" - -#include "cpu/gp_functions.hpp" -#include "utils_c.hpp" -#include - -#if GPRAT_WITH_CUDA -#include "gpu/gp_functions.cuh" -#endif - -// namespace for GPRat library entities -namespace gprat -{ - -GP_data::GP_data(const std::string &f_path, int n, int n_reg) : - file_path(f_path), - n_samples(n), - n_regressors(n_reg) -{ - data = utils::load_data(f_path, n, n_reg - 1); -} - -GP::GP(std::vector input, - std::vector output, - int n_tiles, - int n_tile_size, - int n_regressors, - std::vector kernel_hyperparams, - std::vector trainable_bool, - std::shared_ptr target) : - training_input_(input), - training_output_(output), - n_tiles_(n_tiles), - n_tile_size_(n_tile_size), - trainable_params_(trainable_bool), - target_(target), - n_reg(n_regressors), - kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2]) -{ } - -GP::GP(std::vector input, - std::vector output, - int n_tiles, - int n_tile_size, - int n_regressors, - std::vector kernel_hyperparams, - std::vector trainable_bool) : - training_input_(input), - training_output_(output), - n_tiles_(n_tiles), - n_tile_size_(n_tile_size), - trainable_params_(trainable_bool), - target_(std::make_shared()), - n_reg(n_regressors), - kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2]) -{ } - -GP::GP(std::vector input, - std::vector output, - int n_tiles, - int n_tile_size, - int n_regressors, - std::vector kernel_hyperparams, - std::vector trainable_bool, - int gpu_id, - int n_streams) : - training_input_(input), - training_output_(output), - n_tiles_(n_tiles), - n_tile_size_(n_tile_size), - trainable_params_(trainable_bool), -#if GPRAT_WITH_CUDA - target_(std::make_shared(CUDA_GPU(gpu_id, n_streams))), -#else - target_(std::make_shared()), -#endif - n_reg(n_regressors), - kernel_params(kernel_hyperparams[0], kernel_hyperparams[1], kernel_hyperparams[2]) -{ -#if !GPRAT_WITH_CUDA - throw std::runtime_error( - "Cannot create GP object using CUDA for computation. " - "CUDA is not available because GPRat has been compiled without CUDA. 
" - "Remove arguments gpu_id (" - + std::to_string(gpu_id) + ") and n_streams (" + std::to_string(n_streams) - + ") to perform computations on the CPU."); -#endif -} - -std::string GP::repr() const -{ - std::ostringstream oss; - oss << std::fixed << std::setprecision(12); - oss << "Kernel_Params: [lengthscale=" << kernel_params.lengthscale << ", vertical_lengthscale=" - << kernel_params.vertical_lengthscale << ", noise_variance=" << kernel_params.noise_variance - << ", n_regressors=" << n_reg << "], Trainable_Params: [trainable_params l=" << trainable_params_[0] - << ", trainable_params v=" << trainable_params_[1] << ", trainable_params n=" << trainable_params_[2] - << "], Target: [" << target_->repr() << "], n_tiles=" << n_tiles_ << ", n_tile_size=" << n_tile_size_; - return oss.str(); -} - -std::vector GP::get_training_input() const { return training_input_; } - -std::vector GP::get_training_output() const { return training_output_; } - -std::vector GP::predict(const std::vector &test_input, int m_tiles, int m_tile_size) -{ - return hpx::async( - [this, &test_input, m_tiles, m_tile_size]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::predict( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::predict( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); - } -#else - return cpu::predict( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); -#endif - }) - .get(); -} - -std::vector> -GP::predict_with_uncertainty(const std::vector &test_input, int m_tiles, int m_tile_size) -{ - return hpx::async( - [this, &test_input, m_tiles, m_tile_size]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::predict_with_uncertainty( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::predict_with_uncertainty( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); - } -#else - return cpu::predict_with_uncertainty( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); -#endif - }) - .get(); -} - -std::vector> -GP::predict_with_full_cov(const std::vector &test_input, int m_tiles, int m_tile_size) -{ - return hpx::async( - [this, &test_input, m_tiles, m_tile_size]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::predict_with_full_cov( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::predict_with_full_cov( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); - } -#else - return cpu::predict_with_full_cov( - training_input_, - training_output_, - test_input, - kernel_params, - n_tiles_, - n_tile_size_, - m_tiles, - m_tile_size, - n_reg); -#endif - }) - .get(); -} - -std::vector GP::optimize(const gprat_hyper::AdamParams &adam_params) -{ - return hpx::async( - [this, &adam_params]() - { -#if GPRAT_WITH_CUDA - if 
(target_->is_gpu()) - { - std::cerr << "GP::optimze_step has not been implemented for the GPU.\n" - << "Instead, this operation executes the CPU implementation." << std::endl; - } -#endif - return cpu::optimize( - training_input_, - training_output_, - n_tiles_, - n_tile_size_, - n_reg, - adam_params, - kernel_params, - trainable_params_); - }) - .get(); -} - -double GP::optimize_step(gprat_hyper::AdamParams &adam_params, int iter) -{ - return hpx::async( - [this, &adam_params, iter]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - std::cerr << "GP::optimze_step has not been implemented for the GPU.\n" - << "Instead, this operation executes the CPU implementation." << std::endl; - } -#endif - return cpu::optimize_step( - training_input_, - training_output_, - n_tiles_, - n_tile_size_, - n_reg, - adam_params, - kernel_params, - trainable_params_, - iter); - }) - .get(); -} - -double GP::calculate_loss() -{ - return hpx::async( - [this]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::compute_loss( - training_input_, - training_output_, - kernel_params, - n_tiles_, - n_tile_size_, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::compute_loss( - training_input_, training_output_, kernel_params, n_tiles_, n_tile_size_, n_reg); - } -#else - return cpu::compute_loss( - training_input_, training_output_, kernel_params, n_tiles_, n_tile_size_, n_reg); -#endif - }) - .get(); -} - -std::vector> GP::cholesky() -{ - return hpx::async( - [this]() - { -#if GPRAT_WITH_CUDA - if (target_->is_gpu()) - { - return gpu::cholesky( - training_input_, - kernel_params, - n_tiles_, - n_tile_size_, - n_reg, - *std::dynamic_pointer_cast(target_)); - } - else - { - return cpu::cholesky(training_input_, kernel_params, n_tiles_, n_tile_size_, n_reg); - } -#else - return cpu::cholesky(training_input_, kernel_params, n_tiles_, n_tile_size_, n_reg); -#endif - }) - .get(); -} - -} // namespace gprat diff --git a/core/src/gpu/adapter_cublas.cu b/core/src/gpu/adapter_cublas.cu index 61227e8d..c3833aac 100644 --- a/core/src/gpu/adapter_cublas.cu +++ b/core/src/gpu/adapter_cublas.cu @@ -1,4 +1,6 @@ -#include "gpu/adapter_cublas.cuh" +#include "gprat/gpu/adapter_cublas.cuh" + +GPRAT_NS_BEGIN // frequently used names using hpx::cuda::experimental::check_cuda_error; @@ -411,3 +413,5 @@ dot(cublasHandle_t cublas, return hpx::make_ready_future(result); } + +GPRAT_NS_END diff --git a/core/src/gpu/cuda_kernels.cu b/core/src/gpu/cuda_kernels.cu index 37378f37..5e77ec6a 100644 --- a/core/src/gpu/cuda_kernels.cu +++ b/core/src/gpu/cuda_kernels.cu @@ -1,6 +1,8 @@ -#include "gpu/cuda_kernels.cuh" +#include "gprat/gpu/cuda_kernels.cuh" -#include "gpu/cuda_utils.cuh" +#include "gprat/gpu/cuda_utils.cuh" + +GPRAT_NS_BEGIN __global__ void transpose(double *transposed, double *original, std::size_t width, std::size_t height) { @@ -25,3 +27,5 @@ __global__ void transpose(double *transposed, double *original, std::size_t widt transposed[index_out] = block[threadIdx.x][threadIdx.y]; } } + +GPRAT_NS_END diff --git a/core/src/gpu/gp_algorithms.cu b/core/src/gpu/gp_algorithms.cu index 39407ed6..b9125e57 100644 --- a/core/src/gpu/gp_algorithms.cu +++ b/core/src/gpu/gp_algorithms.cu @@ -1,14 +1,18 @@ -#include "gpu/gp_algorithms.cuh" +#include "gprat/gpu/gp_algorithms.cuh" + +#include "gprat/gpu/cuda_kernels.cuh" +#include "gprat/gpu/cuda_utils.cuh" +#include "gprat/gpu/gp_optimizer.cuh" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" -#include 
"gp_kernels.hpp" -#include "gpu/cuda_kernels.cuh" -#include "gpu/cuda_utils.cuh" -#include "gpu/gp_optimizer.cuh" -#include "target.hpp" #include #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -20,7 +24,7 @@ __global__ void gen_tile_covariance_kernel( const std::size_t n_regressors, const std::size_t tile_row, const std::size_t tile_column, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { // Compute the global indices of the thread unsigned int i = blockIdx.y * blockDim.y + threadIdx.y; @@ -59,8 +63,8 @@ double *gen_tile_covariance(const double *d_input, const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -85,7 +89,7 @@ __global__ void gen_tile_full_prior_covariance_kernel( const std::size_t n_regressors, const std::size_t tile_row, const std::size_t tile_column, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { unsigned int i = blockIdx.y * blockDim.y + threadIdx.y; unsigned int j = blockIdx.x * blockDim.x + threadIdx.x; @@ -117,8 +121,8 @@ double *gen_tile_full_prior_covariance( const std::size_t tile_colums, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -143,7 +147,7 @@ __global__ void gen_tile_prior_covariance_kernel( const std::size_t n_regressors, const std::size_t tile_row, const std::size_t tile_column, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -174,8 +178,8 @@ double *gen_tile_prior_covariance( const std::size_t tile_column, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -202,7 +206,7 @@ __global__ void gen_tile_cross_covariance_kernel( const std::size_t tile_row, const std::size_t tile_column, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params) + const SEKParams sek_params) { unsigned int i = blockIdx.y * blockDim.y + threadIdx.y; unsigned int j = blockIdx.x * blockDim.x + threadIdx.x; @@ -235,8 +239,8 @@ double *gen_tile_cross_covariance( const std::size_t n_row_tile_size, const std::size_t n_column_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { double *d_tile; @@ -265,7 +269,7 @@ double *gen_tile_cross_covariance( hpx::shared_future gen_tile_cross_cov_T(std::size_t n_row_tile_size, std::size_t n_column_tile_size, const hpx::shared_future f_cross_covariance_tile, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { double *transposed; check_cuda_error(cudaMalloc(&transposed, n_row_tile_size * n_column_tile_size * sizeof(double))); @@ -293,8 +297,7 @@ __global__ void gen_tile_output_kernel(double *tile, const double *output, std:: } } -double * -gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, gprat::CUDA_GPU &gpu) +double *gen_tile_output(const std::size_t row, const std::size_t n_tile_size, const double *d_output, CUDA_GPU &gpu) { dim3 threads_per_block(256); dim3 n_blocks((n_tile_size + 255) / 256); @@ -311,7 +314,7 @@ gen_tile_output(const std::size_t row, const std::size_t n_tile_size, 
const doub return d_tile; } -double *gen_tile_zeros(std::size_t n_tile_size, gprat::CUDA_GPU &gpu) +double *gen_tile_zeros(std::size_t n_tile_size, CUDA_GPU &gpu) { double *d_tile; cudaStream_t stream = gpu.next_stream(); @@ -345,8 +348,8 @@ std::vector> assemble_tiled_covariance_matrix( const std::size_t n_tiles, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> d_tiles(n_tiles * n_tiles); @@ -369,8 +372,8 @@ std::vector> assemble_tiled_covariance_matrix( return d_tiles; } -std::vector> assemble_alpha_tiles( - const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu) +std::vector> +assemble_alpha_tiles(const double *d_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu) { std::vector> alpha_tiles(n_tiles); for (std::size_t i = 0; i < n_tiles; i++) @@ -390,8 +393,8 @@ std::vector> assemble_cross_covariance_tiles( const std::size_t m_tile_size, const std::size_t n_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> cross_covariance_tiles; cross_covariance_tiles.resize(m_tiles * n_tiles); @@ -416,7 +419,7 @@ std::vector> assemble_cross_covariance_tiles( } std::vector> -assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, gprat::CUDA_GPU &gpu) +assemble_tiles_with_zeros(std::size_t n_tile_size, std::size_t n_tiles, CUDA_GPU &gpu) { std::vector> tiles(n_tiles); for (std::size_t i = 0; i < n_tiles; i++) @@ -431,8 +434,8 @@ std::vector> assemble_prior_K_tiles( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> d_prior_K_tiles; d_prior_K_tiles.resize(m_tiles); @@ -449,8 +452,8 @@ std::vector> assemble_prior_K_tiles_full( const std::size_t m_tiles, const std::size_t m_tile_size, const std::size_t n_regressors, - const gprat_hyper::SEKParams sek_params, - gprat::CUDA_GPU &gpu) + const SEKParams sek_params, + CUDA_GPU &gpu) { std::vector> d_prior_K_tiles(m_tiles * m_tiles); for (std::size_t i = 0; i < m_tiles; i++) @@ -483,7 +486,7 @@ std::vector> assemble_t_cross_covariance_tiles( const std::size_t m_tiles, const std::size_t n_tile_size, const std::size_t m_tile_size, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { std::vector> d_t_cross_covariance_tiles(m_tiles * n_tiles); for (std::size_t i = 0; i < m_tiles; i++) @@ -502,7 +505,7 @@ std::vector> assemble_t_cross_covariance_tiles( } std::vector> assemble_y_tiles( - const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, gprat::CUDA_GPU &gpu) + const double *d_training_output, const std::size_t n_tiles, const std::size_t n_tile_size, CUDA_GPU &gpu) { std::vector> d_y_tiles(n_tiles); for (std::size_t i = 0; i < n_tiles; i++) @@ -512,10 +515,8 @@ std::vector> assemble_y_tiles( return d_y_tiles; } -std::vector copy_tiled_vector_to_host_vector(std::vector> &d_tiles, - std::size_t n_tile_size, - std::size_t n_tiles, - gprat::CUDA_GPU &gpu) +std::vector copy_tiled_vector_to_host_vector( + std::vector> &d_tiles, std::size_t n_tile_size, std::size_t n_tiles, CUDA_GPU &gpu) { std::vector h_vector(n_tiles * n_tile_size); std::vector streams(n_tiles); @@ -533,13 +534,13 @@ std::vector 
copy_tiled_vector_to_host_vector(std::vector> move_lower_tiled_matrix_to_host( +std::vector> move_lower_tiled_matrix_to_host( const std::vector> &d_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { - std::vector> h_tiles(n_tiles * n_tiles); + std::vector> h_tiles(n_tiles * n_tiles); std::vector streams(n_tiles * (n_tiles + 1) / 2); for (std::size_t i = 0; i < n_tiles; ++i) @@ -547,7 +548,7 @@ std::vector> move_lower_tiled_matrix_to_host( for (std::size_t j = 0; j <= i; ++j) { streams[i] = gpu.next_stream(); - h_tiles[i * n_tiles + j].resize(n_tile_size * n_tile_size); + h_tiles[i * n_tiles + j] = mutable_tile_data(n_tile_size * n_tile_size); check_cuda_error(cudaMemcpyAsync( h_tiles[i * n_tiles + j].data(), d_tiles[i * n_tiles + j].get(), @@ -574,3 +575,5 @@ void free_lower_tiled_matrix(const std::vector> &d_ } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/gp_functions.cu b/core/src/gpu/gp_functions.cu index 8f5e341f..a4485992 100644 --- a/core/src/gpu/gp_functions.cu +++ b/core/src/gpu/gp_functions.cu @@ -1,14 +1,18 @@ -#include "gpu/gp_functions.cuh" +#include "gprat/gpu/gp_functions.cuh" + +#include "gprat/gpu/cuda_utils.cuh" +#include "gprat/gpu/gp_algorithms.cuh" +#include "gprat/gpu/tiled_algorithms.cuh" +#include "gprat/kernels.hpp" +#include "gprat/target.hpp" +#include "gprat/tile_data.hpp" -#include "gp_kernels.hpp" -#include "gpu/cuda_utils.cuh" -#include "gpu/gp_algorithms.cuh" -#include "gpu/tiled_algorithms.cuh" -#include "target.hpp" #include #include #include +GPRAT_NS_BEGIN + namespace gpu { @@ -16,13 +20,13 @@ std::vector predict(const std::vector &h_training_input, const std::vector &h_training_output, const std::vector &h_test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -65,13 +69,13 @@ std::vector> predict_with_uncertainty( const std::vector &h_training_input, const std::vector &h_training_output, const std::vector &h_test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -150,13 +154,13 @@ std::vector> predict_with_full_cov( const std::vector &h_training_input, const std::vector &h_training_output, const std::vector &h_test_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int m_tiles, int m_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -229,11 +233,11 @@ std::vector> predict_with_full_cov( double compute_loss(const std::vector &h_training_input, const std::vector &h_training_output, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -279,10 +283,10 @@ optimize(const std::vector &training_input, int n_tiles, int n_tile_size, int n_regressors, - const gprat_hyper::AdamParams &adam_params, - const gprat_hyper::SEKParams &sek_params, + const AdamParams &adam_params, + const SEKParams &sek_params, std::vector trainable_params, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return std::vector>(); @@ -293,23 +297,23 @@ double optimize_step(const std::vector 
&training_input, int n_tiles, int n_tile_size, int n_regressors, - gprat_hyper::AdamParams &adam_params, - gprat_hyper::SEKParams &sek_params, + AdamParams &adam_params, + SEKParams &sek_params, std::vector trainable_params, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return 0.0; } -std::vector> +std::vector> cholesky(const std::vector &h_training_input, - const gprat_hyper::SEKParams &sek_params, + const SEKParams &sek_params, int n_tiles, int n_tile_size, int n_regressors, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { gpu.create(); @@ -323,7 +327,7 @@ cholesky(const std::vector &h_training_input, right_looking_cholesky_tiled(d_tiles, n_tile_size, n_tiles, gpu, cusolver); // Copy tiled matrix to host - std::vector> h_tiles = move_lower_tiled_matrix_to_host(d_tiles, n_tile_size, n_tiles, gpu); + auto h_tiles = move_lower_tiled_matrix_to_host(d_tiles, n_tile_size, n_tiles, gpu); cudaFree(d_training_input); destroy(cusolver); @@ -333,3 +337,5 @@ cholesky(const std::vector &h_training_input, } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/gp_optimizer.cu b/core/src/gpu/gp_optimizer.cu index 53cca8bb..ea465261 100644 --- a/core/src/gpu/gp_optimizer.cu +++ b/core/src/gpu/gp_optimizer.cu @@ -1,8 +1,12 @@ -#include "gpu/gp_optimizer.cuh" +#include "gprat/gpu/gp_optimizer.cuh" -#include "gpu/adapter_cublas.cuh" -#include "gpu/cuda_kernels.cuh" -#include "gpu/cuda_utils.cuh" +#include "gprat/gpu/adapter_cublas.cuh" +#include "gprat/gpu/cuda_kernels.cuh" +#include "gprat/gpu/cuda_utils.cuh" + +#include + +GPRAT_NS_BEGIN namespace gpu { @@ -36,7 +40,7 @@ double compute_sigmoid(const double parameter) { return 1.0 / (1.0 + exp(-parame double compute_covariance_distance(std::size_t i_global, std::size_t j_global, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &i_input, const std::vector &j_input) { @@ -58,7 +62,7 @@ std::vector gen_tile_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &input) { std::size_t i_global, j_global; @@ -85,7 +89,7 @@ std::vector gen_tile_covariance_with_distance( std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists) { std::size_t i_global, j_global; @@ -117,7 +121,7 @@ gen_tile_grad_v(std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists) { // Initialize tile @@ -140,7 +144,7 @@ gen_tile_grad_l(std::size_t row, std::size_t col, std::size_t N, std::size_t n_regressors, - gprat_hyper::SEKParams sek_params, + SEKParams sek_params, const std::vector &cov_dists) { // Initialize tile @@ -176,7 +180,7 @@ std::vector gen_tile_grad_v_trans(std::size_t N, const std::vector -gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, gprat::CUDA_GPU &gpu) +gen_tile_grad_l_trans(std::size_t N, const hpx::shared_future f_grad_l_tile, CUDA_GPU &gpu) { double *transposed; check_cuda_error(cudaMalloc(&transposed, N * N * sizeof(double))); @@ -209,7 +213,7 @@ compute_loss(const hpx::shared_future &K_diag_tile, const hpx::shared_future &alpha_tile, const hpx::shared_future &y_tile, std::size_t N, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { auto [cublas, stream] = gpu.next_cublas_handle(); @@ -233,7 +237,7 @@ add_losses(const 
std::vector> &losses, std::size_t n_ { l += losses[i].get(); } - l += n_tile_size * n_tiles * log(2.0 * M_PI); + l += n_tile_size * n_tiles * log(2.0 * std::numbers::pi); return hpx::make_ready_future(0.5 * l / (n_tile_size * n_tiles)); } @@ -276,8 +280,8 @@ double update_second_moment(const double &gradient, double v_T, const double &be hpx::shared_future update_param(const double unconstrained_hyperparam, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, double m_T, double v_T, const std::vector beta1_T, @@ -339,11 +343,8 @@ sum_gradright(const std::vector &inter_alpha, const std::vector return 0.0; } -double sum_noise_gradleft(const std::vector &ft_invK, - double grad, - gprat_hyper::SEKParams sek_params, - std::size_t N, - std::size_t n_tiles) +double sum_noise_gradleft( + const std::vector &ft_invK, double grad, SEKParams sek_params, std::size_t N, std::size_t n_tiles) { double noise_der = compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); for (std::size_t i = 0; i < N; ++i) @@ -353,8 +354,7 @@ double sum_noise_gradleft(const std::vector &ft_invK, return std::move(grad); } -double -sum_noise_gradright(const std::vector &alpha, double grad, gprat_hyper::SEKParams sek_params, std::size_t N) +double sum_noise_gradright(const std::vector &alpha, double grad, SEKParams sek_params, std::size_t N) { // double noise_der = // compute_sigmoid(to_unconstrained(sek_params.noise_variance, true)); @@ -364,3 +364,5 @@ sum_noise_gradright(const std::vector &alpha, double grad, gprat_hyper:: } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/gp_uncertainty.cu b/core/src/gpu/gp_uncertainty.cu index a7919457..6cc7f50b 100644 --- a/core/src/gpu/gp_uncertainty.cu +++ b/core/src/gpu/gp_uncertainty.cu @@ -1,16 +1,19 @@ -#include "gpu/gp_uncertainty.cuh" +#include "gprat/gpu/gp_uncertainty.cuh" + +#include "gprat/gpu/cuda_utils.cuh" +#include "gprat/target.hpp" -#include "gpu/cuda_utils.cuh" -#include "target.hpp" #include +GPRAT_NS_BEGIN + using hpx::cuda::experimental::check_cuda_error; namespace gpu { -hpx::shared_future diag_posterior( - const hpx::shared_future A, const hpx::shared_future B, std::size_t M, gprat::CUDA_GPU &gpu) +hpx::shared_future +diag_posterior(const hpx::shared_future A, const hpx::shared_future B, std::size_t M, CUDA_GPU &gpu) { auto [cublas, stream] = gpu.next_cublas_handle(); @@ -27,7 +30,7 @@ hpx::shared_future diag_posterior( return hpx::make_ready_future(tile); } -hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, gprat::CUDA_GPU &gpu) +hpx::shared_future diag_tile(const hpx::shared_future A, std::size_t M, CUDA_GPU &gpu) { double *diag_tile; check_cuda_error(cudaMalloc(&diag_tile, M * sizeof(double))); @@ -41,3 +44,5 @@ hpx::shared_future diag_tile(const hpx::shared_future A, std } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gpu/tiled_algorithms.cu b/core/src/gpu/tiled_algorithms.cu index 1ffdd866..3c479ffd 100644 --- a/core/src/gpu/tiled_algorithms.cu +++ b/core/src/gpu/tiled_algorithms.cu @@ -1,10 +1,13 @@ -#include "gpu/tiled_algorithms.cuh" +#include "gprat/gpu/tiled_algorithms.cuh" + +#include "gprat/gpu/adapter_cublas.cuh" +#include "gprat/gpu/gp_optimizer.cuh" +#include "gprat/gpu/gp_uncertainty.cuh" -#include "gpu/adapter_cublas.cuh" -#include "gpu/gp_optimizer.cuh" -#include "gpu/gp_uncertainty.cuh" #include +GPRAT_NS_BEGIN + namespace gpu { @@ -13,7 +16,7 @@ namespace gpu void 
right_looking_cholesky_tiled(std::vector> &ft_tiles, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu, + CUDA_GPU &gpu, const cusolverDnHandle_t &cusolver) { for (std::size_t k = 0; k < n_tiles; ++k) @@ -86,7 +89,7 @@ void forward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t k = 0; k < n_tiles; ++k) { @@ -120,7 +123,7 @@ void backward_solve_tiled(std::vector> &ft_tiles, std::vector> &ft_rhs, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { // NOTE: The loops traverse backwards. Its last comparisons require the // usage negative numbers. Therefore they use signed int instead of the @@ -160,7 +163,7 @@ void forward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t c = 0; c < m_tiles; ++c) { @@ -209,7 +212,7 @@ void backward_solve_tiled_matrix( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t c = 0; c < m_tiles; ++c) { @@ -258,7 +261,7 @@ void matrix_vector_tiled(std::vector> &ft_tiles, const std::size_t N_col, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t k = 0; k < m_tiles; ++k) { @@ -288,7 +291,7 @@ void symmetric_matrix_matrix_diagonal_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < m_tiles; ++i) { @@ -315,7 +318,7 @@ void compute_gemm_of_invK_y(std::vector> &ft_invK, std::vector> &ft_alpha, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < n_tiles; ++i) { @@ -344,7 +347,7 @@ hpx::shared_future compute_loss_tiled( std::vector> &ft_y, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { std::vector> loss_tiled(n_tiles); @@ -364,7 +367,7 @@ void symmetric_matrix_matrix_tiled( const std::size_t m_tile_size, const std::size_t n_tiles, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t c = 0; c < m_tiles; ++c) { @@ -397,7 +400,7 @@ void vector_difference_tiled(std::vector> &ft_prior std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < m_tiles; i++) { @@ -409,7 +412,7 @@ void matrix_diagonal_tiled(std::vector> &ft_priorK, std::vector> &ft_vector, const std::size_t m_tile_size, const std::size_t m_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < m_tiles; i++) { @@ -422,7 +425,7 @@ void update_grad_K_tiled_mkl(std::vector> &ft_tiles const std::vector> &ft_v2, const std::size_t n_tile_size, const std::size_t n_tiles, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { for (std::size_t i = 0; i < n_tiles; ++i) { @@ -441,8 +444,8 @@ static double update_hyperparameter( const std::vector> &ft_gradparam, const std::vector> &ft_alpha, double &hyperparameter, // lengthscale or vertical-lengthscale - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -451,7 +454,7 @@ static 
double update_hyperparameter( const std::vector> &beta2_T, int iter, int param_idx, // 0 for lengthscale, 1 for vertical-lengthscale - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return 0; @@ -461,8 +464,8 @@ double update_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -470,7 +473,7 @@ double update_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { return update_hyperparameter( ft_invK, @@ -494,8 +497,8 @@ double update_vertical_lengthscale( const std::vector> &ft_invK, const std::vector> &ft_gradparam, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -503,7 +506,7 @@ double update_vertical_lengthscale( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { return update_hyperparameter( ft_invK, @@ -526,8 +529,8 @@ double update_vertical_lengthscale( double update_noise_variance( const std::vector> &ft_invK, const std::vector> &ft_alpha, - gprat_hyper::SEKParams sek_params, - gprat_hyper::AdamParams adam_params, + SEKParams sek_params, + AdamParams adam_params, const std::size_t n_tile_size, const std::size_t n_tiles, std::vector> &m_T, @@ -535,10 +538,12 @@ double update_noise_variance( const std::vector> &beta1_T, const std::vector> &beta2_T, int iter, - gprat::CUDA_GPU &gpu) + CUDA_GPU &gpu) { throw std::logic_error("Function not implemented for GPU"); // return 0; } } // end of namespace gpu + +GPRAT_NS_END diff --git a/core/src/gp_hyperparameters.cpp b/core/src/hyperparameters.cpp similarity index 85% rename from core/src/gp_hyperparameters.cpp rename to core/src/hyperparameters.cpp index c7c0d9c0..2a4800ce 100644 --- a/core/src/gp_hyperparameters.cpp +++ b/core/src/hyperparameters.cpp @@ -1,11 +1,11 @@ -#include "gp_hyperparameters.hpp" +#include "gprat/hyperparameters.hpp" #include +#include -namespace gprat_hyper -{ +GPRAT_NS_BEGIN -AdamParams::AdamParams(double lr, double b1, double b2, double eps, int opt_i) : +AdamParams::AdamParams(double lr, double b1, double b2, double eps, std::size_t opt_i) : learning_rate(lr), beta1(b1), beta2(b2), @@ -29,4 +29,4 @@ std::string AdamParams::repr() const return oss.str(); } -} // namespace gprat_hyper +GPRAT_NS_END diff --git a/core/src/gp_kernels.cpp b/core/src/kernels.cpp similarity index 73% rename from core/src/gp_kernels.cpp rename to core/src/kernels.cpp index 42952e7e..9fd0218e 100644 --- a/core/src/gp_kernels.cpp +++ b/core/src/kernels.cpp @@ -1,13 +1,13 @@ -#include "gp_kernels.hpp" +#include "gprat/kernels.hpp" #include -namespace gprat_hyper -{ -SEKParams::SEKParams(double lengthscale_, double vertical_lengthscale_, double noise_variance_) : - lengthscale(lengthscale_), - vertical_lengthscale(vertical_lengthscale_), - noise_variance(noise_variance_) +GPRAT_NS_BEGIN + +SEKParams::SEKParams(double in_lengthscale, double in_vertical_lengthscale, double in_noise_variance) : + lengthscale(in_lengthscale), + vertical_lengthscale(in_vertical_lengthscale), + noise_variance(in_noise_variance) { 
m_T.resize(this->size());
     w_T.resize(this->size());
@@ -51,4 +51,5 @@ const double &SEKParams::get_param(std::size_t index) const
     }
     throw std::invalid_argument("Get Invalid param_idx");
 }
-} // namespace gprat_hyper
+
+GPRAT_NS_END
diff --git a/core/src/performance_counters.cpp b/core/src/performance_counters.cpp
new file mode 100644
index 00000000..0434e2bb
--- /dev/null
+++ b/core/src/performance_counters.cpp
@@ -0,0 +1,86 @@
+#include "gprat/performance_counters.hpp"
+
+#include
+#include
+#include
+#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
+#include
+#endif
+
+GPRAT_NS_BEGIN
+
+#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name)                                                \
+    static std::atomic<std::uint64_t> name(0);                                                  \
+    std::uint64_t get_##name(bool reset) { return hpx::util::get_and_reset_value(name, reset); }
+
+GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(tile_data_allocations)
+GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(tile_data_deallocations)
+
+#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR
+
+void track_tile_data_allocation(std::size_t /*size*/) { tile_data_allocations += 1; }
+
+void track_tile_data_deallocation(std::size_t /*size*/) { tile_data_deallocations += 1; }
+
+#ifdef HPX_HAVE_MODULE_PERFORMANCE_COUNTERS
+// These are non-public functions of their respective CUs.
+namespace detail
+{
+void register_fp32_performance_counters();
+void register_fp64_performance_counters();
+} // namespace detail
+
+void register_performance_counters()
+{
+    // XXX: you can do this with templates, but it's quite a bit more complicated
+#define GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR(name, stats_expr)                                    \
+    hpx::performance_counters::install_counter_type(                                           \
+        name,                                                                                   \
+        [](bool reset) { return hpx::util::get_and_reset_value(stats_expr, reset); },           \
+        #stats_expr,                                                                            \
+        "",                                                                                     \
+        hpx::performance_counters::counter_type::monotonically_increasing)
+
+    GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/tile_data/num_allocations", tile_data_allocations);
+    GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR("/gprat/tile_data/num_deallocations", tile_data_deallocations);
+
+#undef GPRAT_MAKE_SIMPLE_COUNTER_ACCESSOR
+
+    detail::register_fp32_performance_counters();
+    detail::register_fp64_performance_counters();
+}
+
+#else
+void register_performance_counters()
+{
+    // no-op for binary compatibility
+}
+#endif
+
+void force_evict_memory(const void *start, std::size_t size)
+{
+    // A cache line size of 64 seems to be a safe estimate.
+ // see: https://lemire.me/blog/2023/12/12/measuring-the-size-of-the-cache-line-empirically/ + constexpr std::size_t cache_line_size = 64; + + const char *p = static_cast(start); + const char *end = p + size; + + _mm_mfence(); + do { + // Intel recommends clflushopt over normal clflush due to higher performance, see: + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf + _mm_clflush(p); + p += cache_line_size; + } while (p < end); + + // Make sure we don't miss a cache line at the end + if ((reinterpret_cast(p) & (cache_line_size - 1)) + != (reinterpret_cast(end - 1) & (cache_line_size - 1))) + { + _mm_clflush(end - 1); + } + _mm_mfence(); +} + +GPRAT_NS_END diff --git a/core/src/target.cpp b/core/src/target.cpp index 1b500702..3cd90504 100644 --- a/core/src/target.cpp +++ b/core/src/target.cpp @@ -1,16 +1,14 @@ -#include "target.hpp" +#include "gprat/target.hpp" #include #if GPRAT_WITH_CUDA -#include "gpu/cuda_utils.cuh" -using hpx::cuda::experimental::check_cuda_error; +#include "gprat/gpu/cuda_utils.cuh" #endif -namespace gprat -{ +GPRAT_NS_BEGIN -CPU::CPU() { } +CPU::CPU() = default; bool CPU::is_cpu() { return true; } @@ -154,4 +152,4 @@ int gpu_count() #endif } -} // namespace gprat +GPRAT_NS_END diff --git a/core/src/tile_data.cpp b/core/src/tile_data.cpp new file mode 100644 index 00000000..24ef9eb3 --- /dev/null +++ b/core/src/tile_data.cpp @@ -0,0 +1,34 @@ +#include "gprat/tile_data.hpp" + +#include "gprat/performance_counters.hpp" + +#include + +GPRAT_NS_BEGIN + +namespace detail +{ + +void *allocate_tile_data(std::size_t num_bytes) +{ + auto &topology = hpx::get_runtime().get_topology(); + const auto bitmap = topology.cpuset_to_nodeset(topology.get_machine_affinity_mask()); + + track_tile_data_allocation(num_bytes); + return topology.allocate_membind(num_bytes, bitmap, hpx::threads::hpx_hwloc_membind_policy::membind_firsttouch, 0); +} + +void deallocate_tile_data(void *p, std::size_t num_bytes) +{ + track_tile_data_deallocation(num_bytes); + + if (hpx::is_running()) + { + auto &topology = hpx::get_runtime().get_topology(); + topology.deallocate(p, num_bytes); + } +} + +} // namespace detail + +GPRAT_NS_END diff --git a/core/src/utils_c.cpp b/core/src/utils.cpp similarity index 76% rename from core/src/utils_c.cpp rename to core/src/utils.cpp index 896b7ad0..47935bfd 100644 --- a/core/src/utils_c.cpp +++ b/core/src/utils.cpp @@ -1,11 +1,10 @@ -#include "utils_c.hpp" +#include "gprat/utils.hpp" #include -namespace utils -{ +GPRAT_NS_BEGIN -int compute_train_tiles(int n_samples, int n_tile_size) +std::size_t compute_train_tiles(std::size_t n_samples, std::size_t n_tile_size) { if (n_tile_size > 0) { @@ -18,7 +17,7 @@ int compute_train_tiles(int n_samples, int n_tile_size) } } -int compute_train_tile_size(int n_samples, int n_tiles) +std::size_t compute_train_tile_size(std::size_t n_samples, std::size_t n_tiles) { if (n_tiles > 0) { @@ -31,10 +30,10 @@ int compute_train_tile_size(int n_samples, int n_tiles) } } -std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size) +std::pair compute_test_tiles(std::size_t n_test, std::size_t n_tiles, std::size_t n_tile_size) { - int m_tiles; - int m_tile_size; + std::size_t m_tiles; + std::size_t m_tile_size; // if n_test is not divisible by (incl. 
smaller than) n_tile_size, use the same number of tiles if ((n_test % n_tile_size) > 0) @@ -51,10 +50,10 @@ std::pair compute_test_tiles(int n_test, int n_tiles, int n_tile_size) return { m_tiles, m_tile_size }; } -std::vector load_data(const std::string &file_path, int n_samples, int offset) +std::vector load_data(const std::string &file_path, std::size_t n_samples, std::size_t offset) { std::vector _data; - _data.resize(static_cast(n_samples + offset), 0.0); + _data.resize(n_samples + offset, 0.0); FILE *input_file = fopen(file_path.c_str(), "r"); if (input_file == NULL) @@ -63,11 +62,14 @@ std::vector load_data(const std::string &file_path, int n_samples, int o } // load data - int scanned_elements = 0; - for (int i = 0; i < n_samples; i++) + std::size_t scanned_elements = 0; + for (std::size_t i = 0; i < n_samples; i++) { - scanned_elements += - fscanf(input_file, "%lf", &_data[static_cast(i + offset)]); // scanned_elements++; + const auto r = fscanf(input_file, "%lf", &_data[(i + offset)]); + if (r > 0) + { + scanned_elements += static_cast(r); + } } fclose(input_file); @@ -141,4 +143,4 @@ bool compiled_with_cuda() #endif } -} // namespace utils +GPRAT_NS_END diff --git a/examples/gprat_cpp/src/execute.cpp b/examples/gprat_cpp/src/execute.cpp index 8c415727..7089155e 100644 --- a/examples/gprat_cpp/src/execute.cpp +++ b/examples/gprat_cpp/src/execute.cpp @@ -1,32 +1,63 @@ -#include "gprat_c.hpp" -#include "utils_c.hpp" +#include "gprat/gprat.hpp" +#include "gprat/utils.hpp" + #include #include #include int main(int argc, char *argv[]) { + namespace po = hpx::program_options; + po::options_description desc("Allowed options"); + // clang-format off + desc.add_options() + ("help", "produce help message") + ("train_x_path", po::value()->default_value("../../../data/data_1024/training_input.txt"), "training data (x)") + ("train_y_path", po::value()->default_value("../../../data/data_1024/training_output.txt"), "training data (y)") + ("test_path", po::value()->default_value("../../../data/data_1024/test_input.txt"), "test data") + ("tiles", po::value()->default_value(16), "tiles per dimension") + ("regressors", po::value()->default_value(8), "num regressors") + ("start-cores", po::value()->default_value(2), "num CPUs to start with") + ("end-cores", po::value()->default_value(4), "num CPUs to end with") + ("start", po::value()->default_value(512), "Starting number of training samples") + ("end", po::value()->default_value(1024), "End number of training samples") + ("step", po::value()->default_value(2), "Increment of training samples") + ("loop", po::value()->default_value(2), "Number of iterations to be performed for each number of training samples") + ("opt_iter", po::value()->default_value(1), "Number of optimization iterations*/") + ; + // clang-format on + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.contains("help")) + { + std::cout << desc << "\n"; + return 1; + } + ///////////////////// /////// configuration - std::size_t START = 512; - std::size_t END = 1024; - std::size_t STEP = 2; - std::size_t LOOP = 2; - const std::size_t OPT_ITER = 1; + std::size_t START = vm["start"].as(); + std::size_t END = vm["end"].as(); + std::size_t STEP = vm["step"].as(); + std::size_t LOOP = vm["loop"].as(); + const std::size_t OPT_ITER = vm["opt_iter"].as(); - int n_test = 1024; - const std::size_t N_CORES = 4; - const std::size_t n_tiles = 16; - const std::size_t n_reg = 8; + const std::size_t n_test = START; + const std::size_t 
N_CORES = vm["end-cores"].as(); + const std::size_t n_tiles = vm["tiles"].as(); + const std::size_t n_reg = vm["regressors"].as(); - std::string train_path = "../../../data/data_1024/training_input.txt"; - std::string out_path = "../../../data/data_1024/training_output.txt"; - std::string test_path = "../../../data/data_1024/test_input.txt"; + std::string train_path = vm["train_x_path"].as(); + std::string out_path = vm["train_y_path"].as(); + std::string test_path = vm["test_path"].as(); bool use_gpu = - utils::compiled_with_cuda() && gprat::gpu_count() > 0 && argc > 1 && std::strcmp(argv[1], "--use_gpu") == 0; + gprat::compiled_with_cuda() && gprat::gpu_count() > 0 && argc > 1 && std::strcmp(argv[1], "--use_gpu") == 0; - for (std::size_t core = 2; core <= N_CORES; core = core * 2) + for (std::size_t core = vm["start-cores"].as(); core <= N_CORES; core = core * 2) { // Create new argc and argv to include the --hpx:threads argument std::vector args(argv, argv + argc); @@ -48,15 +79,15 @@ int main(int argc, char *argv[]) for (std::size_t start = START; start <= END; start = start * STEP) { - int n_train = static_cast(start); + const auto n_train = start; for (std::size_t l = 0; l < LOOP; l++) { // Compute tile sizes and number of predict tiles - int tile_size = utils::compute_train_tile_size(n_train, n_tiles); - auto result = utils::compute_test_tiles(n_test, n_tiles, tile_size); + const auto tile_size = gprat::compute_train_tile_size(n_train, n_tiles); + const auto result = gprat::compute_test_tiles(n_test, n_tiles, tile_size); ///////////////////// ///// hyperparams - gprat_hyper::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; + gprat::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; ///////////////////// ////// data loading @@ -93,34 +124,32 @@ int main(int argc, char *argv[]) init_time = end_init - start_init; // Initialize HPX with the new arguments, don't run hpx_main - utils::start_hpx_runtime(new_argc, new_argv); + gprat::start_hpx_runtime(new_argc, new_argv); // Measure the time taken to execute gp.cholesky(); auto start_cholesky = std::chrono::high_resolution_clock::now(); - std::vector> choleksy_cpu = gp_cpu.cholesky(); + const auto choleksy_cpu = gp_cpu.cholesky(); auto end_cholesky = std::chrono::high_resolution_clock::now(); cholesky_time = end_cholesky - start_cholesky; // Measure the time taken to execute gp.optimize(hpar); auto start_opt = std::chrono::high_resolution_clock::now(); - std::vector losses = gp_cpu.optimize(hpar); + const auto losses = gp_cpu.optimize(hpar); auto end_opt = std::chrono::high_resolution_clock::now(); opt_time = end_opt - start_opt; auto start_pred_uncer = std::chrono::high_resolution_clock::now(); - std::vector> sum_cpu = - gp_cpu.predict_with_uncertainty(test_input.data, result.first, result.second); + const auto sum_cpu = gp_cpu.predict_with_uncertainty(test_input.data, result.first, result.second); auto end_pred_uncer = std::chrono::high_resolution_clock::now(); pred_uncer_time = end_pred_uncer - start_pred_uncer; auto start_pred_full_cov = std::chrono::high_resolution_clock::now(); - std::vector> full_cpu = - gp_cpu.predict_with_full_cov(test_input.data, result.first, result.second); + const auto full_cpu = gp_cpu.predict_with_full_cov(test_input.data, result.first, result.second); auto end_pred_full_cov = std::chrono::high_resolution_clock::now(); pred_full_cov_time = end_pred_full_cov - start_pred_full_cov; auto start_pred = std::chrono::high_resolution_clock::now(); - std::vector pred_cpu = gp_cpu.predict(test_input.data, 
result.first, result.second); + const auto pred_cpu = gp_cpu.predict(test_input.data, result.first, result.second); auto end_pred = std::chrono::high_resolution_clock::now(); pred_time = end_pred - start_pred; } @@ -143,10 +172,10 @@ int main(int argc, char *argv[]) init_time = end_init - start_init; // Initialize HPX with the new arguments, don't run hpx_main - utils::start_hpx_runtime(new_argc, new_argv); + gprat::start_hpx_runtime(new_argc, new_argv); auto start_cholesky = std::chrono::high_resolution_clock::now(); - std::vector<std::vector<double>> choleksy_gpu = gp_gpu.cholesky(); + const auto choleksy_gpu = gp_gpu.cholesky(); auto end_cholesky = std::chrono::high_resolution_clock::now(); cholesky_time = end_cholesky - start_cholesky; @@ -154,31 +183,29 @@ opt_time = std::chrono::seconds(-1); auto start_pred_uncer = std::chrono::high_resolution_clock::now(); - std::vector<std::vector<double>> sum_gpu = - gp_gpu.predict_with_uncertainty(test_input.data, result.first, result.second); + const auto sum_gpu = gp_gpu.predict_with_uncertainty(test_input.data, result.first, result.second); auto end_pred_uncer = std::chrono::high_resolution_clock::now(); pred_uncer_time = end_pred_uncer - start_pred_uncer; auto start_pred_full_cov = std::chrono::high_resolution_clock::now(); - std::vector<std::vector<double>> full_gpu = - gp_gpu.predict_with_full_cov(test_input.data, result.first, result.second); + const auto full_gpu = gp_gpu.predict_with_full_cov(test_input.data, result.first, result.second); auto end_pred_full_cov = std::chrono::high_resolution_clock::now(); pred_full_cov_time = end_pred_full_cov - start_pred_full_cov; auto start_pred = std::chrono::high_resolution_clock::now(); - std::vector<double> pred_gpu = gp_gpu.predict(test_input.data, result.first, result.second); + const auto pred_gpu = gp_gpu.predict(test_input.data, result.first, result.second); auto end_pred = std::chrono::high_resolution_clock::now(); pred_time = end_pred - start_pred; } // Stop the HPX runtime - utils::stop_hpx_runtime(); + gprat::stop_hpx_runtime(); auto end_total = std::chrono::high_resolution_clock::now(); auto total_time = end_total - start_total; // Save parameters and times to a .txt file with a header - std::ofstream outfile("../output.csv", std::ios::app); // Append mode + std::ofstream outfile("output.csv", std::ios::app); // Append mode if (outfile.tellp() == 0) { // If file is empty, write the header diff --git a/external_ports/README.md b/external_ports/README.md new file mode 100644 index 00000000..993ec19c --- /dev/null +++ b/external_ports/README.md @@ -0,0 +1,3 @@ +# What is this? + +This contains custom vcpkg ports and forks of official ones. 
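A note on the option handling added above: with hpx::program_options, whose API mirrors boost::program_options, every vm["..."].as<T>() must name exactly the T given in the corresponding po::value<T>() declaration; a mismatch throws at runtime rather than converting. A minimal, self-contained sketch of the pattern, written against boost::program_options directly and not part of the patch:

#include <boost/program_options.hpp>
#include <cstddef>
#include <iostream>

namespace po = boost::program_options;

int main(int argc, char *argv[])
{
    po::options_description desc("Allowed options");
    // Declared as std::size_t, so it must be read back as std::size_t.
    desc.add_options()
        ("tiles", po::value<std::size_t>()->default_value(16), "tiles per dimension");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    // vm["tiles"].as<int>() would throw boost::bad_any_cast here,
    // because the stored type is std::size_t, not int.
    std::cout << "tiles = " << vm["tiles"].as<std::size_t>() << '\n';
}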
diff --git a/external_ports/intel-mkl/copy-from-dmg.cmake b/external_ports/intel-mkl/copy-from-dmg.cmake new file mode 100644 index 00000000..a5aa67cd --- /dev/null +++ b/external_ports/intel-mkl/copy-from-dmg.cmake @@ -0,0 +1,53 @@ +find_program(HDIUTIL NAMES hdiutil REQUIRED) +set(dmg_path "NOTFOUND" CACHE FILEPATH "Where to find the DMG") +set(output_dir "output_dir" CACHE FILEPATH "Where to put the packages") + +if(NOT EXISTS "${dmg_path}") + message(FATAL_ERROR "'dmg_path' (${dmg_path}) does not exist.") +endif() +if(NOT IS_DIRECTORY "${output_dir}") + message(FATAL_ERROR "'output_dir' (${output_dir}) is not a directory.") +endif() + +execute_process( + COMMAND mktemp -d + RESULT_VARIABLE mktemp_result + OUTPUT_VARIABLE mount_point + OUTPUT_STRIP_TRAILING_WHITESPACE +) +if(NOT mktemp_result STREQUAL "0") + message(FATAL_ERROR "mktemp -d failed: ${mktemp_result}") +elseif(NOT IS_DIRECTORY "${mount_point}") + message(FATAL_ERROR "'mount_point' (${mount_point}) is not a directory.") +endif() + +execute_process( + COMMAND "${HDIUTIL}" attach "${dmg_path}" -mountpoint "${mount_point}" -readonly + RESULT_VARIABLE mount_result +) +if(mount_result STREQUAL "0") + set(dmg_packages_dir "${mount_point}/bootstrapper.app/Contents/Resources/packages") + file(GLOB packages + "${dmg_packages_dir}/intel.oneapi.mac.mkl.devel,*" + "${dmg_packages_dir}/intel.oneapi.mac.mkl.runtime,*" + "${dmg_packages_dir}/intel.oneapi.mac.mkl.product,*" + "${dmg_packages_dir}/intel.oneapi.mac.openmp,*" + ) + # Using execute_process to avoid direct errors + execute_process( + COMMAND cp -R ${packages} "${output_dir}/" + RESULT_VARIABLE copy_result + ) +endif() +execute_process( + COMMAND "${HDIUTIL}" detach "${mount_point}" + RESULT_VARIABLE unmount_result +) + +if(NOT mount_result STREQUAL "0") + message(FATAL_ERROR "Mounting ${dmg_path} failed: ${mount_result}") +elseif(NOT copy_result STREQUAL "0") + message(FATAL_ERROR "Copying packages failed: ${copy_result}") +elseif(NOT unmount_result STREQUAL "0") + message(FATAL_ERROR "Unmounting ${dmg_path} failed: ${unmount_result}") +endif() diff --git a/external_ports/intel-mkl/portfile.cmake b/external_ports/intel-mkl/portfile.cmake new file mode 100644 index 00000000..b07c79f1 --- /dev/null +++ b/external_ports/intel-mkl/portfile.cmake @@ -0,0 +1,256 @@ +# This package installs Intel MKL on Linux, macOS and Windows for x64. 
+# Configuration: +# - lp64 +# - sequential + +set(VCPKG_POLICY_EMPTY_PACKAGE enabled) + +# https://registrationcenter-download.intel.com/akdlm/IRC_NAS/19150/w_onemkl_p_2023.0.0.25930_offline.exe # windows +# https://registrationcenter-download.intel.com/akdlm/IRC_NAS/19116/m_onemkl_p_2023.0.0.25376_offline.dmg # macos +# https://registrationcenter-download.intel.com/akdlm/irc_nas/19138/l_onemkl_p_2023.0.0.25398_offline.sh # linux +set(sha "") +if(NOT VCPKG_TARGET_ARCHITECTURE STREQUAL "x64") + # nop +elseif(VCPKG_TARGET_IS_WINDOWS) + set(filename w_onemkl_p_2023.0.0.25930_offline.exe) + set(magic_number 19150) + set(sha a3eb6b75241a2eccb73ed73035ff111172c55d3fa51f545c7542277a155df84ff72fc826621711153e683f84058e64cb549c030968f9f964531db76ca8a3ed46) + set(package_infix "win") +elseif(VCPKG_TARGET_IS_OSX) + set(filename m_onemkl_p_2023.0.0.25376_offline.dmg) + set(magic_number 19116) + set(sha 7b9b8c004054603e6830fb9b9c049d5a4cfc0990c224cb182ac5262ab9f1863775a67491413040e3349c590e2cca58edcfc704db9f3b9f9faa8b5b09022cd2af) + set(package_infix "mac") + set(package_libdir "lib") + set(compiler_libdir "mac/compiler/lib") +elseif(VCPKG_TARGET_IS_LINUX) + set(filename l_onemkl_p_2023.0.0.25398_offline.sh) + set(magic_number 19138) + set(sha b5f2f464675f0fd969dde2faf2e622b834eb1cc406c4a867148116f6c24ba5c709d98b678840f4a89a1778e12cde0ff70ce2ef59faeef3d3f3aa1d0329c71af1) + set(package_infix "lin") + set(package_libdir "lib/intel64") + set(compiler_libdir "linux/compiler/lib/intel64_lin") +endif() + +if(NOT sha) + message(WARNING "${PORT} is empty for ${TARGET_TRIPLET}.") + return() +endif() + +vcpkg_download_distfile(installer_path + URLS "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/${magic_number}/${filename}" + FILENAME "${filename}" + SHA512 "${sha}" +) + +# Note: intel_thread and lp64 are the defaults. +set(interface "lp64") # or ilp64; ilp == 64 bit int api +#https://www.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/linking-your-application-with-onemkl/linking-in-detail/linking-with-interface-libraries/using-the-ilp64-interface-vs-lp64-interface.html +set(threading "sequential") +if(threading STREQUAL "intel_thread") + set(short_thread "iomp") +else() + string(SUBSTRING "${threading}" "0" "3" short_thread) +endif() +set(main_pc_file "mkl-${VCPKG_LIBRARY_LINKAGE}-${interface}-${short_thread}.pc") + +# First extraction level: packages (from offline installer) +set(extract_0_dir "${CURRENT_BUILDTREES_DIR}/${TARGET_TRIPLET}-extract") +file(REMOVE_RECURSE "${extract_0_dir}") +file(MAKE_DIRECTORY "${extract_0_dir}") + +# Second extraction level: actual files (from packages) +set(extract_1_dir "${CURRENT_PACKAGES_DIR}/intel-extract") +file(REMOVE_RECURSE "${extract_1_dir}") +file(MAKE_DIRECTORY "${extract_1_dir}") + +file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/lib/pkgconfig") + +if(VCPKG_TARGET_IS_WINDOWS) + vcpkg_find_acquire_program(7Z) + message(STATUS "Extracting offline installer") + vcpkg_execute_required_process( + COMMAND "${7Z}" x "${installer_path}" "-o${extract_0_dir}" "-y" "-bso0" "-bsp0" + WORKING_DIRECTORY "${extract_0_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-0" + ) + + set(packages + "intel.oneapi.win.mkl.devel,v=2023.0.0-25930/oneapi-mkl-devel-for-installer_p_2023.0.0.25930.msi" # has the required libs. 
+ "intel.oneapi.win.mkl.runtime,v=2023.0.0-25930/oneapi-mkl-for-installer_p_2023.0.0.25930.msi" # has the required DLLs + #"intel.oneapi.win.compilers-common-runtime,v=2023.0.0-25922" # SVML + "intel.oneapi.win.openmp,v=2023.0.0-25922/oneapi-comp-openmp-for-installer_p_2023.0.0.25922.msi" # OpenMP + #"intel.oneapi.win.tbb.runtime,v=2021.8.0-25874" #TBB + ) + + foreach(pack IN LISTS packages) + set(package_path "${extract_0_dir}/packages/${pack}") + cmake_path(GET pack STEM LAST_ONLY packstem) + cmake_path(NATIVE_PATH package_path package_path_native) + vcpkg_execute_required_process( + COMMAND "${LESSMSI}" x "${package_path_native}" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + file(COPY "${extract_1_dir}/${packstem}/SourceDir/" DESTINATION "${extract_1_dir}") + file(REMOVE_RECURSE "${extract_1_dir}/${packstem}") + endforeach() + + set(mkl_dir "${extract_1_dir}/Intel/Compiler/12.0/mkl/2023.0.0") + file(COPY "${mkl_dir}/include/" DESTINATION "${CURRENT_PACKAGES_DIR}/include") + # see https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-link-line-advisor.html for linking + if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + set(files "mkl_core_dll.lib" "mkl_${threading}_dll.lib" "mkl_intel_${interface}_dll.lib" "mkl_blas95_${interface}.lib" "mkl_lapack95_${interface}.lib") # "mkl_rt.lib" single dynamic lib with dynamic dispatch + file(COPY "${mkl_dir}/redist/intel64/" DESTINATION "${CURRENT_PACKAGES_DIR}/bin") # Could probably be reduced instead of copying all + if(NOT VCPKG_BUILD_TYPE) + file(COPY "${mkl_dir}/redist/intel64/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/bin") + endif() + else() + set(files "mkl_core.lib" "mkl_${threading}.lib" "mkl_intel_${interface}.lib" "mkl_blas95_${interface}.lib" "mkl_lapack95_${interface}.lib") + endif() + foreach(file IN LISTS files) + file(COPY "${mkl_dir}/lib/intel64/${file}" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") # instead of manual-link keep normal structure + if(NOT VCPKG_BUILD_TYPE) + file(COPY "${mkl_dir}/lib/intel64/${file}" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/intel64") + endif() + endforeach() + file(COPY_FILE "${mkl_dir}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}") + + set(compiler_dir "${extract_1_dir}/Intel/Compiler/12.0/compiler/2023.0.0") + if(threading STREQUAL "intel_thread") + file(COPY "${compiler_dir}/windows/redist/intel64_win/compiler/" DESTINATION "${CURRENT_PACKAGES_DIR}/bin") + file(COPY "${compiler_dir}/windows/compiler/lib/intel64_win/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") + file(COPY_FILE "${compiler_dir}/lib/pkgconfig/openmp.pc" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "/windows/compiler/lib/intel64_win/" "/lib/intel64/") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "-I \${includedir}" "-I\"\${includedir}\"") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "openmp" "libiomp5") + if(NOT VCPKG_BUILD_TYPE) + file(COPY "${compiler_dir}/windows/redist/intel64_win/compiler/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/bin") + file(COPY "${compiler_dir}/windows/compiler/lib/intel64_win/" DESTINATION "${CURRENT_PACKAGES_DIR}/debug/lib/intel64") + endif() + endif() +else() + message(STATUS "Warning: This port is still a work on progress. + E.g. it is not correctly filtering the libraries in accordance with + VCPKG_LIBRARY_LINKAGE. 
It is using the default threading (Intel OpenMP) + which is known to segfault when used together with GNU OpenMP. +") + + message(STATUS "Extracting offline installer") + if(VCPKG_TARGET_IS_LINUX) + vcpkg_execute_required_process( + COMMAND "bash" "--verbose" "--noprofile" "${installer_path}" "--extract-only" "--extract-folder" "${extract_0_dir}" + WORKING_DIRECTORY "${extract_0_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-0" + ) + file(RENAME "${extract_0_dir}/l_onemkl_p_2023.0.0.25398_offline/packages" "${extract_0_dir}/packages") + elseif(VCPKG_TARGET_IS_OSX) + find_program(HDIUTIL NAMES hdiutil REQUIRED) + file(MAKE_DIRECTORY "${extract_0_dir}/packages") + message(STATUS "... Don't interrupt.") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-Ddmg_path=${installer_path}" + "-Doutput_dir=${extract_0_dir}/packages" + "-DHDIUTIL=${HDIUTIL}" + -P "${CMAKE_CURRENT_LIST_DIR}/copy-from-dmg.cmake" + WORKING_DIRECTORY "${extract_0_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-0" + ) + message(STATUS "... Done.") + endif() + + file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.runtime,v=2023.0.0-*") + cmake_path(GET package_path STEM LAST_ONLY packstem) + message(STATUS "Extracting ${packstem}") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" + "_installdir/mkl/2023.0.0/lib" + "_installdir/mkl/2023.0.0/licensing" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.devel,v=2023.0.0-*") + cmake_path(GET package_path STEM LAST_ONLY packstem) + message(STATUS "Extracting ${packstem}") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" + "_installdir/mkl/2023.0.0/bin" + "_installdir/mkl/2023.0.0/include" + "_installdir/mkl/2023.0.0/lib" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.openmp,v=2023.0.0-*") + cmake_path(GET package_path STEM LAST_ONLY packstem) + message(STATUS "Extracting ${packstem}") + vcpkg_execute_required_process( + COMMAND "${CMAKE_COMMAND}" "-E" "tar" "-xf" "${package_path}/cupPayload.cup" + "_installdir/compiler/2023.0.0" + WORKING_DIRECTORY "${extract_1_dir}" + LOGNAME "extract-${TARGET_TRIPLET}-${packstem}" + ) + + set(mkl_dir "${extract_1_dir}/_installdir/mkl/2023.0.0") + file(COPY "${mkl_dir}/include/" DESTINATION "${CURRENT_PACKAGES_DIR}/include") + file(COPY "${mkl_dir}/${package_libdir}/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") + if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic") + set(to_remove_suffix .a) + elseif(VCPKG_TARGET_IS_OSX) + set(to_remove_suffix .dylib) + else() + set(to_remove_suffix .so) + endif() + file(GLOB_RECURSE files_to_remove + "${CURRENT_PACKAGES_DIR}/lib/intel64/*${to_remove_suffix}" + "${CURRENT_PACKAGES_DIR}/lib/intel64/*${to_remove_suffix}.?" 
+ ) + file(REMOVE ${files_to_remove}) + file(COPY_FILE "${mkl_dir}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "\${exec_prefix}/${package_libdir}" "\${exec_prefix}/lib/intel64" IGNORE_UNCHANGED) + + set(compiler_dir "${extract_1_dir}/_installdir/compiler/2023.0.0") + if(threading STREQUAL "intel_thread") + file(COPY "${compiler_dir}/${compiler_libdir}/" DESTINATION "${CURRENT_PACKAGES_DIR}/lib/intel64") + file(COPY_FILE "${compiler_dir}/lib/pkgconfig/openmp.pc" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/libiomp5.pc" "/${compiler_libdir}/" "/lib/intel64/" IGNORE_UNCHANGED) + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "openmp" "libiomp5") + endif() +endif() + +file(COPY_FILE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${main_pc_file}" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/mkl.pc") +if(NOT VCPKG_BUILD_TYPE) + file(MAKE_DIRECTORY "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig") + file(GLOB pc_files RELATIVE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig" "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/*.pc") + foreach(file IN LISTS pc_files) + file(COPY_FILE "${CURRENT_PACKAGES_DIR}/lib/pkgconfig/${file}" "${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}") + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}" "/include" "/../include") + if(NOT VCPKG_TARGET_IS_WINDOWS) + vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/debug/lib/pkgconfig/${file}" "/lib/intel64" "/../lib/intel64") + endif() + endforeach() +endif() + +file(COPY "${mkl_dir}/lib/cmake/" DESTINATION "${CURRENT_PACKAGES_DIR}/share/") +vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "MKL_CMAKE_PATH}/../../../" "MKL_CMAKE_PATH}/../../") +vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "redist/\${MKL_ARCH}" "bin") +if(${VCPKG_LIBRARY_LINKAGE} STREQUAL "static") +vcpkg_replace_string("${CURRENT_PACKAGES_DIR}/share/mkl/MKLConfig.cmake" "define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST)" +[[define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST) + set(MKL_LINK "static") +]]) +endif() +#TODO: Hardcode settings from portfile in config.cmake +#TODO: Give lapack/blas information about the correct BLA_VENDOR depending on settings. 
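For context, the lp64/sequential layout selected at the top of this portfile is what consumers ultimately link against via the MKL::MKL target advertised in the usage file below. A minimal C++ consumer sketch, assuming only MKL's standard mkl.h header and the CBLAS entry point cblas_dgemm that MKL ships; it is not part of the port:

#include <mkl.h> // provided by the intel-mkl port via the MKL::MKL target

int main()
{
    // Row-major 2x2 GEMM: C = 1.0 * A * B + 0.0 * C.
    const double A[4] = { 1.0, 2.0, 3.0, 4.0 };
    const double B[4] = { 5.0, 6.0, 7.0, 8.0 };
    double C[4] = { 0.0, 0.0, 0.0, 0.0 };

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2,    // m, n, k
                1.0, A, 2,  // alpha, A, lda
                B, 2,       // B, ldb
                0.0, C, 2); // beta, C, ldc

    // First element of A * B should be 1*5 + 2*7 = 19.
    return C[0] == 19.0 ? 0 : 1;
}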
+ +file(INSTALL "${mkl_dir}/licensing" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") +file(GLOB package_path "${extract_0_dir}/packages/intel.oneapi.${package_infix}.mkl.product,v=2023.0.0-*") +vcpkg_install_copyright(FILE_LIST "${package_path}/licenses/license.htm") + +file(REMOVE_RECURSE + "${extract_0_dir}" + "${extract_1_dir}" + "${CURRENT_PACKAGES_DIR}/lib/intel64/cmake" + "${CURRENT_PACKAGES_DIR}/lib/intel64/pkgconfig" +) + +file(INSTALL "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") diff --git a/external_ports/intel-mkl/usage b/external_ports/intel-mkl/usage new file mode 100644 index 00000000..b8ee798f --- /dev/null +++ b/external_ports/intel-mkl/usage @@ -0,0 +1,4 @@ +intel-mkl provides CMake targets: + + find_package(MKL CONFIG REQUIRED) + target_link_libraries(main PRIVATE MKL::MKL) diff --git a/external_ports/intel-mkl/vcpkg.json b/external_ports/intel-mkl/vcpkg.json new file mode 100644 index 00000000..fc0a76ec --- /dev/null +++ b/external_ports/intel-mkl/vcpkg.json @@ -0,0 +1,16 @@ +{ + "name": "intel-mkl", + "version": "2023.0.0", + "port-version": 5, + "description": "Intel® Math Kernel Library (Intel® MKL) accelerates math processing routines, increases application performance, and reduces development time on Intel® processors.", + "homepage": "https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html", + "license": null, + "supports": "(windows | linux | osx) & x64", + "dependencies": [ + { + "name": "vcpkg-tool-lessmsi", + "host": true, + "platform": "windows" + } + ] +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d5378540..9618627f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -27,7 +27,7 @@ find_package(Boost REQUIRED) add_executable(GPRat_test_output_correctness src/output_correctness.cpp) target_link_libraries(GPRat_test_output_correctness - PRIVATE GPRat::core Catch2::Catch2WithMain) + PRIVATE GPRat::core Catch2::Catch2WithMain Boost::boost) target_compile_features(GPRat_test_output_correctness PRIVATE cxx_std_17) add_test( diff --git a/test/src/output_correctness.cpp b/test/src/output_correctness.cpp index 1fc73536..1e7ca8fc 100644 --- a/test/src/output_correctness.cpp +++ b/test/src/output_correctness.cpp @@ -1,5 +1,6 @@ -#include "gprat_c.hpp" -#include "utils_c.hpp" +#include "gprat/gprat.hpp" +#include "gprat/utils.hpp" + #include <boost/json.hpp> #include <catch2/catch_all.hpp> @@ -40,6 +41,36 @@ void tag_invoke(boost::json::value_from_tag, boost::json::value &jv, const gprat }; } +template <typename T> +std::vector<T> to_vector(const gprat::const_tile_data<T> &data) +{ + return { data.begin(), data.end() }; +} + +template <typename T> +std::vector<std::vector<T>> to_vector(const std::vector<gprat::const_tile_data<T>> &data) +{ + std::vector<std::vector<T>> out; + out.reserve(data.size()); + for (const auto &row : data) + { + out.emplace_back(to_vector(row)); + } + return out; +} + +template +std::vector> to_vector(const std::vector> &data) +{ + std::vector> out; + out.reserve(data.size()); + for (const auto &row : data) + { + out.emplace_back(to_vector(row)); + } + return out; +} + // This helper function deduces the type and assigns the value with the matching key template <typename T> inline void extract(const boost::json::object &obj, T &t, std::string_view key) @@ -73,11 +104,11 @@ gprat_results run_on_data_cpu(const std::string &train_path, const std::string & const std::size_t n_reg = 8; // Compute tile sizes and number of predict tiles - const int tile_size = utils::compute_train_tile_size(n_train, n_tiles); - const auto test_tiles = utils::compute_test_tiles(n_test, n_tiles, tile_size); + const 
auto tile_size = gprat::compute_train_tile_size(n_train, n_tiles); + const auto test_tiles = gprat::compute_test_tiles(n_test, n_tiles, tile_size); // hyperparams - gprat_hyper::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; + gprat::AdamParams hpar = { 0.1, 0.9, 0.999, 1e-8, OPT_ITER }; // data loading gprat::GP_data training_input(train_path, n_train, n_reg); @@ -90,22 +121,17 @@ gprat_results run_on_data_cpu(const std::string &train_path, const std::string & training_input.data, training_output.data, n_tiles, tile_size, n_reg, { 1.0, 1.0, 0.1 }, trainable); // Initialize HPX with no arguments, don't run hpx_main - utils::start_hpx_runtime(0, nullptr); + gprat::start_hpx_runtime(0, nullptr); gprat_results results_cpu; - - results_cpu.choleksy = gp_cpu.cholesky(); - + results_cpu.choleksy = to_vector(gp_cpu.cholesky()); results_cpu.losses = gp_cpu.optimize(hpar); - results_cpu.sum = gp_cpu.predict_with_uncertainty(test_input.data, test_tiles.first, test_tiles.second); - results_cpu.full = gp_cpu.predict_with_full_cov(test_input.data, test_tiles.first, test_tiles.second); - results_cpu.pred = gp_cpu.predict(test_input.data, test_tiles.first, test_tiles.second); // Stop the HPX runtime - utils::stop_hpx_runtime(); + gprat::stop_hpx_runtime(); return results_cpu; } @@ -120,8 +146,8 @@ gprat_results run_on_data_gpu(const std::string &train_path, const std::string & const int gpu_id = 0; const int n_streams = 1; - const int tile_size = utils::compute_train_tile_size(n_train, n_tiles); - const auto test_tiles = utils::compute_test_tiles(n_test, n_tiles, tile_size); + const auto tile_size = gprat::compute_train_tile_size(n_train, n_tiles); + const auto test_tiles = gprat::compute_test_tiles(n_test, n_tiles, tile_size); gprat::GP_data training_input(train_path, n_train, n_reg); gprat::GP_data training_output(out_path, n_train, n_reg); @@ -139,16 +165,16 @@ gprat_results run_on_data_gpu(const std::string &train_path, const std::string & gpu_id, n_streams); - utils::start_hpx_runtime(0, nullptr); + gprat::start_hpx_runtime(0, nullptr); gprat_results results_gpu; - results_gpu.choleksy = gp_gpu.cholesky(); + results_gpu.choleksy = to_vector(gp_gpu.cholesky()); // NOTE: optimize and optimize_step are currently not implemented for GPU results_gpu.sum_no_optimize = gp_gpu.predict_with_uncertainty(test_input.data, test_tiles.first, test_tiles.second); results_gpu.full_no_optimize = gp_gpu.predict_with_full_cov(test_input.data, test_tiles.first, test_tiles.second); results_gpu.pred_no_optimize = gp_gpu.predict(test_input.data, test_tiles.first, test_tiles.second); - utils::stop_hpx_runtime(); + gprat::stop_hpx_runtime(); return results_gpu; } @@ -256,7 +282,7 @@ TEST_CASE("GP CPU results match known-good values", "[integration][cpu]") // NOTE: using higher tolerance than for CPU TEST_CASE("GP GPU results match known-good values (no loss)", "[integration][gpu]") { - if (!utils::compiled_with_cuda()) + if (!gprat::compiled_with_cuda()) { WARN("CUDA not available — skipping GPU test."); return; diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json new file mode 100644 index 00000000..3afcbd70 --- /dev/null +++ b/vcpkg-configuration.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg-configuration.schema.json", + "overlay-ports": [ + "./external_ports" + ] +} diff --git a/vcpkg.json b/vcpkg.json new file mode 100644 index 00000000..0b252332 --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,31 @@ +{ + "$schema": 
"https://raw.githubusercontent.com/microsoft/vcpkg-tool/main/docs/vcpkg.schema.json", + "name": "gprat", + "version-semver": "0.1.0", + "dependencies": [ + { + "name": "boost-json" + }, + { + "name": "intel-mkl" + }, + { + "name": "fmt" + }, + { + "name": "hpx", + "features": [ + "cuda", + "bzip2", + "mpi", + "snappy", + "zlib" + ] + }, + { + "name": "cuda" + } + ], + "default-features": [], + "builtin-baseline": "365f6444ab40ee87c73c947b475b3a267b3cb77c" +}