diff --git a/CMakeLists.txt b/CMakeLists.txt
index b18ad492..a3a6fbb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -294,77 +294,58 @@ if(BUILD_LTO)
 endif()
 
 #
-# SIMD SUPPORT (independent of OpenMP)
+# SIMD SUPPORT (runtime CPU dispatch)
 #
-
-# Option to disable SIMD entirely
-option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON)
-
-# Check architecture
-# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM
+# SLiM's SIMD kernels live in eidos_simd_*.cpp, compiled once per instruction-
+# set tier: scalar, SSE4.2, and AVX2+FMA on x86_64, and NEON on ARM64.  Only
+# the per-tier files are given instruction-set-specific compiler flags (those
+# are applied at the end of this file); every other translation unit, and the
+# dispatcher, is compiled at the plain baseline ABI.  At startup
+# Eidos_SIMD_Init() probes the CPU and points the
+# kernels at the fastest tier the hardware supports, so a single binary is
+# correct on any CPU, from pre-AVX2 hardware up.  Applying -mavx2 globally (the
+# previous approach) let the compiler emit AVX2 throughout the whole binary and
+# caused SIGILL crashes on older CPUs (issue #628).
+
+option(USE_SIMD "Enable SIMD-accelerated kernels with runtime CPU dispatch" ON)
+
+# Detect x86 so the SSE4.2/AVX2 tier files can be given their ISA flags.  NEON
+# needs no flag (it is baseline on ARM64); other architectures use the scalar
+# tier only.
 set(IS_X86_64 FALSE)
-set(IS_ARM64 FALSE)
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386")
     set(IS_X86_64 TRUE)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
-    set(IS_ARM64 TRUE)
 endif()
 
-if(USE_SIMD AND NOT WIN32 AND IS_X86_64)
-    include(CheckCXXCompilerFlag)
-
-    # Check for AVX2 support
-    check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
-    check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)
-    check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA)
-
-    if(COMPILER_SUPPORTS_AVX2)
-        message(STATUS "SIMD: AVX2 support detected")
-        add_compile_definitions(EIDOS_HAS_AVX2=1)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
-        if(COMPILER_SUPPORTS_FMA)
-            message(STATUS "SIMD: FMA support detected")
-            add_compile_definitions(EIDOS_HAS_FMA=1)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
-        endif()
-    elseif(COMPILER_SUPPORTS_SSE42)
-        message(STATUS "SIMD: SSE4.2 support detected (no AVX2)")
-        add_compile_definitions(EIDOS_HAS_SSE42=1)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
-    else()
-        message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback")
-    endif()
-elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64)
-    # ARM64 NEON is always available on ARM64, no compiler flag needed
-    message(STATUS "SIMD: ARM64 NEON support enabled")
-    add_compile_definitions(EIDOS_HAS_NEON=1)
-elseif(USE_SIMD AND NOT WIN32)
-    message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback")
-elseif(USE_SIMD AND WIN32)
-    # Windows SIMD detection - MinGW uses GCC, so we can use the same flags as Linux
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+if(USE_SIMD)
+    if(IS_X86_64 AND NOT MSVC)
         include(CheckCXXCompilerFlag)
-        check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2_WIN)
-        check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA_WIN)
-
-        if(COMPILER_SUPPORTS_AVX2_WIN)
-            message(STATUS "SIMD: AVX2 support detected (Windows/MinGW)")
-            add_compile_definitions(EIDOS_HAS_AVX2=1)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
-            if(COMPILER_SUPPORTS_FMA_WIN)
-                message(STATUS "SIMD: FMA support detected (Windows/MinGW)")
-                add_compile_definitions(EIDOS_HAS_FMA=1)
-                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
-            endif()
+        check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
+        check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)
+
+        if(COMPILER_SUPPORTS_AVX2 AND COMPILER_SUPPORTS_SSE42)
+            message(STATUS "SIMD: runtime dispatch enabled (scalar / SSE4.2 / AVX2+FMA selected per-CPU at startup)")
+            # The avx2/sse42 tier files need -mavx2 -mfma / -msse4.2.  The flags
+            # are applied at the end of this file rather than here, because the
+            # Windows target blocks below overwrite COMPILE_FLAGS on every
+            # source file with "-include config.h"; applying our flags last,
+            # with APPEND_STRING, lets the two coexist (issue #628).
+            set(SIMD_TIER_FLAGS_X86 TRUE)
         else()
-            message(STATUS "SIMD: No AVX2 support on Windows/MinGW, using scalar fallback")
+            message(STATUS "SIMD: compiler lacks -mavx2/-msse4.2; building scalar kernels only")
+            add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1)
         endif()
+    elseif(MSVC)
+        message(STATUS "SIMD: runtime dispatch not implemented for MSVC; building scalar kernels only")
+        add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1)
     else()
-        # MSVC path - not currently used but could be added in the future
-        message(STATUS "SIMD: MSVC SIMD detection not yet implemented, using scalar fallback")
+        # ARM64 (NEON, baseline) or any other architecture (scalar fallback):
+        # the tier files need no instruction-set flags.
+        message(STATUS "SIMD: runtime dispatch enabled (NEON or scalar kernels selected at startup)")
     endif()
 else()
-    message(STATUS "SIMD: Disabled by user")
+    message(STATUS "SIMD: disabled (USE_SIMD=OFF); all math uses scalar kernels")
+    add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1)
 endif()
 
 # GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h
@@ -535,6 +516,20 @@ if(BUILD_SLIMGUI)
 endif(BUILD_SLIMGUI)
 
 
+# Apply the per-tier SIMD instruction-set flags to the avx2/sse42 tier files.
+# This is deliberately done here, after the WIN32 blocks above: those run
+# set_source_files_properties(... COMPILE_FLAGS "-include config.h") over every
+# source file, and COMPILE_FLAGS is a single string that gets overwritten, not
+# appended.  Setting our flags last with APPEND_STRING preserves the Windows
+# "-include config.h" while still confining AVX2/SSE4.2 to these two files.
+if(SIMD_TIER_FLAGS_X86)
+    set_property(SOURCE "${PROJECT_SOURCE_DIR}/eidos/eidos_simd_avx2.cpp"
+        APPEND_STRING PROPERTY COMPILE_FLAGS " -mavx2 -mfma")
+    set_property(SOURCE "${PROJECT_SOURCE_DIR}/eidos/eidos_simd_sse42.cpp"
+        APPEND_STRING PROPERTY COMPILE_FLAGS " -msse4.2")
+endif()
+
+
 # Deal with the PROFILE and PARALLEL flags, which interact and are handled in a complex way.
 #
 # For SLiMgui, profiling is always on for Release builds, always off for Debug builds; PROFILE does not affect it
diff --git a/QtSLiM/QtSLiM.pro b/QtSLiM/QtSLiM.pro
index d095e198..85f248fe 100644
--- a/QtSLiM/QtSLiM.pro
+++ b/QtSLiM/QtSLiM.pro
@@ -83,6 +83,11 @@ QMAKE_CFLAGS += $$(CFLAGS)
 DEFINES += EIDOS_GUI
 DEFINES += SLIMGUI=1
 
+# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so
+# it builds the scalar SIMD kernels only.  The CMake build provides the full
+# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON).  See eidos_simd.h.
+DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH
+
 
 # Uncomment this define to disable the use of OpenGL in SLiMgui completely.  This, plus removing the
 # link dependency on openglwidgets, should allow you to build SLiMgui without linking OpenGL at all.
diff --git a/core/core.pro b/core/core.pro
index d4394a80..6e9daa9a 100644
--- a/core/core.pro
+++ b/core/core.pro
@@ -45,6 +45,11 @@ QMAKE_CXXFLAGS += -Xarch_arm64 -DEIDOS_HAS_NEON=1
 DEFINES += EIDOS_GUI
 DEFINES += SLIMGUI=1
 
+# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so
+# it builds the scalar SIMD kernels only.  The CMake build provides the full
+# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON).  See eidos_simd.h.
+DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH
+
 CONFIG -= qt
 CONFIG += c++11
 CONFIG += c11
diff --git a/eidos/eidos.pro b/eidos/eidos.pro
index a5b3ff88..09e28a1a 100644
--- a/eidos/eidos.pro
+++ b/eidos/eidos.pro
@@ -45,6 +45,11 @@ QMAKE_CXXFLAGS += -Xarch_arm64 -DEIDOS_HAS_NEON=1
 DEFINES += EIDOS_GUI
 DEFINES += SLIMGUI=1
 
+# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so
+# it builds the scalar SIMD kernels only.  The CMake build provides the full
+# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON).  See eidos_simd.h.
+DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH
+
 CONFIG -= qt
 CONFIG += c++11
 CONFIG += c11
@@ -115,6 +120,11 @@ SOURCES += \
     eidos_property_signature.cpp \
     eidos_rng.cpp \
     eidos_script.cpp \
+    eidos_simd.cpp \
+    eidos_simd_avx2.cpp \
+    eidos_simd_neon.cpp \
+    eidos_simd_scalar.cpp \
+    eidos_simd_sse42.cpp \
     eidos_sorting.cpp \
     eidos_symbol_table.cpp \
     eidos_test.cpp \
@@ -149,6 +159,7 @@ HEADERS += \
     eidos_rng.h \
     eidos_script.h \
 	eidos_simd.h \
+	eidos_simd_impl.h \
 	eidos_sorting.h \
     eidos_symbol_table.h \
     eidos_test_builtins.h \
diff --git a/eidos/eidos_globals.cpp b/eidos/eidos_globals.cpp
index 172afca9..a57b9369 100644
--- a/eidos/eidos_globals.cpp
+++ b/eidos/eidos_globals.cpp
@@ -30,6 +30,7 @@
 #include "eidos_class_DataFrame.h"
 #include "eidos_class_Image.h"
 #include "eidos_class_TestElement.h"
+#include "eidos_simd.h"
 
 #include <stdlib.h>
 #include <execinfo.h>
@@ -1163,6 +1164,9 @@ void Eidos_WarmUp(void)
 	{
 		been_here = true;
 		
+		// Detect the CPU and select the SIMD kernel tier; must happen before any Eidos_SIMD kernel is called.
+		Eidos_SIMD_Init();
+		
 		// Initialize the random number generator with a random-ish seed.  This seed may be overridden by the Context downstream.
 		Eidos_InitializeRNG();
 		Eidos_SetRNGSeed(Eidos_GenerateRNGSeed());
diff --git a/eidos/eidos_simd.cpp b/eidos/eidos_simd.cpp
new file mode 100644
index 00000000..c407969c
--- /dev/null
+++ b/eidos/eidos_simd.cpp
@@ -0,0 +1,131 @@
+//
+//  eidos_simd.cpp
+//  Eidos
+//
+//  Created by Andrew Kern on 5/21/2026.
+//  Copyright (c) 2024-2025 Benjamin C. Haller.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ SIMD runtime dispatcher.  This translation unit is compiled at the plain
+ baseline ABI (no instruction-set flags); it owns the public Eidos_SIMD
+ function pointers and selects a tier for them at startup.  See eidos_simd.h.
+
+ */
+
+#include "eidos_simd.h"
+
+#include <cstring>
+
+
+// The public kernel pointers.  They are statically initialized to the scalar
+// tier so that a call is well-defined even if it somehow happens before
+// Eidos_SIMD_Init() runs; the address of a function is a constant expression,
+// so there is no static-initialization-order dependency here.
+namespace Eidos_SIMD {
+#define X(ret, name, params) ret (*name) params = &Eidos_SIMD_scalar::name;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
+
+
+enum class Eidos_SIMD_Tier { kScalar, kSSE42, kAVX2_FMA, kNEON };
+
+static Eidos_SIMD_Tier sActiveTier = Eidos_SIMD_Tier::kScalar;
+
+
+bool Eidos_SIMD_SelectTier(const char *tier_name)
+{
+	// The scalar tier is built on every platform and always available.
+	if (std::strcmp(tier_name, "scalar") == 0)
+	{
+		Eidos_SIMD_Fill_scalar();
+		sActiveTier = Eidos_SIMD_Tier::kScalar;
+		return true;
+	}
+
+#if EIDOS_SIMD_DISPATCH_X86
+	// __builtin_cpu_supports() reads CPUID; it is available on GCC and Clang
+	// for x86 and works regardless of the flags this file was compiled with.
+	// AVX2 and FMA shipped together (Haswell), but we require both explicitly
+	// since the AVX2 tier and SLEEF both use FMA instructions.
+	if (std::strcmp(tier_name, "AVX2+FMA") == 0)
+	{
+		if (!(__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma")))
+			return false;
+		Eidos_SIMD_Fill_avx2();
+		sActiveTier = Eidos_SIMD_Tier::kAVX2_FMA;
+		return true;
+	}
+	if (std::strcmp(tier_name, "SSE4.2") == 0)
+	{
+		if (!__builtin_cpu_supports("sse4.2"))
+			return false;
+		Eidos_SIMD_Fill_sse42();
+		sActiveTier = Eidos_SIMD_Tier::kSSE42;
+		return true;
+	}
+#endif
+
+#if EIDOS_SIMD_DISPATCH_ARM
+	// NEON is baseline on every ARM64 CPU, so it is always available here.
+	if (std::strcmp(tier_name, "NEON") == 0)
+	{
+		Eidos_SIMD_Fill_neon();
+		sActiveTier = Eidos_SIMD_Tier::kNEON;
+		return true;
+	}
+#endif
+
+	return false;
+}
+
+void Eidos_SIMD_Init(void)
+{
+	// Install the fastest tier the CPU supports. This is idempotent: calling it
+	// again re-runs detection and re-installs the same tier, which is how the
+	// SIMD self-tests restore normal dispatch after cycling through every tier.
+#if EIDOS_SIMD_DISPATCH_X86
+	if (Eidos_SIMD_SelectTier("AVX2+FMA"))
+		return;
+	if (Eidos_SIMD_SelectTier("SSE4.2"))
+		return;
+#endif
+#if EIDOS_SIMD_DISPATCH_ARM
+	if (Eidos_SIMD_SelectTier("NEON"))
+		return;
+#endif
+	// Fallback for pre-AVX2/pre-SSE4.2 x86, unknown architectures, MSVC, and
+	// USE_SIMD=OFF builds: the scalar tier, which runs on any CPU.
+	Eidos_SIMD_SelectTier("scalar");
+}
+
+const char *Eidos_SIMD_ActiveTierName(void)
+{
+	switch (sActiveTier)
+	{
+		case Eidos_SIMD_Tier::kAVX2_FMA:	return "AVX2+FMA";
+		case Eidos_SIMD_Tier::kSSE42:		return "SSE4.2";
+		case Eidos_SIMD_Tier::kNEON:		return "NEON";
+		case Eidos_SIMD_Tier::kScalar:		return "scalar";
+	}
+	return "scalar";
+}
+
+bool Eidos_SIMD_SLEEFActive(void)
+{
+	// SLEEF transcendentals are wired up only for the AVX2+FMA and NEON tiers.
+	return (sActiveTier == Eidos_SIMD_Tier::kAVX2_FMA) || (sActiveTier == Eidos_SIMD_Tier::kNEON);
+}
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index b379eb78..ca2abb10 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -19,13 +19,39 @@
 
 /*
 
- SIMD acceleration for Eidos math operations, independent of OpenMP.
-
- This header provides vectorized implementations of common math operations
- using platform-specific SIMD intrinsics when available:
-   - x86_64: SSE4.2 or AVX2 via <immintrin.h>
-   - ARM64: NEON via <arm_neon.h>
- Falls back to scalar code when no SIMD is available.
+ SIMD acceleration for Eidos math operations, with runtime CPU dispatch.
+
+ This header is the public interface to a set of vectorized array-math kernels
+ (sqrt, exp, the spatial-interaction kernels, etc.).  Each kernel is exposed as
+ a function pointer in namespace Eidos_SIMD; callers simply write, e.g.,
+ Eidos_SIMD::sqrt_float64(in, out, n) exactly as if it were a function.
+
+ How the dispatch works, and why
+ -------------------------------
+ The kernels are compiled once per instruction-set "tier":
+
+   - scalar       portable C++ that runs on any CPU
+   - SSE4.2       x86_64-v2, present on essentially every x86_64 CPU (~2009+)
+   - AVX2 + FMA   x86_64-v3, Haswell / Excavator and later (~2013+)
+   - NEON         baseline on every ARM64 CPU
+
+ Each tier lives in its own translation unit (eidos_simd_scalar.cpp,
+ eidos_simd_sse42.cpp, eidos_simd_avx2.cpp, eidos_simd_neon.cpp); only those
+ translation units are built with instruction-set-specific compiler flags.
+ Every other translation unit in SLiM, and the dispatcher itself, is compiled
+ at the plain x86_64 (SSE2) baseline.  At startup Eidos_SIMD_Init() probes the
+ CPU with __builtin_cpu_supports() and points the Eidos_SIMD function pointers
+ at the fastest tier the hardware actually supports.
+
+ The point of this design is that a single binary is correct everywhere: the
+ bulk of the executable contains only baseline instructions, so it cannot fault
+ on an old CPU, while the AVX2/SSE4.2 code is reached only after the CPU has
+ been confirmed to support it.  Previously SLiM was built with -mavx2 applied
+ globally, which let the compiler emit AVX2 instructions throughout the whole
+ binary and caused SIGILL crashes on pre-AVX2 hardware (issue #628).
+
+ Building with -D USE_SIMD=OFF defines EIDOS_SUPPRESS_SIMD_DISPATCH, which
+ forces the scalar tier and omits all instruction-set-specific code.
 
  */
 
@@ -33,1190 +59,105 @@
 #define eidos_simd_h
 
 #include <cstdint>
-#include <cmath>
-
-// Determine SIMD capability level
-#if defined(EIDOS_HAS_AVX2)
-    #include <immintrin.h>
-    #define EIDOS_SIMD_WIDTH 4          // 4 doubles per AVX register
-    #define EIDOS_SIMD_FLOAT_WIDTH 8    // 8 floats per AVX register
-#elif defined(EIDOS_HAS_SSE42)
-    #include <emmintrin.h>
-    #include <smmintrin.h>
-    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per SSE register
-    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per SSE register
-#elif defined(EIDOS_HAS_NEON)
-    #include <arm_neon.h>
-    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per NEON register
-    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per NEON register
-#else
-    #define EIDOS_SIMD_WIDTH 1          // Scalar fallback
-    #define EIDOS_SIMD_FLOAT_WIDTH 1
-#endif
-
-// Include SLEEF for vectorized transcendental functions (exp, log, log10, log2)
-// SLEEF is only used when AVX2+FMA or NEON is available
-// BCH 12/31/2025: SLEEF generates tons of shadowed variable warnings for some reason; disable them
-#if defined(EIDOS_HAS_AVX2) || defined(EIDOS_HAS_NEON)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshadow"
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wimplicit-float-conversion"
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wshadow"
-#pragma clang diagnostic ignored "-Wdouble-promotion"
-#pragma clang diagnostic ignored "-Wunused-parameter"
-#pragma clang diagnostic ignored "-Wunused-function"
-#pragma clang diagnostic ignored "-Wimplicit-float-conversion"
-#include "sleef/sleef_config.h"
-#pragma clang diagnostic pop
-#pragma GCC diagnostic pop
-#endif
-
-// Disable certain warnings for the remainder of this file
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations"
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Waggressive-loop-optimizations"
-
-
-// ================================
-// SIMD Vector Math Operations
-// ================================
-// These functions apply an operation to arrays of doubles.
-// They handle the loop, SIMD processing, and scalar remainder.
 
+// EIDOS_SIMD_DISPATCH_X86 / _ARM are defined when this build includes a
+// hardware-specific tier beyond scalar.  Runtime dispatch needs both a
+// supported compiler (for __builtin_cpu_supports and target attributes /
+// per-file ISA flags) and a known architecture.  When USE_SIMD=OFF, neither
+// is defined and only the scalar tier exists.
+#if !defined(EIDOS_SUPPRESS_SIMD_DISPATCH) && (defined(__GNUC__) || defined(__clang__))
+	#if defined(__x86_64__) || defined(__i386__)
+		#define EIDOS_SIMD_DISPATCH_X86 1
+	#elif defined(__aarch64__) || defined(__arm64__)
+		#define EIDOS_SIMD_DISPATCH_ARM 1
+	#endif
+#endif
+
+// The complete set of dispatched SIMD kernels, as X-macro entries of the form
+// X(return_type, name, (parameter_types)).  This single list drives the
+// function-pointer declarations, their definitions, and the per-tier dispatch
+// tables, so a kernel is added or removed in exactly one place.
+#define EIDOS_SIMD_FUNCTION_TABLE \
+	X(void,   sqrt_float64,                  (const double *, double *, int64_t)) \
+	X(void,   abs_float64,                   (const double *, double *, int64_t)) \
+	X(void,   floor_float64,                 (const double *, double *, int64_t)) \
+	X(void,   ceil_float64,                  (const double *, double *, int64_t)) \
+	X(void,   trunc_float64,                 (const double *, double *, int64_t)) \
+	X(void,   round_float64,                 (const double *, double *, int64_t)) \
+	X(void,   exp_float64,                   (const double *, double *, int64_t)) \
+	X(void,   log_float64,                   (const double *, double *, int64_t)) \
+	X(void,   log10_float64,                 (const double *, double *, int64_t)) \
+	X(void,   log2_float64,                  (const double *, double *, int64_t)) \
+	X(void,   sin_float64,                   (const double *, double *, int64_t)) \
+	X(void,   cos_float64,                   (const double *, double *, int64_t)) \
+	X(void,   tan_float64,                   (const double *, double *, int64_t)) \
+	X(void,   asin_float64,                  (const double *, double *, int64_t)) \
+	X(void,   acos_float64,                  (const double *, double *, int64_t)) \
+	X(void,   atan_float64,                  (const double *, double *, int64_t)) \
+	X(void,   atan2_float64,                 (const double *, const double *, double *, int64_t)) \
+	X(void,   pow_float64,                   (const double *, const double *, double *, int64_t)) \
+	X(void,   pow_float64_scalar_exp,        (const double *, double, double *, int64_t)) \
+	X(void,   pow_float64_scalar_base,       (double, const double *, double *, int64_t)) \
+	X(double, sum_float64,                   (const double *, int64_t)) \
+	X(double, product_float64,               (const double *, int64_t)) \
+	X(void,   exp_float32,                   (const float *, float *, int64_t)) \
+	X(void,   exp_kernel_float32,            (float *, int64_t, float, float)) \
+	X(void,   normal_kernel_float32,         (float *, int64_t, float, float)) \
+	X(void,   tdist_kernel_float32,          (float *, int64_t, float, float, float)) \
+	X(void,   cauchy_kernel_float32,         (float *, int64_t, float, float)) \
+	X(void,   linear_kernel_float32,         (float *, int64_t, float, float)) \
+	X(void,   convolve_dot_product_float64,        (const double *, const double *, int64_t, double &, double &)) \
+	X(void,   convolve_dot_product_scaled_float64, (const double *, const double *, int64_t, double, double &, double &))
+
+// The public kernels: function pointers set by Eidos_SIMD_Init() to point at
+// the tier selected for the running CPU.  Call them as Eidos_SIMD::name(...).
 namespace Eidos_SIMD {
-
-// ---------------------
-// Square Root: sqrt(x)
-// ---------------------
-inline void sqrt_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    // Process 4 doubles at a time
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        __m256d r = _mm256_sqrt_pd(v);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_SSE42)
-    // Process 2 doubles at a time
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        __m128d r = _mm_sqrt_pd(v);
-        _mm_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    // Process 2 doubles at a time
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        float64x2_t r = vsqrtq_f64(v);
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::sqrt(input[i]);
-}
-
-// ---------------------
-// Absolute Value: abs(x)
-// ---------------------
-inline void abs_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    // Create sign mask (all bits except sign bit)
-    __m256d sign_mask = _mm256_set1_pd(-0.0);
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        __m256d r = _mm256_andnot_pd(sign_mask, v);  // Clear sign bit
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_SSE42)
-    __m128d sign_mask = _mm_set1_pd(-0.0);
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        __m128d r = _mm_andnot_pd(sign_mask, v);
-        _mm_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        float64x2_t r = vabsq_f64(v);
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    for (; i < count; i++)
-        output[i] = std::fabs(input[i]);
-}
-
-// ---------------------
-// Floor: floor(x)
-// ---------------------
-inline void floor_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        __m256d r = _mm256_floor_pd(v);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_SSE42)
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        __m128d r = _mm_floor_pd(v);
-        _mm_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        float64x2_t r = vrndmq_f64(v);  // Round toward minus infinity (floor)
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    for (; i < count; i++)
-        output[i] = std::floor(input[i]);
-}
-
-// ---------------------
-// Ceil: ceil(x)
-// ---------------------
-inline void ceil_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        __m256d r = _mm256_ceil_pd(v);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_SSE42)
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        __m128d r = _mm_ceil_pd(v);
-        _mm_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        float64x2_t r = vrndpq_f64(v);  // Round toward plus infinity (ceil)
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    for (; i < count; i++)
-        output[i] = std::ceil(input[i]);
-}
-
-// ---------------------
-// Truncate: trunc(x)
-// ---------------------
-inline void trunc_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_SSE42)
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-        _mm_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        float64x2_t r = vrndq_f64(v);  // Round toward zero (truncate)
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    for (; i < count; i++)
-        output[i] = std::trunc(input[i]);
-}
-
-// ---------------------
-// Round: round(x)
-// ---------------------
-inline void round_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_SSE42)
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-        _mm_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        float64x2_t r = vrndaq_f64(v);  // Round to nearest, ties away from zero
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    for (; i < count; i++)
-        output[i] = std::round(input[i]);
-}
-
-// ---------------------
-// Exponential: exp(x)
-// ---------------------
-inline void exp_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_EXP_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::exp(input[i]);
-}
-
-// ---------------------
-// Natural Log: log(x)
-// ---------------------
-inline void log_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::log(input[i]);
-}
-
-// ---------------------
-// Log base 10: log10(x)
-// ---------------------
-inline void log10_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG10_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::log10(input[i]);
-}
-
-// ---------------------
-// Log base 2: log2(x)
-// ---------------------
-inline void log2_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG2_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::log2(input[i]);
-}
-
-// ---------------------
-// Sine: sin(x)
-// ---------------------
-inline void sin_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_SIN_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::sin(input[i]);
-}
-
-// ---------------------
-// Cosine: cos(x)
-// ---------------------
-inline void cos_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_COS_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::cos(input[i]);
-}
-
-// ---------------------
-// Tangent: tan(x)
-// ---------------------
-inline void tan_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_TAN_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::tan(input[i]);
-}
-
-// ---------------------
-// Arc Sine: asin(x)
-// ---------------------
-inline void asin_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ASIN_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::asin(input[i]);
-}
-
-// ---------------------
-// Arc Cosine: acos(x)
-// ---------------------
-inline void acos_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ACOS_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::acos(input[i]);
-}
-
-// ---------------------
-// Arc Tangent: atan(x)
-// ---------------------
-inline void atan_float64(const double *input, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN_D(v);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::atan(input[i]);
-}
-
-// ---------------------
-// Arc Tangent 2: atan2(y, x)
-// ---------------------
-inline void atan2_float64(const double *y, const double *x, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D vy = EIDOS_SLEEF_LOAD_D(&y[i]);
-        EIDOS_SLEEF_TYPE_D vx = EIDOS_SLEEF_LOAD_D(&x[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN2_D(vy, vx);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::atan2(y[i], x[i]);
-}
-
-// ---------------------
-// Power: pow(x, y) = x^y
-// ---------------------
-inline void pow_float64(const double *base, const double *exp, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
-    {
-        EIDOS_SLEEF_TYPE_D vb = EIDOS_SLEEF_LOAD_D(&base[i]);
-        EIDOS_SLEEF_TYPE_D ve = EIDOS_SLEEF_LOAD_D(&exp[i]);
-        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_POW_D(vb, ve);
-        EIDOS_SLEEF_STORE_D(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::pow(base[i], exp[i]);
-}
-
-// Broadcast version: all elements raised to same power (base_array ^ scalar_exp)
-inline void pow_float64_scalar_exp(const double *base, double exp_scalar, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
-    __m256d ve_broadcast = _mm256_set1_pd(exp_scalar);
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d vb = _mm256_loadu_pd(&base[i]);
-        __m256d r = Sleef_powd4_u10avx2(vb, ve_broadcast);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    float64x2_t ve_broadcast = vdupq_n_f64(exp_scalar);
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t vb = vld1q_f64(&base[i]);
-        float64x2_t r = Sleef_powd2_u10advsimd(vb, ve_broadcast);
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::pow(base[i], exp_scalar);
-}
-
-// Broadcast version: scalar base raised to array of powers (scalar_base ^ exp_array)
-inline void pow_float64_scalar_base(double base_scalar, const double *exp, double *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
-    __m256d vb_broadcast = _mm256_set1_pd(base_scalar);
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d ve = _mm256_loadu_pd(&exp[i]);
-        __m256d r = Sleef_powd4_u10avx2(vb_broadcast, ve);
-        _mm256_storeu_pd(&output[i], r);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    float64x2_t vb_broadcast = vdupq_n_f64(base_scalar);
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t ve = vld1q_f64(&exp[i]);
-        float64x2_t r = Sleef_powd2_u10advsimd(vb_broadcast, ve);
-        vst1q_f64(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::pow(base_scalar, exp[i]);
-}
-
-// ================================
-// Reductions
-// ================================
-
-// ---------------------
-// Sum: sum(x)
-// ---------------------
-inline double sum_float64(const double *input, int64_t count)
-{
-    double sum = 0.0;
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    __m256d vsum = _mm256_setzero_pd();
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        vsum = _mm256_add_pd(vsum, v);
-    }
-    // Horizontal sum of 4 doubles
-    __m128d vlow  = _mm256_castpd256_pd128(vsum);
-    __m128d vhigh = _mm256_extractf128_pd(vsum, 1);
-    vlow  = _mm_add_pd(vlow, vhigh);     // 2 doubles
-    __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1);
-    vlow = _mm_add_sd(vlow, shuf);       // 1 double
-    sum = _mm_cvtsd_f64(vlow);
-#elif defined(EIDOS_HAS_SSE42)
-    __m128d vsum = _mm_setzero_pd();
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        vsum = _mm_add_pd(vsum, v);
-    }
-    // Horizontal sum of 2 doubles
-    __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1);
-    vsum = _mm_add_sd(vsum, shuf);
-    sum = _mm_cvtsd_f64(vsum);
-#elif defined(EIDOS_HAS_NEON)
-    float64x2_t vsum = vdupq_n_f64(0.0);
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        vsum = vaddq_f64(vsum, v);
-    }
-    // Horizontal sum of 2 doubles
-    sum = vaddvq_f64(vsum);
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        sum += input[i];
-
-    return sum;
-}
-
-// ---------------------
-// Product: product(x)
-// ---------------------
-inline double product_float64(const double *input, int64_t count)
-{
-    double prod = 1.0;
-    int64_t i = 0;
-
-#if defined(EIDOS_HAS_AVX2)
-    __m256d vprod = _mm256_set1_pd(1.0);
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v = _mm256_loadu_pd(&input[i]);
-        vprod = _mm256_mul_pd(vprod, v);
-    }
-    // Horizontal product of 4 doubles
-    __m128d vlow  = _mm256_castpd256_pd128(vprod);
-    __m128d vhigh = _mm256_extractf128_pd(vprod, 1);
-    vlow  = _mm_mul_pd(vlow, vhigh);
-    __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1);
-    vlow = _mm_mul_sd(vlow, shuf);
-    prod = _mm_cvtsd_f64(vlow);
-#elif defined(EIDOS_HAS_SSE42)
-    __m128d vprod = _mm_set1_pd(1.0);
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v = _mm_loadu_pd(&input[i]);
-        vprod = _mm_mul_pd(vprod, v);
-    }
-    __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1);
-    vprod = _mm_mul_sd(vprod, shuf);
-    prod = _mm_cvtsd_f64(vprod);
-#elif defined(EIDOS_HAS_NEON)
-    float64x2_t vprod = vdupq_n_f64(1.0);
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v = vld1q_f64(&input[i]);
-        vprod = vmulq_f64(vprod, v);
-    }
-    // Horizontal product of 2 doubles
-    prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
-#endif
-
-    for (; i < count; i++)
-        prod *= input[i];
-
-    return prod;
-}
-
-// ================================
-// Float (Single-Precision) SIMD Operations
-// ================================
-// These functions operate on arrays of floats, used by spatial interaction kernels.
-
-// ---------------------
-// Exponential: exp(x) for floats
-// ---------------------
-inline void exp_float32(const float *input, float *output, int64_t count)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_FLOAT_AVAILABLE
-    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
-    {
-        EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]);
-        EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v);
-        EIDOS_SLEEF_STORE_F(&output[i], r);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        output[i] = std::exp(input[i]);
-}
-
-// ---------------------
-// Exponential Kernel: strength = fmax * exp(-lambda * distance)
-// ---------------------
-// Operates in-place on a distance array, transforming distances to strengths.
-// This is optimized for the spatial interaction kernel calculation.
-inline void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_FLOAT_AVAILABLE
-    // We need to compute: fmax * exp(-lambda * distance)
-    // First, compute -lambda * distance for all values, then exp, then multiply by fmax
-    EIDOS_SLEEF_TYPE_F v_fmax =
-#if defined(EIDOS_HAS_AVX2)
-        _mm256_set1_ps(fmax);
-    EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda);
-#elif defined(EIDOS_HAS_NEON)
-        vdupq_n_f32(fmax);
-    EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda);
-#endif
-
-    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
-    {
-        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
-
-        // Compute -lambda * distance
-#if defined(EIDOS_HAS_AVX2)
-        EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist);
-#elif defined(EIDOS_HAS_NEON)
-        EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist);
-#endif
-
-        // Compute exp(-lambda * distance)
-        EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg);
-
-        // Compute fmax * exp(...)
-#if defined(EIDOS_HAS_AVX2)
-        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp);
-#elif defined(EIDOS_HAS_NEON)
-        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp);
-#endif
-
-        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-        distances[i] = fmax * std::exp(-lambda * distances[i]);
-}
-
-// ---------------------
-// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2)
-// ---------------------
-// Operates in-place on a distance array, transforming distances to strengths.
-// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency.
-inline void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq)
-{
-    int64_t i = 0;
-
-#if EIDOS_SLEEF_FLOAT_AVAILABLE
-    // We need to compute: fmax * exp(-distance^2 / two_sigma_sq)
-    EIDOS_SLEEF_TYPE_F v_fmax =
-#if defined(EIDOS_HAS_AVX2)
-        _mm256_set1_ps(fmax);
-    EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq);
-#elif defined(EIDOS_HAS_NEON)
-        vdupq_n_f32(fmax);
-    EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq);
-#endif
-
-    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
-    {
-        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
-
-        // Compute distance^2
-#if defined(EIDOS_HAS_AVX2)
-        EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist);
-        // Compute -distance^2 / 2sigma^2
-        EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq);
-#elif defined(EIDOS_HAS_NEON)
-        EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist);
-        // Compute -distance^2 / 2sigma^2
-        EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq);
-#endif
-
-        // Compute exp(-distance^2 / 2sigma^2)
-        EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg);
-
-        // Compute fmax * exp(...)
-#if defined(EIDOS_HAS_AVX2)
-        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp);
-#elif defined(EIDOS_HAS_NEON)
-        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp);
-#endif
-
-        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-    {
-        float d = distances[i];
-        distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq);
-    }
-}
-
-// ---------------------
-// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2)
-// ---------------------
-// Operates in-place on a distance array, transforming distances to strengths.
-// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale
-inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau)
-{
-    int64_t i = 0;
-
-    // Pre-compute constants
-    float inv_tau = 1.0f / tau;
-    float inv_nu = 1.0f / nu;
-    float exponent = (nu + 1.0f) / 2.0f;
-
-#if EIDOS_SLEEF_FLOAT_AVAILABLE
-    EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one;
-#if defined(EIDOS_HAS_AVX2)
-    v_fmax = _mm256_set1_ps(fmax);
-    v_inv_tau = _mm256_set1_ps(inv_tau);
-    v_inv_nu = _mm256_set1_ps(inv_nu);
-    v_exponent = _mm256_set1_ps(-exponent);
-    v_one = _mm256_set1_ps(1.0f);
-#elif defined(EIDOS_HAS_NEON)
-    v_fmax = vdupq_n_f32(fmax);
-    v_inv_tau = vdupq_n_f32(inv_tau);
-    v_inv_nu = vdupq_n_f32(inv_nu);
-    v_exponent = vdupq_n_f32(-exponent);
-    v_one = vdupq_n_f32(1.0f);
-#endif
-
-    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
-    {
-        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
-
-        // Compute (d / tau)
-#if defined(EIDOS_HAS_AVX2)
-        EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau);
-        // Compute (d/tau)^2
-        EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau);
-        // Compute (d/tau)^2 / nu
-        EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu);
-        // Compute 1 + (d/tau)^2 / nu
-        EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term);
-#elif defined(EIDOS_HAS_NEON)
-        EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau);
-        EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau);
-        EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu);
-        EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term);
-#endif
-
-        // Compute pow(base, -exponent) = 1 / pow(base, exponent)
-        EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent);
-
-        // Compute fmax * pow(...)
-#if defined(EIDOS_HAS_AVX2)
-        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow);
-#elif defined(EIDOS_HAS_NEON)
-        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow);
-#endif
-
-        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-    {
-        float d = distances[i];
-        float d_over_tau = d * inv_tau;
-        distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent);
-    }
-}
-
-// ---------------------
-// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2)
-// ---------------------
-// Operates in-place on a distance array, transforming distances to strengths.
-// Parameters: fmax = maximum strength, lambda = scale parameter
-inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda)
-{
-    int64_t i = 0;
-    float inv_lambda = 1.0f / lambda;
-
-#if defined(EIDOS_HAS_AVX2)
-    __m256 v_fmax = _mm256_set1_ps(fmax);
-    __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda);
-    __m256 v_one = _mm256_set1_ps(1.0f);
-
-    for (; i + 8 <= count; i += 8)
-    {
-        __m256 v_dist = _mm256_loadu_ps(&distances[i]);
-        __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda);      // d/lambda
-        __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp);         // (d/lambda)^2
-        __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq);         // 1 + (d/lambda)^2
-        __m256 v_result = _mm256_div_ps(v_fmax, v_denom);         // fmax / denom
-        _mm256_storeu_ps(&distances[i], v_result);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    float32x4_t v_fmax = vdupq_n_f32(fmax);
-    float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda);
-    float32x4_t v_one = vdupq_n_f32(1.0f);
-
-    for (; i + 4 <= count; i += 4)
-    {
-        float32x4_t v_dist = vld1q_f32(&distances[i]);
-        float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda);
-        float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp);
-        float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq);
-        float32x4_t v_result = vdivq_f32(v_fmax, v_denom);
-        vst1q_f32(&distances[i], v_result);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-    {
-        float d = distances[i];
-        float temp = d * inv_lambda;
-        distances[i] = fmax / (1.0f + temp * temp);
-    }
-}
-
-// ---------------------
-// Linear Kernel: strength = fmax * (1 - d / max_distance)
-// ---------------------
-// Operates in-place on a distance array, transforming distances to strengths.
-// Parameters: fmax = maximum strength, max_distance = interaction max distance
-inline void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance)
-{
-    int64_t i = 0;
-    float fmax_over_maxdist = fmax / max_distance;
-
-#if defined(EIDOS_HAS_AVX2)
-    __m256 v_fmax = _mm256_set1_ps(fmax);
-    __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist);
-
-    for (; i + 8 <= count; i += 8)
-    {
-        __m256 v_dist = _mm256_loadu_ps(&distances[i]);
-        // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance)
-        __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist);
-        __m256 v_result = _mm256_sub_ps(v_fmax, v_term);
-        _mm256_storeu_ps(&distances[i], v_result);
-    }
-#elif defined(EIDOS_HAS_NEON)
-    float32x4_t v_fmax = vdupq_n_f32(fmax);
-    float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist);
-
-    for (; i + 4 <= count; i += 4)
-    {
-        float32x4_t v_dist = vld1q_f32(&distances[i]);
-        float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist);
-        float32x4_t v_result = vsubq_f32(v_fmax, v_term);
-        vst1q_f32(&distances[i], v_result);
-    }
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-    {
-        distances[i] = fmax - distances[i] * fmax_over_maxdist;
-    }
-}
-
-// ================================
-// Convolution Helpers for SpatialMap::smooth()
-// ================================
-// These functions compute vectorized dot products for convolution operations.
-// They accumulate both kernel_sum and conv_sum (weighted sum) in a single pass.
-
-// ---------------------
-// Convolution dot product: accumulates kernel_sum and kernel*values sum
-// ---------------------
-// Computes: kernel_sum += sum(kernel), conv_sum += sum(kernel * values)
-// Used by SpatialMap convolution when all values are valid (no edge handling needed)
-inline void convolve_dot_product_float64(
-    const double *kernel_ptr, const double *pixel_ptr,
-    int64_t count, double &kernel_sum, double &conv_sum)
-{
-    int64_t i = 0;
-    double local_ksum = 0.0;
-    double local_csum = 0.0;
-
-#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
-    __m256d v_ksum = _mm256_setzero_pd();
-    __m256d v_csum = _mm256_setzero_pd();
-
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]);
-        __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]);
-
-        v_ksum = _mm256_add_pd(v_ksum, v_kernel);
-        v_csum = _mm256_fmadd_pd(v_kernel, v_pixel, v_csum);
-    }
-
-    // Horizontal sum
-    __m128d ksum_low = _mm256_castpd256_pd128(v_ksum);
-    __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1);
-    ksum_low = _mm_add_pd(ksum_low, ksum_high);
-    __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1);
-    ksum_low = _mm_add_sd(ksum_low, ksum_shuf);
-    local_ksum = _mm_cvtsd_f64(ksum_low);
-
-    __m128d csum_low = _mm256_castpd256_pd128(v_csum);
-    __m128d csum_high = _mm256_extractf128_pd(v_csum, 1);
-    csum_low = _mm_add_pd(csum_low, csum_high);
-    __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1);
-    csum_low = _mm_add_sd(csum_low, csum_shuf);
-    local_csum = _mm_cvtsd_f64(csum_low);
-
-#elif defined(EIDOS_HAS_SSE42)
-    __m128d v_ksum = _mm_setzero_pd();
-    __m128d v_csum = _mm_setzero_pd();
-
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]);
-        __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]);
-
-        v_ksum = _mm_add_pd(v_ksum, v_kernel);
-        __m128d v_prod = _mm_mul_pd(v_kernel, v_pixel);
-        v_csum = _mm_add_pd(v_csum, v_prod);
-    }
-
-    // Horizontal sum
-    __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1);
-    v_ksum = _mm_add_sd(v_ksum, ksum_shuf);
-    local_ksum = _mm_cvtsd_f64(v_ksum);
-
-    __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1);
-    v_csum = _mm_add_sd(v_csum, csum_shuf);
-    local_csum = _mm_cvtsd_f64(v_csum);
-
-#elif defined(EIDOS_HAS_NEON)
-    float64x2_t v_ksum = vdupq_n_f64(0.0);
-    float64x2_t v_csum = vdupq_n_f64(0.0);
-
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]);
-        float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]);
-
-        v_ksum = vaddq_f64(v_ksum, v_kernel);
-        v_csum = vfmaq_f64(v_csum, v_kernel, v_pixel);
-    }
-
-    local_ksum = vaddvq_f64(v_ksum);
-    local_csum = vaddvq_f64(v_csum);
-#endif
-
-    // Scalar remainder
-    for (; i < count; i++)
-    {
-        local_ksum += kernel_ptr[i];
-        local_csum += kernel_ptr[i] * pixel_ptr[i];
-    }
-
-    kernel_sum += local_ksum;
-    conv_sum += local_csum;
-}
-
-// ---------------------
-// Scaled convolution dot product for edge handling
-// ---------------------
-// Like convolve_dot_product_float64 but scales kernel values by coverage factor.
-// Used when handling edges where some kernel positions are out of bounds.
-inline void convolve_dot_product_scaled_float64(
-    const double *kernel_ptr, const double *pixel_ptr,
-    int64_t count, double coverage,
-    double &kernel_sum, double &conv_sum)
-{
-    int64_t i = 0;
-    double local_ksum = 0.0;
-    double local_csum = 0.0;
-
-#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
-    __m256d v_coverage = _mm256_set1_pd(coverage);
-    __m256d v_ksum = _mm256_setzero_pd();
-    __m256d v_csum = _mm256_setzero_pd();
-
-    for (; i + 4 <= count; i += 4)
-    {
-        __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]);
-        __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]);
-        __m256d v_scaled_kernel = _mm256_mul_pd(v_kernel, v_coverage);
-
-        v_ksum = _mm256_add_pd(v_ksum, v_scaled_kernel);
-        v_csum = _mm256_fmadd_pd(v_scaled_kernel, v_pixel, v_csum);
-    }
-
-    // Horizontal sum
-    __m128d ksum_low = _mm256_castpd256_pd128(v_ksum);
-    __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1);
-    ksum_low = _mm_add_pd(ksum_low, ksum_high);
-    __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1);
-    ksum_low = _mm_add_sd(ksum_low, ksum_shuf);
-    local_ksum = _mm_cvtsd_f64(ksum_low);
-
-    __m128d csum_low = _mm256_castpd256_pd128(v_csum);
-    __m128d csum_high = _mm256_extractf128_pd(v_csum, 1);
-    csum_low = _mm_add_pd(csum_low, csum_high);
-    __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1);
-    csum_low = _mm_add_sd(csum_low, csum_shuf);
-    local_csum = _mm_cvtsd_f64(csum_low);
-
-#elif defined(EIDOS_HAS_SSE42)
-    __m128d v_coverage = _mm_set1_pd(coverage);
-    __m128d v_ksum = _mm_setzero_pd();
-    __m128d v_csum = _mm_setzero_pd();
-
-    for (; i + 2 <= count; i += 2)
-    {
-        __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]);
-        __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]);
-        __m128d v_scaled_kernel = _mm_mul_pd(v_kernel, v_coverage);
-
-        v_ksum = _mm_add_pd(v_ksum, v_scaled_kernel);
-        __m128d v_prod = _mm_mul_pd(v_scaled_kernel, v_pixel);
-        v_csum = _mm_add_pd(v_csum, v_prod);
-    }
-
-    // Horizontal sum
-    __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1);
-    v_ksum = _mm_add_sd(v_ksum, ksum_shuf);
-    local_ksum = _mm_cvtsd_f64(v_ksum);
-
-    __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1);
-    v_csum = _mm_add_sd(v_csum, csum_shuf);
-    local_csum = _mm_cvtsd_f64(v_csum);
-
-#elif defined(EIDOS_HAS_NEON)
-    float64x2_t v_coverage = vdupq_n_f64(coverage);
-    float64x2_t v_ksum = vdupq_n_f64(0.0);
-    float64x2_t v_csum = vdupq_n_f64(0.0);
-
-    for (; i + 2 <= count; i += 2)
-    {
-        float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]);
-        float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]);
-        float64x2_t v_scaled_kernel = vmulq_f64(v_kernel, v_coverage);
-
-        v_ksum = vaddq_f64(v_ksum, v_scaled_kernel);
-        v_csum = vfmaq_f64(v_csum, v_scaled_kernel, v_pixel);
-    }
-
-    local_ksum = vaddvq_f64(v_ksum);
-    local_csum = vaddvq_f64(v_csum);
-#endif
-
-    // Scalar remainder with scaling
-    for (; i < count; i++)
-    {
-        double scaled_k = kernel_ptr[i] * coverage;
-        local_ksum += scaled_k;
-        local_csum += scaled_k * pixel_ptr[i];
-    }
-
-    kernel_sum += local_ksum;
-    conv_sum += local_csum;
-}
-
-} // namespace Eidos_SIMD
-
-
-// stop suppressing warnings
-#pragma clang diagnostic pop
-#pragma GCC diagnostic pop
-
+#define X(ret, name, params) extern ret (*name) params;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
+
+// Declarations of the scalar-tier kernels.  These are the universal fallback;
+// the function pointers above are statically initialized to point here, so a
+// call is always valid even before Eidos_SIMD_Init() has run.  The scalar tier
+// is defined in eidos_simd_scalar.cpp and built on every platform.
+namespace Eidos_SIMD_scalar {
+#define X(ret, name, params) ret name params;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
+
+// Each tier's translation unit defines a fill function that points the
+// Eidos_SIMD pointers at that tier's kernels.  Eidos_SIMD_Init() calls the
+// appropriate one based on runtime CPU detection.
+void Eidos_SIMD_Fill_scalar(void);
+#if EIDOS_SIMD_DISPATCH_X86
+void Eidos_SIMD_Fill_sse42(void);
+void Eidos_SIMD_Fill_avx2(void);
+#endif
+#if EIDOS_SIMD_DISPATCH_ARM
+void Eidos_SIMD_Fill_neon(void);
+#endif
+
+// Detect the CPU and select the fastest available SIMD tier.  This must be
+// called once, early in startup, before any Eidos_SIMD kernel is used;
+// Eidos_WarmUp() does this.  It is idempotent.
+void Eidos_SIMD_Init(void);
+
+// Force a specific SIMD tier by name ("scalar", "SSE4.2", "AVX2+FMA", "NEON")
+// instead of the auto-selected best tier.  Returns false, leaving the active
+// tier unchanged, if the named tier is unavailable on this CPU or in this
+// build.  Used by the SIMD self-tests to exercise every tier; calling
+// Eidos_SIMD_Init() afterward restores the best tier for the CPU.
+bool Eidos_SIMD_SelectTier(const char *tier_name);
+
+// The name of the SIMD tier selected at startup: "scalar", "SSE4.2",
+// "AVX2+FMA", or "NEON".  Valid only after Eidos_SIMD_Init() has run.
+const char *Eidos_SIMD_ActiveTierName(void);
+
+// True if the active tier uses SLEEF for vectorized transcendental functions
+// (exp, log, sin, ...).  SLEEF is wired up for the AVX2+FMA and NEON tiers;
+// the scalar and SSE4.2 tiers use std:: transcendentals.
+bool Eidos_SIMD_SLEEFActive(void);
 
 #endif /* eidos_simd_h */
diff --git a/eidos/eidos_simd_avx2.cpp b/eidos/eidos_simd_avx2.cpp
new file mode 100644
index 00000000..4084db5f
--- /dev/null
+++ b/eidos/eidos_simd_avx2.cpp
@@ -0,0 +1,50 @@
+//
+//  eidos_simd_avx2.cpp
+//  Eidos
+//
+//  Created by Andrew Kern on 5/21/2026.
+//  Copyright (c) 2024-2025 Benjamin C. Haller.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ The AVX2+FMA SIMD tier (x86_64 only).  This entire translation unit is built
+ with -mavx2 -mfma by the build system (see CMakeLists.txt); no other
+ translation unit gets those flags, so AVX2/FMA instructions (including those
+ inside the SLEEF transcendental headers pulled in here) are confined to this
+ file and reached only through the dispatch pointers after the CPU has been
+ confirmed to support AVX2 and FMA.
+
+ On non-x86 architectures, when USE_SIMD=OFF, and on MSVC, this file compiles
+ to nothing.
+
+ */
+
+#include "eidos_simd.h"
+
+#if EIDOS_SIMD_DISPATCH_X86
+
+#define EIDOS_HAS_AVX2 1
+#define EIDOS_HAS_FMA 1
+#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_avx2
+#include "eidos_simd_impl.h"
+
+void Eidos_SIMD_Fill_avx2(void)
+{
+#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_avx2::name;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
+
+#endif // EIDOS_SIMD_DISPATCH_X86
diff --git a/eidos/eidos_simd_impl.h b/eidos/eidos_simd_impl.h
new file mode 100644
index 00000000..11734a35
--- /dev/null
+++ b/eidos/eidos_simd_impl.h
@@ -0,0 +1,1227 @@
+//
+//  eidos_simd_impl.h
+//  Eidos
+//
+//  Created by Andrew Kern on 5/21/2026.
+//  Copyright (c) 2024-2025 Benjamin C. Haller.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ Tier-parameterized SIMD kernel implementations for Eidos math operations.
+
+ This file is the shared body of the SIMD runtime-dispatch system.  It is
+ included once per instruction-set tier by eidos_simd_scalar.cpp,
+ eidos_simd_sse42.cpp, eidos_simd_avx2.cpp, and eidos_simd_neon.cpp.  Each
+ includer first defines EIDOS_SIMD_IMPL_NAMESPACE and the EIDOS_HAS_* macro
+ for its tier, then includes this file; the preprocessor blocks below select
+ the matching intrinsics (or the scalar fallback).  See eidos_simd.h for the
+ dispatch design and the rationale (issue #628).
+
+ There is deliberately no include guard: this file is a template body meant
+ to be expanded into a different namespace in each translation unit that
+ includes it.  Each translation unit must include it exactly once.
+
+ */
+
+#ifndef EIDOS_SIMD_IMPL_NAMESPACE
+#error "Define EIDOS_SIMD_IMPL_NAMESPACE (and any EIDOS_HAS_* tier macro) before including eidos_simd_impl.h"
+#endif
+
+#include <cstdint>
+#include <cmath>
+#include <cstring>		// SLEEF's inline headers use memcpy() without including this themselves
+
+// Determine SIMD capability level
+#if defined(EIDOS_HAS_AVX2)
+    #include <immintrin.h>
+    #define EIDOS_SIMD_WIDTH 4          // 4 doubles per AVX register
+    #define EIDOS_SIMD_FLOAT_WIDTH 8    // 8 floats per AVX register
+#elif defined(EIDOS_HAS_SSE42)
+    #include <emmintrin.h>
+    #include <smmintrin.h>
+    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per SSE register
+    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per SSE register
+#elif defined(EIDOS_HAS_NEON)
+    #include <arm_neon.h>
+    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per NEON register
+    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per NEON register
+#else
+    #define EIDOS_SIMD_WIDTH 1          // Scalar fallback
+    #define EIDOS_SIMD_FLOAT_WIDTH 1
+#endif
+
+// Include SLEEF for vectorized transcendental functions (exp, log, log10, log2)
+// SLEEF is only used when AVX2+FMA or NEON is available
+// BCH 12/31/2025: SLEEF generates tons of shadowed variable warnings for some reason; disable them
+#if defined(EIDOS_HAS_AVX2) || defined(EIDOS_HAS_NEON)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wshadow"
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wimplicit-float-conversion"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshadow"
+#pragma clang diagnostic ignored "-Wdouble-promotion"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wimplicit-float-conversion"
+#include "sleef/sleef_config.h"
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
+#endif
+
+// Disable certain warnings for the remainder of this file
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Waggressive-loop-optimizations"
+
+
+// ================================
+// SIMD Vector Math Operations
+// ================================
+// These functions apply an operation to arrays of doubles.
+// They handle the loop, SIMD processing, and scalar remainder.
+
+namespace EIDOS_SIMD_IMPL_NAMESPACE {
+
+// ---------------------
+// Square Root: sqrt(x)
+// ---------------------
+void sqrt_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    // Process 4 doubles at a time
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_sqrt_pd(v);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    // Process 2 doubles at a time
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_sqrt_pd(v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    // Process 2 doubles at a time
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vsqrtq_f64(v);
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::sqrt(input[i]);
+}
+
+// ---------------------
+// Absolute Value: abs(x)
+// ---------------------
+void abs_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    // Create sign mask (all bits except sign bit)
+    __m256d sign_mask = _mm256_set1_pd(-0.0);
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_andnot_pd(sign_mask, v);  // Clear sign bit
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d sign_mask = _mm_set1_pd(-0.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_andnot_pd(sign_mask, v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vabsq_f64(v);
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::fabs(input[i]);
+}
+
+// ---------------------
+// Floor: floor(x)
+// ---------------------
+void floor_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_floor_pd(v);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_floor_pd(v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndmq_f64(v);  // Round toward minus infinity (floor)
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::floor(input[i]);
+}
+
+// ---------------------
+// Ceil: ceil(x)
+// ---------------------
+void ceil_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_ceil_pd(v);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_ceil_pd(v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndpq_f64(v);  // Round toward plus infinity (ceil)
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::ceil(input[i]);
+}
+
+// ---------------------
+// Truncate: trunc(x)
+// ---------------------
+void trunc_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        _mm_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndq_f64(v);  // Round toward zero (truncate)
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::trunc(input[i]);
+}
+
+// ---------------------
+// Round: round(x)
+// ---------------------
+void round_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        _mm_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndaq_f64(v);  // Round to nearest, ties away from zero
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::round(input[i]);
+}
+
+// ---------------------
+// Exponential: exp(x)
+// ---------------------
+void exp_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_EXP_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::exp(input[i]);
+}
+
+// ---------------------
+// Natural Log: log(x)
+// ---------------------
+void log_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::log(input[i]);
+}
+
+// ---------------------
+// Log base 10: log10(x)
+// ---------------------
+void log10_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG10_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::log10(input[i]);
+}
+
+// ---------------------
+// Log base 2: log2(x)
+// ---------------------
+void log2_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG2_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::log2(input[i]);
+}
+
+// ---------------------
+// Sine: sin(x)
+// ---------------------
+void sin_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_SIN_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::sin(input[i]);
+}
+
+// ---------------------
+// Cosine: cos(x)
+// ---------------------
+void cos_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_COS_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::cos(input[i]);
+}
+
+// ---------------------
+// Tangent: tan(x)
+// ---------------------
+void tan_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_TAN_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::tan(input[i]);
+}
+
+// ---------------------
+// Arc Sine: asin(x)
+// ---------------------
+void asin_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ASIN_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::asin(input[i]);
+}
+
+// ---------------------
+// Arc Cosine: acos(x)
+// ---------------------
+void acos_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ACOS_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::acos(input[i]);
+}
+
+// ---------------------
+// Arc Tangent: atan(x)
+// ---------------------
+void atan_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN_D(v);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::atan(input[i]);
+}
+
+// ---------------------
+// Arc Tangent 2: atan2(y, x)
+// ---------------------
+void atan2_float64(const double *y, const double *x, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D vy = EIDOS_SLEEF_LOAD_D(&y[i]);
+        EIDOS_SLEEF_TYPE_D vx = EIDOS_SLEEF_LOAD_D(&x[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN2_D(vy, vx);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::atan2(y[i], x[i]);
+}
+
+// ---------------------
+// Power: pow(x, y) = x^y
+// ---------------------
+void pow_float64(const double *base, const double *exp, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE)
+    {
+        EIDOS_SLEEF_TYPE_D vb = EIDOS_SLEEF_LOAD_D(&base[i]);
+        EIDOS_SLEEF_TYPE_D ve = EIDOS_SLEEF_LOAD_D(&exp[i]);
+        EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_POW_D(vb, ve);
+        EIDOS_SLEEF_STORE_D(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::pow(base[i], exp[i]);
+}
+
+// Broadcast version: all elements raised to same power (base_array ^ scalar_exp)
+void pow_float64_scalar_exp(const double *base, double exp_scalar, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
+    __m256d ve_broadcast = _mm256_set1_pd(exp_scalar);
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d vb = _mm256_loadu_pd(&base[i]);
+        __m256d r = Sleef_powd4_u10avx2(vb, ve_broadcast);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t ve_broadcast = vdupq_n_f64(exp_scalar);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t vb = vld1q_f64(&base[i]);
+        float64x2_t r = Sleef_powd2_u10advsimd(vb, ve_broadcast);
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::pow(base[i], exp_scalar);
+}
+
+// Broadcast version: scalar base raised to array of powers (scalar_base ^ exp_array)
+void pow_float64_scalar_base(double base_scalar, const double *exp, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
+    __m256d vb_broadcast = _mm256_set1_pd(base_scalar);
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d ve = _mm256_loadu_pd(&exp[i]);
+        __m256d r = Sleef_powd4_u10avx2(vb_broadcast, ve);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vb_broadcast = vdupq_n_f64(base_scalar);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t ve = vld1q_f64(&exp[i]);
+        float64x2_t r = Sleef_powd2_u10advsimd(vb_broadcast, ve);
+        vst1q_f64(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::pow(base_scalar, exp[i]);
+}
+
+// ================================
+// Reductions
+// ================================
+
+// ---------------------
+// Sum: sum(x)
+// ---------------------
+double sum_float64(const double *input, int64_t count)
+{
+    double sum = 0.0;
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256d vsum = _mm256_setzero_pd();
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        vsum = _mm256_add_pd(vsum, v);
+    }
+    // Horizontal sum of 4 doubles
+    __m128d vlow  = _mm256_castpd256_pd128(vsum);
+    __m128d vhigh = _mm256_extractf128_pd(vsum, 1);
+    vlow  = _mm_add_pd(vlow, vhigh);     // 2 doubles
+    __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1);
+    vlow = _mm_add_sd(vlow, shuf);       // 1 double
+    sum = _mm_cvtsd_f64(vlow);
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d vsum = _mm_setzero_pd();
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        vsum = _mm_add_pd(vsum, v);
+    }
+    // Horizontal sum of 2 doubles
+    __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1);
+    vsum = _mm_add_sd(vsum, shuf);
+    sum = _mm_cvtsd_f64(vsum);
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vsum = vdupq_n_f64(0.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        vsum = vaddq_f64(vsum, v);
+    }
+    // Horizontal sum of 2 doubles
+    sum = vaddvq_f64(vsum);
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        sum += input[i];
+
+    return sum;
+}
+
+// ---------------------
+// Product: product(x)
+// ---------------------
+double product_float64(const double *input, int64_t count)
+{
+    double prod = 1.0;
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256d vprod = _mm256_set1_pd(1.0);
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        vprod = _mm256_mul_pd(vprod, v);
+    }
+    // Horizontal product of 4 doubles
+    __m128d vlow  = _mm256_castpd256_pd128(vprod);
+    __m128d vhigh = _mm256_extractf128_pd(vprod, 1);
+    vlow  = _mm_mul_pd(vlow, vhigh);
+    __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1);
+    vlow = _mm_mul_sd(vlow, shuf);
+    prod = _mm_cvtsd_f64(vlow);
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d vprod = _mm_set1_pd(1.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        vprod = _mm_mul_pd(vprod, v);
+    }
+    __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1);
+    vprod = _mm_mul_sd(vprod, shuf);
+    prod = _mm_cvtsd_f64(vprod);
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vprod = vdupq_n_f64(1.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        vprod = vmulq_f64(vprod, v);
+    }
+    // Horizontal product of 2 doubles
+    prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
+#endif
+
+    for (; i < count; i++)
+        prod *= input[i];
+
+    return prod;
+}
+
+// ================================
+// Float (Single-Precision) SIMD Operations
+// ================================
+// These functions operate on arrays of floats, used by spatial interaction kernels.
+
+// ---------------------
+// Exponential: exp(x) for floats
+// ---------------------
+void exp_float32(const float *input, float *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]);
+        EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v);
+        EIDOS_SLEEF_STORE_F(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::exp(input[i]);
+}
+
+// ---------------------
+// Exponential Kernel: strength = fmax * exp(-lambda * distance)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// This is optimized for the spatial interaction kernel calculation.
+void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    // We need to compute: fmax * exp(-lambda * distance)
+    // First, compute -lambda * distance for all values, then exp, then multiply by fmax
+    EIDOS_SLEEF_TYPE_F v_fmax =
+#if defined(EIDOS_HAS_AVX2)
+        _mm256_set1_ps(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda);
+#elif defined(EIDOS_HAS_NEON)
+        vdupq_n_f32(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda);
+#endif
+
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
+
+        // Compute -lambda * distance
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist);
+#endif
+
+        // Compute exp(-lambda * distance)
+        EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg);
+
+        // Compute fmax * exp(...)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp);
+#endif
+
+        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        distances[i] = fmax * std::exp(-lambda * distances[i]);
+}
+
+// ---------------------
+// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency.
+void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    // We need to compute: fmax * exp(-distance^2 / two_sigma_sq)
+    EIDOS_SLEEF_TYPE_F v_fmax =
+#if defined(EIDOS_HAS_AVX2)
+        _mm256_set1_ps(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq);
+#elif defined(EIDOS_HAS_NEON)
+        vdupq_n_f32(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq);
+#endif
+
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
+
+        // Compute distance^2
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist);
+        // Compute -distance^2 / 2sigma^2
+        EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist);
+        // Compute -distance^2 / 2sigma^2
+        EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq);
+#endif
+
+        // Compute exp(-distance^2 / 2sigma^2)
+        EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg);
+
+        // Compute fmax * exp(...)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp);
+#endif
+
+        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        float d = distances[i];
+        distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq);
+    }
+}
+
+// ---------------------
+// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale
+void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau)
+{
+    int64_t i = 0;
+
+    // Pre-compute constants
+    float inv_tau = 1.0f / tau;
+    float inv_nu = 1.0f / nu;
+    float exponent = (nu + 1.0f) / 2.0f;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one;
+#if defined(EIDOS_HAS_AVX2)
+    v_fmax = _mm256_set1_ps(fmax);
+    v_inv_tau = _mm256_set1_ps(inv_tau);
+    v_inv_nu = _mm256_set1_ps(inv_nu);
+    v_exponent = _mm256_set1_ps(-exponent);
+    v_one = _mm256_set1_ps(1.0f);
+#elif defined(EIDOS_HAS_NEON)
+    v_fmax = vdupq_n_f32(fmax);
+    v_inv_tau = vdupq_n_f32(inv_tau);
+    v_inv_nu = vdupq_n_f32(inv_nu);
+    v_exponent = vdupq_n_f32(-exponent);
+    v_one = vdupq_n_f32(1.0f);
+#endif
+
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
+
+        // Compute (d / tau)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau);
+        // Compute (d/tau)^2
+        EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau);
+        // Compute (d/tau)^2 / nu
+        EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu);
+        // Compute 1 + (d/tau)^2 / nu
+        EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau);
+        EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau);
+        EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu);
+        EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term);
+#endif
+
+        // Compute pow(base, -exponent) = 1 / pow(base, exponent)
+        EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent);
+
+        // Compute fmax * pow(...)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow);
+#endif
+
+        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        float d = distances[i];
+        float d_over_tau = d * inv_tau;
+        distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent);
+    }
+}
+
+// ---------------------
+// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// Parameters: fmax = maximum strength, lambda = scale parameter
+void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda)
+{
+    int64_t i = 0;
+    float inv_lambda = 1.0f / lambda;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256 v_fmax = _mm256_set1_ps(fmax);
+    __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda);
+    __m256 v_one = _mm256_set1_ps(1.0f);
+
+    for (; i + 8 <= count; i += 8)
+    {
+        __m256 v_dist = _mm256_loadu_ps(&distances[i]);
+        __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda);      // d/lambda
+        __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp);         // (d/lambda)^2
+        __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq);         // 1 + (d/lambda)^2
+        __m256 v_result = _mm256_div_ps(v_fmax, v_denom);         // fmax / denom
+        _mm256_storeu_ps(&distances[i], v_result);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    float32x4_t v_fmax = vdupq_n_f32(fmax);
+    float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda);
+    float32x4_t v_one = vdupq_n_f32(1.0f);
+
+    for (; i + 4 <= count; i += 4)
+    {
+        float32x4_t v_dist = vld1q_f32(&distances[i]);
+        float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda);
+        float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp);
+        float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq);
+        float32x4_t v_result = vdivq_f32(v_fmax, v_denom);
+        vst1q_f32(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        float d = distances[i];
+        float temp = d * inv_lambda;
+        distances[i] = fmax / (1.0f + temp * temp);
+    }
+}
+
+// ---------------------
+// Linear Kernel: strength = fmax * (1 - d / max_distance)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// Parameters: fmax = maximum strength, max_distance = interaction max distance
+void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance)
+{
+    int64_t i = 0;
+    float fmax_over_maxdist = fmax / max_distance;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256 v_fmax = _mm256_set1_ps(fmax);
+    __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist);
+
+    for (; i + 8 <= count; i += 8)
+    {
+        __m256 v_dist = _mm256_loadu_ps(&distances[i]);
+        // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance)
+        __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist);
+        __m256 v_result = _mm256_sub_ps(v_fmax, v_term);
+        _mm256_storeu_ps(&distances[i], v_result);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    float32x4_t v_fmax = vdupq_n_f32(fmax);
+    float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist);
+
+    for (; i + 4 <= count; i += 4)
+    {
+        float32x4_t v_dist = vld1q_f32(&distances[i]);
+        float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist);
+        float32x4_t v_result = vsubq_f32(v_fmax, v_term);
+        vst1q_f32(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        distances[i] = fmax - distances[i] * fmax_over_maxdist;
+    }
+}
+
+// ================================
+// Convolution Helpers for SpatialMap::smooth()
+// ================================
+// These functions compute vectorized dot products for convolution operations.
+// They accumulate both kernel_sum and conv_sum (weighted sum) in a single pass.
+
+// ---------------------
+// Convolution dot product: accumulates kernel_sum and kernel*values sum
+// ---------------------
+// Computes: kernel_sum += sum(kernel), conv_sum += sum(kernel * values)
+// Used by SpatialMap convolution when all values are valid (no edge handling needed)
+void convolve_dot_product_float64(
+    const double *kernel_ptr, const double *pixel_ptr,
+    int64_t count, double &kernel_sum, double &conv_sum)
+{
+    int64_t i = 0;
+    double local_ksum = 0.0;
+    double local_csum = 0.0;
+
+#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
+    __m256d v_ksum = _mm256_setzero_pd();
+    __m256d v_csum = _mm256_setzero_pd();
+
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]);
+        __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]);
+
+        v_ksum = _mm256_add_pd(v_ksum, v_kernel);
+        v_csum = _mm256_fmadd_pd(v_kernel, v_pixel, v_csum);
+    }
+
+    // Horizontal sum
+    __m128d ksum_low = _mm256_castpd256_pd128(v_ksum);
+    __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1);
+    ksum_low = _mm_add_pd(ksum_low, ksum_high);
+    __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1);
+    ksum_low = _mm_add_sd(ksum_low, ksum_shuf);
+    local_ksum = _mm_cvtsd_f64(ksum_low);
+
+    __m128d csum_low = _mm256_castpd256_pd128(v_csum);
+    __m128d csum_high = _mm256_extractf128_pd(v_csum, 1);
+    csum_low = _mm_add_pd(csum_low, csum_high);
+    __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1);
+    csum_low = _mm_add_sd(csum_low, csum_shuf);
+    local_csum = _mm_cvtsd_f64(csum_low);
+
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d v_ksum = _mm_setzero_pd();
+    __m128d v_csum = _mm_setzero_pd();
+
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]);
+        __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]);
+
+        v_ksum = _mm_add_pd(v_ksum, v_kernel);
+        __m128d v_prod = _mm_mul_pd(v_kernel, v_pixel);
+        v_csum = _mm_add_pd(v_csum, v_prod);
+    }
+
+    // Horizontal sum
+    __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1);
+    v_ksum = _mm_add_sd(v_ksum, ksum_shuf);
+    local_ksum = _mm_cvtsd_f64(v_ksum);
+
+    __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1);
+    v_csum = _mm_add_sd(v_csum, csum_shuf);
+    local_csum = _mm_cvtsd_f64(v_csum);
+
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t v_ksum = vdupq_n_f64(0.0);
+    float64x2_t v_csum = vdupq_n_f64(0.0);
+
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]);
+        float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]);
+
+        v_ksum = vaddq_f64(v_ksum, v_kernel);
+        v_csum = vfmaq_f64(v_csum, v_kernel, v_pixel);
+    }
+
+    local_ksum = vaddvq_f64(v_ksum);
+    local_csum = vaddvq_f64(v_csum);
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        local_ksum += kernel_ptr[i];
+        local_csum += kernel_ptr[i] * pixel_ptr[i];
+    }
+
+    kernel_sum += local_ksum;
+    conv_sum += local_csum;
+}
+
+// ---------------------
+// Scaled convolution dot product for edge handling
+// ---------------------
+// Like convolve_dot_product_float64 but scales kernel values by coverage factor.
+// Used when handling edges where some kernel positions are out of bounds.
+void convolve_dot_product_scaled_float64(
+    const double *kernel_ptr, const double *pixel_ptr,
+    int64_t count, double coverage,
+    double &kernel_sum, double &conv_sum)
+{
+    int64_t i = 0;
+    double local_ksum = 0.0;
+    double local_csum = 0.0;
+
+#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA)
+    __m256d v_coverage = _mm256_set1_pd(coverage);
+    __m256d v_ksum = _mm256_setzero_pd();
+    __m256d v_csum = _mm256_setzero_pd();
+
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]);
+        __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]);
+        __m256d v_scaled_kernel = _mm256_mul_pd(v_kernel, v_coverage);
+
+        v_ksum = _mm256_add_pd(v_ksum, v_scaled_kernel);
+        v_csum = _mm256_fmadd_pd(v_scaled_kernel, v_pixel, v_csum);
+    }
+
+    // Horizontal sum
+    __m128d ksum_low = _mm256_castpd256_pd128(v_ksum);
+    __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1);
+    ksum_low = _mm_add_pd(ksum_low, ksum_high);
+    __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1);
+    ksum_low = _mm_add_sd(ksum_low, ksum_shuf);
+    local_ksum = _mm_cvtsd_f64(ksum_low);
+
+    __m128d csum_low = _mm256_castpd256_pd128(v_csum);
+    __m128d csum_high = _mm256_extractf128_pd(v_csum, 1);
+    csum_low = _mm_add_pd(csum_low, csum_high);
+    __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1);
+    csum_low = _mm_add_sd(csum_low, csum_shuf);
+    local_csum = _mm_cvtsd_f64(csum_low);
+
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d v_coverage = _mm_set1_pd(coverage);
+    __m128d v_ksum = _mm_setzero_pd();
+    __m128d v_csum = _mm_setzero_pd();
+
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]);
+        __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]);
+        __m128d v_scaled_kernel = _mm_mul_pd(v_kernel, v_coverage);
+
+        v_ksum = _mm_add_pd(v_ksum, v_scaled_kernel);
+        __m128d v_prod = _mm_mul_pd(v_scaled_kernel, v_pixel);
+        v_csum = _mm_add_pd(v_csum, v_prod);
+    }
+
+    // Horizontal sum
+    __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1);
+    v_ksum = _mm_add_sd(v_ksum, ksum_shuf);
+    local_ksum = _mm_cvtsd_f64(v_ksum);
+
+    __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1);
+    v_csum = _mm_add_sd(v_csum, csum_shuf);
+    local_csum = _mm_cvtsd_f64(v_csum);
+
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t v_coverage = vdupq_n_f64(coverage);
+    float64x2_t v_ksum = vdupq_n_f64(0.0);
+    float64x2_t v_csum = vdupq_n_f64(0.0);
+
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]);
+        float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]);
+        float64x2_t v_scaled_kernel = vmulq_f64(v_kernel, v_coverage);
+
+        v_ksum = vaddq_f64(v_ksum, v_scaled_kernel);
+        v_csum = vfmaq_f64(v_csum, v_scaled_kernel, v_pixel);
+    }
+
+    local_ksum = vaddvq_f64(v_ksum);
+    local_csum = vaddvq_f64(v_csum);
+#endif
+
+    // Scalar remainder with scaling
+    for (; i < count; i++)
+    {
+        double scaled_k = kernel_ptr[i] * coverage;
+        local_ksum += scaled_k;
+        local_csum += scaled_k * pixel_ptr[i];
+    }
+
+    kernel_sum += local_ksum;
+    conv_sum += local_csum;
+}
+
+} // namespace EIDOS_SIMD_IMPL_NAMESPACE
+
+
+// stop suppressing warnings
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
diff --git a/eidos/eidos_simd_neon.cpp b/eidos/eidos_simd_neon.cpp
new file mode 100644
index 00000000..297117b4
--- /dev/null
+++ b/eidos/eidos_simd_neon.cpp
@@ -0,0 +1,44 @@
+//
+//  eidos_simd_neon.cpp
+//  Eidos
+//
+//  Created by Andrew Kern on 5/21/2026.
+//  Copyright (c) 2024-2025 Benjamin C. Haller.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ The NEON SIMD tier (ARM64 only).  NEON is part of the baseline ARM64
+ architecture, so this translation unit needs no special compiler flags; it is
+ simply compiled normally on ARM64.  On x86 architectures, when USE_SIMD=OFF,
+ and on MSVC, this file compiles to nothing.
+
+ */
+
+#include "eidos_simd.h"
+
+#if EIDOS_SIMD_DISPATCH_ARM
+
+#define EIDOS_HAS_NEON 1
+#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_neon
+#include "eidos_simd_impl.h"
+
+void Eidos_SIMD_Fill_neon(void)
+{
+#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_neon::name;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
+
+#endif // EIDOS_SIMD_DISPATCH_ARM
diff --git a/eidos/eidos_simd_scalar.cpp b/eidos/eidos_simd_scalar.cpp
new file mode 100644
index 00000000..260097ec
--- /dev/null
+++ b/eidos/eidos_simd_scalar.cpp
@@ -0,0 +1,40 @@
+//
+//  eidos_simd_scalar.cpp
+//  Eidos
+//
+//  Created by Andrew Kern on 5/21/2026.
+//  Copyright (c) 2024-2025 Benjamin C. Haller.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ The scalar SIMD tier: portable C++ with no instruction-set intrinsics, built
+ on every platform and compiled at the plain baseline ABI.  This is the
+ universal fallback and the static default for the Eidos_SIMD pointers.
+
+ */
+
+#include "eidos_simd.h"
+
+// No EIDOS_HAS_* macro is defined, so eidos_simd_impl.h expands to the scalar
+// code paths.
+#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_scalar
+#include "eidos_simd_impl.h"
+
+void Eidos_SIMD_Fill_scalar(void)
+{
+#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_scalar::name;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
diff --git a/eidos/eidos_simd_sse42.cpp b/eidos/eidos_simd_sse42.cpp
new file mode 100644
index 00000000..3680cee6
--- /dev/null
+++ b/eidos/eidos_simd_sse42.cpp
@@ -0,0 +1,47 @@
+//
+//  eidos_simd_sse42.cpp
+//  Eidos
+//
+//  Created by Andrew Kern on 5/21/2026.
+//  Copyright (c) 2024-2025 Benjamin C. Haller.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ The SSE4.2 SIMD tier (x86_64 only).  This entire translation unit is built
+ with -msse4.2 by the build system (see CMakeLists.txt); no other translation
+ unit gets that flag, so the SSE4.2 code is confined here and reached only
+ through the dispatch pointers after the CPU has been confirmed to support it.
+
+ On non-x86 architectures, when USE_SIMD=OFF, and on MSVC, this file compiles
+ to nothing.
+
+ */
+
+#include "eidos_simd.h"
+
+#if EIDOS_SIMD_DISPATCH_X86
+
+#define EIDOS_HAS_SSE42 1
+#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_sse42
+#include "eidos_simd_impl.h"
+
+void Eidos_SIMD_Fill_sse42(void)
+{
+#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_sse42::name;
+EIDOS_SIMD_FUNCTION_TABLE
+#undef X
+}
+
+#endif // EIDOS_SIMD_DISPATCH_X86
diff --git a/eidos/eidos_test_functions_math.cpp b/eidos/eidos_test_functions_math.cpp
index a062793d..52847fb2 100644
--- a/eidos/eidos_test_functions_math.cpp
+++ b/eidos/eidos_test_functions_math.cpp
@@ -1303,12 +1303,34 @@ static void _TestAtan2SimdFunction(const double *y_values, const double *x_value
 	}
 }
 
-void _RunSIMDMathTests(void)
+// Test a reduction SIMD function (sum, product) against a scalar reference
+template<typename ScalarReduce, typename SimdReduce>
+static void _TestReductionSimdFunction(const char *name, ScalarReduce scalar_reduce, SimdReduce simd_reduce,
+									   const double *test_values, int num_values, double tolerance = 1e-12)
 {
-#if !defined(EIDOS_SLEEF_AVAILABLE) || !EIDOS_SLEEF_AVAILABLE
-	std::cout << "NOTE: SLEEF is not available in this build; SIMD math tests will compare scalar fallback against std:: (trivially identical)." << std::endl;
-#endif
+	double scalar_result = scalar_reduce(test_values, num_values);
+	double simd_result = simd_reduce(test_values, num_values);
+	double diff = std::abs(simd_result - scalar_result);
+	double max_val = std::max(std::abs(scalar_result), std::abs(simd_result));
+	if (max_val > 1.0)
+		diff /= max_val;
+
+	if (diff <= tolerance)
+	{
+		gEidosTestSuccessCount++;
+	}
+	else
+	{
+		gEidosTestFailureCount++;
+		std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD " << name << "() test failed (scalar=" << scalar_result << ", simd=" << simd_result << ")" << std::endl;
+	}
+}
 
+// Run the full SIMD math test battery against whichever tier is currently
+// installed in the Eidos_SIMD dispatch table.  _RunSIMDMathTests() calls this
+// once for each instruction-set tier the CPU supports.
+static void _RunSIMDMathTests_CurrentTier(void)
+{
 	// Test values for trigonometric functions (in radians)
 	// Include various edge cases and values that span multiple SIMD vector widths
 	const double trig_test_values[] = {
@@ -1530,6 +1552,179 @@ void _RunSIMDMathTests(void)
 			std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD pow_scalar_base() test failed" << std::endl;
 		}
 	}
+
+	// Non-transcendental kernels: sqrt, abs, the rounding family, the reductions,
+	// and the convolution helpers each have distinct SSE4.2 / AVX2 / NEON code,
+	// so cycling the tiers in _RunSIMDMathTests() is what covers all of them.
+	_TestUnarySimdFunction("sqrt", [](double x) { return std::sqrt(x); },
+						   Eidos_SIMD::sqrt_float64, trig_test_values, num_trig_values);
+	_TestUnarySimdFunction("abs", [](double x) { return std::fabs(x); },
+						   Eidos_SIMD::abs_float64, trig_test_values, num_trig_values);
+	_TestUnarySimdFunction("floor", [](double x) { return std::floor(x); },
+						   Eidos_SIMD::floor_float64, trig_test_values, num_trig_values);
+	_TestUnarySimdFunction("ceil", [](double x) { return std::ceil(x); },
+						   Eidos_SIMD::ceil_float64, trig_test_values, num_trig_values);
+	_TestUnarySimdFunction("trunc", [](double x) { return std::trunc(x); },
+						   Eidos_SIMD::trunc_float64, trig_test_values, num_trig_values);
+
+	// round() test values deliberately avoid exact halves: the SIMD path rounds
+	// half-to-even while std::round() rounds half-away-from-zero, so the two
+	// agree only away from the .5 tie point.
+	const double round_test_values[] = {
+		0.0, 7.0, -7.0,
+		0.3, 0.7, 1.2, 1.8, 2.4, 2.9, 3.1, 3.6,
+		-0.3, -0.7, -1.2, -1.8, -2.4, -2.9, -3.1, -3.6,
+		100.4, 100.7, -100.4, -100.7
+	};
+	const int num_round_values = (int)(sizeof(round_test_values) / sizeof(double));
+	_TestUnarySimdFunction("round", [](double x) { return std::round(x); },
+						   Eidos_SIMD::round_float64, round_test_values, num_round_values);
+
+	// Reductions: sum and product
+	const double reduction_test_values[] = {
+		0.5, 1.5, 2.0, 0.25, 1.0, 3.0, 0.8, 1.2, 2.5, 0.4, 1.1, 0.9, 2.0, 0.6
+	};
+	const int num_reduction_values = (int)(sizeof(reduction_test_values) / sizeof(double));
+	_TestReductionSimdFunction("sum",
+							   [](const double *v, int n) { double s = 0.0; for (int i = 0; i < n; i++) s += v[i]; return s; },
+							   Eidos_SIMD::sum_float64, reduction_test_values, num_reduction_values);
+	_TestReductionSimdFunction("product",
+							   [](const double *v, int n) { double p = 1.0; for (int i = 0; i < n; i++) p *= v[i]; return p; },
+							   Eidos_SIMD::product_float64, reduction_test_values, num_reduction_values);
+
+	// Convolution dot products (used by SpatialMap::smooth())
+	{
+		const double conv_kernel[] = { 0.1, 0.2, 0.15, 0.05, 0.25, 0.1, 0.3, 0.2, 0.12, 0.18, 0.22, 0.08, 0.14, 0.16 };
+		const double conv_pixel[]  = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 };
+		const int num_conv = (int)(sizeof(conv_kernel) / sizeof(double));
+		double ref_ksum = 0.0, ref_csum = 0.0;
+		for (int i = 0; i < num_conv; i++) { ref_ksum += conv_kernel[i]; ref_csum += conv_kernel[i] * conv_pixel[i]; }
+		double simd_ksum = 0.0, simd_csum = 0.0;
+		Eidos_SIMD::convolve_dot_product_float64(conv_kernel, conv_pixel, num_conv, simd_ksum, simd_csum);
+		if ((std::abs(simd_ksum - ref_ksum) <= 1e-9) && (std::abs(simd_csum - ref_csum) <= 1e-9))
+			gEidosTestSuccessCount++;
+		else
+		{
+			gEidosTestFailureCount++;
+			std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD convolve_dot_product() test failed" << std::endl;
+		}
+		const double coverage = 0.75;
+		double ref_sksum = 0.0, ref_scsum = 0.0;
+		for (int i = 0; i < num_conv; i++) { double sk = conv_kernel[i] * coverage; ref_sksum += sk; ref_scsum += sk * conv_pixel[i]; }
+		double simd_sksum = 0.0, simd_scsum = 0.0;
+		Eidos_SIMD::convolve_dot_product_scaled_float64(conv_kernel, conv_pixel, num_conv, coverage, simd_sksum, simd_scsum);
+		if ((std::abs(simd_sksum - ref_sksum) <= 1e-9) && (std::abs(simd_scsum - ref_scsum) <= 1e-9))
+			gEidosTestSuccessCount++;
+		else
+		{
+			gEidosTestFailureCount++;
+			std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD convolve_dot_product_scaled() test failed" << std::endl;
+		}
+	}
+
+	// Single-precision exponential
+	{
+		const float exp_f_values[] = {
+			0.0f, 0.1f, 0.5f, 1.0f, 2.0f, 5.0f, -0.1f, -0.5f, -1.0f, -2.0f, -5.0f, 0.01f, 3.3f, -3.3f
+		};
+		const int num_exp_f = (int)(sizeof(exp_f_values) / sizeof(float));
+		std::vector<float> exp_f_out(num_exp_f);
+		Eidos_SIMD::exp_float32(exp_f_values, exp_f_out.data(), num_exp_f);
+		bool all_match = true;
+		for (int i = 0; i < num_exp_f; i++)
+		{
+			float ref = std::exp(exp_f_values[i]);
+			float fdiff = std::fabs(exp_f_out[i] - ref);
+			float fmax = std::max(std::fabs(ref), std::fabs(exp_f_out[i]));
+			if (fmax > 1.0f)
+				fdiff /= fmax;
+			if (fdiff > 1e-5f)
+				all_match = false;
+		}
+		if (all_match)
+			gEidosTestSuccessCount++;
+		else
+		{
+			gEidosTestFailureCount++;
+			std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD exp_float32() test failed" << std::endl;
+		}
+	}
+
+	// Single-precision spatial-interaction kernels (in-place transforms)
+	{
+		const float kdist[] = {
+			0.0f, 0.1f, 0.25f, 0.5f, 0.8f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 0.33f, 0.66f, 1.1f, 1.7f
+		};
+		const int num_k = (int)(sizeof(kdist) / sizeof(float));
+		const float kfmax = 1.5f;
+		struct KernelCase { const char *name; int which; float p1, p2; };
+		const KernelCase kernels[] = {
+			{ "exp_kernel_float32",    0, 0.7f, 0.0f },
+			{ "normal_kernel_float32", 1, 2.0f, 0.0f },
+			{ "tdist_kernel_float32",  2, 3.0f, 1.2f },
+			{ "cauchy_kernel_float32", 3, 0.9f, 0.0f },
+			{ "linear_kernel_float32", 4, 5.0f, 0.0f }
+		};
+		for (const KernelCase &k : kernels)
+		{
+			std::vector<float> ref(kdist, kdist + num_k);
+			std::vector<float> simd(kdist, kdist + num_k);
+			for (int i = 0; i < num_k; i++)
+			{
+				float d = ref[i];
+				if (k.which == 0) ref[i] = kfmax * std::exp(-k.p1 * d);
+				else if (k.which == 1) ref[i] = kfmax * std::exp(-(d * d) / k.p1);
+				else if (k.which == 2) { float dt = d / k.p2; ref[i] = kfmax * std::pow(1.0f + dt * dt / k.p1, -(k.p1 + 1.0f) / 2.0f); }
+				else if (k.which == 3) { float dl = d / k.p1; ref[i] = kfmax / (1.0f + dl * dl); }
+				else ref[i] = kfmax - d * (kfmax / k.p1);
+			}
+			if (k.which == 0) Eidos_SIMD::exp_kernel_float32(simd.data(), num_k, kfmax, k.p1);
+			else if (k.which == 1) Eidos_SIMD::normal_kernel_float32(simd.data(), num_k, kfmax, k.p1);
+			else if (k.which == 2) Eidos_SIMD::tdist_kernel_float32(simd.data(), num_k, kfmax, k.p1, k.p2);
+			else if (k.which == 3) Eidos_SIMD::cauchy_kernel_float32(simd.data(), num_k, kfmax, k.p1);
+			else Eidos_SIMD::linear_kernel_float32(simd.data(), num_k, kfmax, k.p1);
+			bool all_match = true;
+			for (int i = 0; i < num_k; i++)
+			{
+				float diff = std::fabs(simd[i] - ref[i]);
+				float mx = std::max(std::fabs(ref[i]), std::fabs(simd[i]));
+				if (mx > 1.0f)
+					diff /= mx;
+				if (diff > 1e-4f)
+					all_match = false;
+			}
+			if (all_match)
+				gEidosTestSuccessCount++;
+			else
+			{
+				gEidosTestFailureCount++;
+				std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD " << k.name << "() test failed" << std::endl;
+			}
+		}
+	}
+}
+
+void _RunSIMDMathTests(void)
+{
+	// The SIMD kernels are compiled once per instruction-set tier (scalar,
+	// SSE4.2, AVX2+FMA, NEON); normal execution only runs the single tier the
+	// CPU selected at startup.  To test, and cover, every tier's kernels, cycle
+	// through all tiers available on this CPU, then restore the best one.
+	static const char * const tier_names[] = { "scalar", "SSE4.2", "AVX2+FMA", "NEON" };
+
+	for (const char * const tier_name : tier_names)
+	{
+		if (!Eidos_SIMD_SelectTier(tier_name))
+			continue;
+		std::cout << "Testing SIMD kernels: " << Eidos_SIMD_ActiveTierName() << " tier";
+		if (!Eidos_SIMD_SLEEFActive())
+			std::cout << " (transcendentals use the std:: fallback)";
+		std::cout << std::endl;
+		_RunSIMDMathTests_CurrentTier();
+	}
+
+	// Restore the fastest tier for this CPU, undoing the cycling above.
+	Eidos_SIMD_Init();
 }