diff --git a/CMakeLists.txt b/CMakeLists.txt index b18ad492..a3a6fbb7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,77 +294,58 @@ if(BUILD_LTO) endif() # -# SIMD SUPPORT (independent of OpenMP) +# SIMD SUPPORT (runtime CPU dispatch) # - -# Option to disable SIMD entirely -option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON) - -# Check architecture -# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM +# SLiM's SIMD kernels live in eidos_simd_*.cpp, compiled once per instruction- +# set tier: scalar, SSE4.2, and AVX2+FMA on x86_64, and NEON on ARM64. Only +# the per-tier files are given instruction-set-specific compiler flags (those +# are applied at the end of this file); every other translation unit, and the +# dispatcher, is compiled at the plain baseline ABI. At startup +# Eidos_SIMD_Init() probes the CPU and points the +# kernels at the fastest tier the hardware supports, so a single binary is +# correct on any CPU, from pre-AVX2 hardware up. Applying -mavx2 globally (the +# previous approach) let the compiler emit AVX2 throughout the whole binary and +# caused SIGILL crashes on older CPUs (issue #628). + +option(USE_SIMD "Enable SIMD-accelerated kernels with runtime CPU dispatch" ON) + +# Detect x86 so the SSE4.2/AVX2 tier files can be given their ISA flags. NEON +# needs no flag (it is baseline on ARM64); other architectures use the scalar +# tier only. set(IS_X86_64 FALSE) -set(IS_ARM64 FALSE) if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386") set(IS_X86_64 TRUE) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64") - set(IS_ARM64 TRUE) endif() -if(USE_SIMD AND NOT WIN32 AND IS_X86_64) - include(CheckCXXCompilerFlag) - - # Check for AVX2 support - check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) - check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42) - check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA) - - if(COMPILER_SUPPORTS_AVX2) - message(STATUS "SIMD: AVX2 support detected") - add_compile_definitions(EIDOS_HAS_AVX2=1) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") - if(COMPILER_SUPPORTS_FMA) - message(STATUS "SIMD: FMA support detected") - add_compile_definitions(EIDOS_HAS_FMA=1) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") - endif() - elseif(COMPILER_SUPPORTS_SSE42) - message(STATUS "SIMD: SSE4.2 support detected (no AVX2)") - add_compile_definitions(EIDOS_HAS_SSE42=1) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") - else() - message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback") - endif() -elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64) - # ARM64 NEON is always available on ARM64, no compiler flag needed - message(STATUS "SIMD: ARM64 NEON support enabled") - add_compile_definitions(EIDOS_HAS_NEON=1) -elseif(USE_SIMD AND NOT WIN32) - message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback") -elseif(USE_SIMD AND WIN32) - # Windows SIMD detection - MinGW uses GCC, so we can use the same flags as Linux - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") +if(USE_SIMD) + if(IS_X86_64 AND NOT MSVC) include(CheckCXXCompilerFlag) - check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2_WIN) - check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA_WIN) - - if(COMPILER_SUPPORTS_AVX2_WIN) - message(STATUS "SIMD: AVX2 support detected (Windows/MinGW)") - add_compile_definitions(EIDOS_HAS_AVX2=1) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") - if(COMPILER_SUPPORTS_FMA_WIN) - message(STATUS "SIMD: FMA support detected (Windows/MinGW)") - add_compile_definitions(EIDOS_HAS_FMA=1) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") - endif() + check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) + check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42) + + if(COMPILER_SUPPORTS_AVX2 AND COMPILER_SUPPORTS_SSE42) + message(STATUS "SIMD: runtime dispatch enabled (scalar / SSE4.2 / AVX2+FMA selected per-CPU at startup)") + # The avx2/sse42 tier files need -mavx2 -mfma / -msse4.2. The flags + # are applied at the end of this file rather than here, because the + # Windows target blocks below overwrite COMPILE_FLAGS on every + # source file with "-include config.h"; applying our flags last, + # with APPEND_STRING, lets the two coexist (issue #628). + set(SIMD_TIER_FLAGS_X86 TRUE) else() - message(STATUS "SIMD: No AVX2 support on Windows/MinGW, using scalar fallback") + message(STATUS "SIMD: compiler lacks -mavx2/-msse4.2; building scalar kernels only") + add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1) endif() + elseif(MSVC) + message(STATUS "SIMD: runtime dispatch not implemented for MSVC; building scalar kernels only") + add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1) else() - # MSVC path - not currently used but could be added in the future - message(STATUS "SIMD: MSVC SIMD detection not yet implemented, using scalar fallback") + # ARM64 (NEON, baseline) or any other architecture (scalar fallback): + # the tier files need no instruction-set flags. + message(STATUS "SIMD: runtime dispatch enabled (NEON or scalar kernels selected at startup)") endif() else() - message(STATUS "SIMD: Disabled by user") + message(STATUS "SIMD: disabled (USE_SIMD=OFF); all math uses scalar kernels") + add_compile_definitions(EIDOS_SUPPRESS_SIMD_DISPATCH=1) endif() # GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h @@ -535,6 +516,20 @@ if(BUILD_SLIMGUI) endif(BUILD_SLIMGUI) +# Apply the per-tier SIMD instruction-set flags to the avx2/sse42 tier files. +# This is deliberately done here, after the WIN32 blocks above: those run +# set_source_files_properties(... COMPILE_FLAGS "-include config.h") over every +# source file, and COMPILE_FLAGS is a single string that gets overwritten, not +# appended. Setting our flags last with APPEND_STRING preserves the Windows +# "-include config.h" while still confining AVX2/SSE4.2 to these two files. +if(SIMD_TIER_FLAGS_X86) + set_property(SOURCE "${PROJECT_SOURCE_DIR}/eidos/eidos_simd_avx2.cpp" + APPEND_STRING PROPERTY COMPILE_FLAGS " -mavx2 -mfma") + set_property(SOURCE "${PROJECT_SOURCE_DIR}/eidos/eidos_simd_sse42.cpp" + APPEND_STRING PROPERTY COMPILE_FLAGS " -msse4.2") +endif() + + # Deal with the PROFILE and PARALLEL flags, which interact and are handled in a complex way. # # For SLiMgui, profiling is always on for Release builds, always off for Debug builds; PROFILE does not affect it diff --git a/QtSLiM/QtSLiM.pro b/QtSLiM/QtSLiM.pro index d095e198..85f248fe 100644 --- a/QtSLiM/QtSLiM.pro +++ b/QtSLiM/QtSLiM.pro @@ -83,6 +83,11 @@ QMAKE_CFLAGS += $$(CFLAGS) DEFINES += EIDOS_GUI DEFINES += SLIMGUI=1 +# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so +# it builds the scalar SIMD kernels only. The CMake build provides the full +# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON). See eidos_simd.h. +DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH + # Uncomment this define to disable the use of OpenGL in SLiMgui completely. This, plus removing the # link dependency on openglwidgets, should allow you to build SLiMgui without linking OpenGL at all. diff --git a/core/core.pro b/core/core.pro index d4394a80..6e9daa9a 100644 --- a/core/core.pro +++ b/core/core.pro @@ -45,6 +45,11 @@ QMAKE_CXXFLAGS += -Xarch_arm64 -DEIDOS_HAS_NEON=1 DEFINES += EIDOS_GUI DEFINES += SLIMGUI=1 +# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so +# it builds the scalar SIMD kernels only. The CMake build provides the full +# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON). See eidos_simd.h. +DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH + CONFIG -= qt CONFIG += c++11 CONFIG += c11 diff --git a/eidos/eidos.pro b/eidos/eidos.pro index a5b3ff88..09e28a1a 100644 --- a/eidos/eidos.pro +++ b/eidos/eidos.pro @@ -45,6 +45,11 @@ QMAKE_CXXFLAGS += -Xarch_arm64 -DEIDOS_HAS_NEON=1 DEFINES += EIDOS_GUI DEFINES += SLIMGUI=1 +# The Qt Creator (qmake) build does not apply per-file SIMD compiler flags, so +# it builds the scalar SIMD kernels only. The CMake build provides the full +# runtime SIMD dispatch (scalar / SSE4.2 / AVX2+FMA / NEON). See eidos_simd.h. +DEFINES += EIDOS_SUPPRESS_SIMD_DISPATCH + CONFIG -= qt CONFIG += c++11 CONFIG += c11 @@ -115,6 +120,11 @@ SOURCES += \ eidos_property_signature.cpp \ eidos_rng.cpp \ eidos_script.cpp \ + eidos_simd.cpp \ + eidos_simd_avx2.cpp \ + eidos_simd_neon.cpp \ + eidos_simd_scalar.cpp \ + eidos_simd_sse42.cpp \ eidos_sorting.cpp \ eidos_symbol_table.cpp \ eidos_test.cpp \ @@ -149,6 +159,7 @@ HEADERS += \ eidos_rng.h \ eidos_script.h \ eidos_simd.h \ + eidos_simd_impl.h \ eidos_sorting.h \ eidos_symbol_table.h \ eidos_test_builtins.h \ diff --git a/eidos/eidos_globals.cpp b/eidos/eidos_globals.cpp index 172afca9..a57b9369 100644 --- a/eidos/eidos_globals.cpp +++ b/eidos/eidos_globals.cpp @@ -30,6 +30,7 @@ #include "eidos_class_DataFrame.h" #include "eidos_class_Image.h" #include "eidos_class_TestElement.h" +#include "eidos_simd.h" #include #include @@ -1163,6 +1164,9 @@ void Eidos_WarmUp(void) { been_here = true; + // Detect the CPU and select the SIMD kernel tier; must happen before any Eidos_SIMD kernel is called. + Eidos_SIMD_Init(); + // Initialize the random number generator with a random-ish seed. This seed may be overridden by the Context downstream. Eidos_InitializeRNG(); Eidos_SetRNGSeed(Eidos_GenerateRNGSeed()); diff --git a/eidos/eidos_simd.cpp b/eidos/eidos_simd.cpp new file mode 100644 index 00000000..c407969c --- /dev/null +++ b/eidos/eidos_simd.cpp @@ -0,0 +1,131 @@ +// +// eidos_simd.cpp +// Eidos +// +// Created by Andrew Kern on 5/21/2026. +// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + SIMD runtime dispatcher. This translation unit is compiled at the plain + baseline ABI (no instruction-set flags); it owns the public Eidos_SIMD + function pointers and selects a tier for them at startup. See eidos_simd.h. + + */ + +#include "eidos_simd.h" + +#include + + +// The public kernel pointers. They are statically initialized to the scalar +// tier so that a call is well-defined even if it somehow happens before +// Eidos_SIMD_Init() runs; the address of a function is a constant expression, +// so there is no static-initialization-order dependency here. +namespace Eidos_SIMD { +#define X(ret, name, params) ret (*name) params = &Eidos_SIMD_scalar::name; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} + + +enum class Eidos_SIMD_Tier { kScalar, kSSE42, kAVX2_FMA, kNEON }; + +static Eidos_SIMD_Tier sActiveTier = Eidos_SIMD_Tier::kScalar; + + +bool Eidos_SIMD_SelectTier(const char *tier_name) +{ + // The scalar tier is built on every platform and always available. + if (std::strcmp(tier_name, "scalar") == 0) + { + Eidos_SIMD_Fill_scalar(); + sActiveTier = Eidos_SIMD_Tier::kScalar; + return true; + } + +#if EIDOS_SIMD_DISPATCH_X86 + // __builtin_cpu_supports() reads CPUID; it is available on GCC and Clang + // for x86 and works regardless of the flags this file was compiled with. + // AVX2 and FMA shipped together (Haswell), but we require both explicitly + // since the AVX2 tier and SLEEF both use FMA instructions. + if (std::strcmp(tier_name, "AVX2+FMA") == 0) + { + if (!(__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"))) + return false; + Eidos_SIMD_Fill_avx2(); + sActiveTier = Eidos_SIMD_Tier::kAVX2_FMA; + return true; + } + if (std::strcmp(tier_name, "SSE4.2") == 0) + { + if (!__builtin_cpu_supports("sse4.2")) + return false; + Eidos_SIMD_Fill_sse42(); + sActiveTier = Eidos_SIMD_Tier::kSSE42; + return true; + } +#endif + +#if EIDOS_SIMD_DISPATCH_ARM + // NEON is baseline on every ARM64 CPU, so it is always available here. + if (std::strcmp(tier_name, "NEON") == 0) + { + Eidos_SIMD_Fill_neon(); + sActiveTier = Eidos_SIMD_Tier::kNEON; + return true; + } +#endif + + return false; +} + +void Eidos_SIMD_Init(void) +{ + // Install the fastest tier the CPU supports. This is idempotent: calling it + // again re-runs detection and re-installs the same tier, which is how the + // SIMD self-tests restore normal dispatch after cycling through every tier. +#if EIDOS_SIMD_DISPATCH_X86 + if (Eidos_SIMD_SelectTier("AVX2+FMA")) + return; + if (Eidos_SIMD_SelectTier("SSE4.2")) + return; +#endif +#if EIDOS_SIMD_DISPATCH_ARM + if (Eidos_SIMD_SelectTier("NEON")) + return; +#endif + // Fallback for pre-AVX2/pre-SSE4.2 x86, unknown architectures, MSVC, and + // USE_SIMD=OFF builds: the scalar tier, which runs on any CPU. + Eidos_SIMD_SelectTier("scalar"); +} + +const char *Eidos_SIMD_ActiveTierName(void) +{ + switch (sActiveTier) + { + case Eidos_SIMD_Tier::kAVX2_FMA: return "AVX2+FMA"; + case Eidos_SIMD_Tier::kSSE42: return "SSE4.2"; + case Eidos_SIMD_Tier::kNEON: return "NEON"; + case Eidos_SIMD_Tier::kScalar: return "scalar"; + } + return "scalar"; +} + +bool Eidos_SIMD_SLEEFActive(void) +{ + // SLEEF transcendentals are wired up only for the AVX2+FMA and NEON tiers. + return (sActiveTier == Eidos_SIMD_Tier::kAVX2_FMA) || (sActiveTier == Eidos_SIMD_Tier::kNEON); +} diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index b379eb78..ca2abb10 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -19,13 +19,39 @@ /* - SIMD acceleration for Eidos math operations, independent of OpenMP. - - This header provides vectorized implementations of common math operations - using platform-specific SIMD intrinsics when available: - - x86_64: SSE4.2 or AVX2 via - - ARM64: NEON via - Falls back to scalar code when no SIMD is available. + SIMD acceleration for Eidos math operations, with runtime CPU dispatch. + + This header is the public interface to a set of vectorized array-math kernels + (sqrt, exp, the spatial-interaction kernels, etc.). Each kernel is exposed as + a function pointer in namespace Eidos_SIMD; callers simply write, e.g., + Eidos_SIMD::sqrt_float64(in, out, n) exactly as if it were a function. + + How the dispatch works, and why + ------------------------------- + The kernels are compiled once per instruction-set "tier": + + - scalar portable C++ that runs on any CPU + - SSE4.2 x86_64-v2, present on essentially every x86_64 CPU (~2009+) + - AVX2 + FMA x86_64-v3, Haswell / Excavator and later (~2013+) + - NEON baseline on every ARM64 CPU + + Each tier lives in its own translation unit (eidos_simd_scalar.cpp, + eidos_simd_sse42.cpp, eidos_simd_avx2.cpp, eidos_simd_neon.cpp); only those + translation units are built with instruction-set-specific compiler flags. + Every other translation unit in SLiM, and the dispatcher itself, is compiled + at the plain x86_64 (SSE2) baseline. At startup Eidos_SIMD_Init() probes the + CPU with __builtin_cpu_supports() and points the Eidos_SIMD function pointers + at the fastest tier the hardware actually supports. + + The point of this design is that a single binary is correct everywhere: the + bulk of the executable contains only baseline instructions, so it cannot fault + on an old CPU, while the AVX2/SSE4.2 code is reached only after the CPU has + been confirmed to support it. Previously SLiM was built with -mavx2 applied + globally, which let the compiler emit AVX2 instructions throughout the whole + binary and caused SIGILL crashes on pre-AVX2 hardware (issue #628). + + Building with -D USE_SIMD=OFF defines EIDOS_SUPPRESS_SIMD_DISPATCH, which + forces the scalar tier and omits all instruction-set-specific code. */ @@ -33,1190 +59,105 @@ #define eidos_simd_h #include -#include - -// Determine SIMD capability level -#if defined(EIDOS_HAS_AVX2) - #include - #define EIDOS_SIMD_WIDTH 4 // 4 doubles per AVX register - #define EIDOS_SIMD_FLOAT_WIDTH 8 // 8 floats per AVX register -#elif defined(EIDOS_HAS_SSE42) - #include - #include - #define EIDOS_SIMD_WIDTH 2 // 2 doubles per SSE register - #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per SSE register -#elif defined(EIDOS_HAS_NEON) - #include - #define EIDOS_SIMD_WIDTH 2 // 2 doubles per NEON register - #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per NEON register -#else - #define EIDOS_SIMD_WIDTH 1 // Scalar fallback - #define EIDOS_SIMD_FLOAT_WIDTH 1 -#endif - -// Include SLEEF for vectorized transcendental functions (exp, log, log10, log2) -// SLEEF is only used when AVX2+FMA or NEON is available -// BCH 12/31/2025: SLEEF generates tons of shadowed variable warnings for some reason; disable them -#if defined(EIDOS_HAS_AVX2) || defined(EIDOS_HAS_NEON) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" -#pragma GCC diagnostic ignored "-Wdouble-promotion" -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wunused-function" -#pragma GCC diagnostic ignored "-Wimplicit-float-conversion" -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" -#pragma clang diagnostic ignored "-Wdouble-promotion" -#pragma clang diagnostic ignored "-Wunused-parameter" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wimplicit-float-conversion" -#include "sleef/sleef_config.h" -#pragma clang diagnostic pop -#pragma GCC diagnostic pop -#endif - -// Disable certain warnings for the remainder of this file -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Waggressive-loop-optimizations" - - -// ================================ -// SIMD Vector Math Operations -// ================================ -// These functions apply an operation to arrays of doubles. -// They handle the loop, SIMD processing, and scalar remainder. +// EIDOS_SIMD_DISPATCH_X86 / _ARM are defined when this build includes a +// hardware-specific tier beyond scalar. Runtime dispatch needs both a +// supported compiler (for __builtin_cpu_supports and target attributes / +// per-file ISA flags) and a known architecture. When USE_SIMD=OFF, neither +// is defined and only the scalar tier exists. +#if !defined(EIDOS_SUPPRESS_SIMD_DISPATCH) && (defined(__GNUC__) || defined(__clang__)) + #if defined(__x86_64__) || defined(__i386__) + #define EIDOS_SIMD_DISPATCH_X86 1 + #elif defined(__aarch64__) || defined(__arm64__) + #define EIDOS_SIMD_DISPATCH_ARM 1 + #endif +#endif + +// The complete set of dispatched SIMD kernels, as X-macro entries of the form +// X(return_type, name, (parameter_types)). This single list drives the +// function-pointer declarations, their definitions, and the per-tier dispatch +// tables, so a kernel is added or removed in exactly one place. +#define EIDOS_SIMD_FUNCTION_TABLE \ + X(void, sqrt_float64, (const double *, double *, int64_t)) \ + X(void, abs_float64, (const double *, double *, int64_t)) \ + X(void, floor_float64, (const double *, double *, int64_t)) \ + X(void, ceil_float64, (const double *, double *, int64_t)) \ + X(void, trunc_float64, (const double *, double *, int64_t)) \ + X(void, round_float64, (const double *, double *, int64_t)) \ + X(void, exp_float64, (const double *, double *, int64_t)) \ + X(void, log_float64, (const double *, double *, int64_t)) \ + X(void, log10_float64, (const double *, double *, int64_t)) \ + X(void, log2_float64, (const double *, double *, int64_t)) \ + X(void, sin_float64, (const double *, double *, int64_t)) \ + X(void, cos_float64, (const double *, double *, int64_t)) \ + X(void, tan_float64, (const double *, double *, int64_t)) \ + X(void, asin_float64, (const double *, double *, int64_t)) \ + X(void, acos_float64, (const double *, double *, int64_t)) \ + X(void, atan_float64, (const double *, double *, int64_t)) \ + X(void, atan2_float64, (const double *, const double *, double *, int64_t)) \ + X(void, pow_float64, (const double *, const double *, double *, int64_t)) \ + X(void, pow_float64_scalar_exp, (const double *, double, double *, int64_t)) \ + X(void, pow_float64_scalar_base, (double, const double *, double *, int64_t)) \ + X(double, sum_float64, (const double *, int64_t)) \ + X(double, product_float64, (const double *, int64_t)) \ + X(void, exp_float32, (const float *, float *, int64_t)) \ + X(void, exp_kernel_float32, (float *, int64_t, float, float)) \ + X(void, normal_kernel_float32, (float *, int64_t, float, float)) \ + X(void, tdist_kernel_float32, (float *, int64_t, float, float, float)) \ + X(void, cauchy_kernel_float32, (float *, int64_t, float, float)) \ + X(void, linear_kernel_float32, (float *, int64_t, float, float)) \ + X(void, convolve_dot_product_float64, (const double *, const double *, int64_t, double &, double &)) \ + X(void, convolve_dot_product_scaled_float64, (const double *, const double *, int64_t, double, double &, double &)) + +// The public kernels: function pointers set by Eidos_SIMD_Init() to point at +// the tier selected for the running CPU. Call them as Eidos_SIMD::name(...). namespace Eidos_SIMD { - -// --------------------- -// Square Root: sqrt(x) -// --------------------- -inline void sqrt_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - // Process 4 doubles at a time - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - __m256d r = _mm256_sqrt_pd(v); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_SSE42) - // Process 2 doubles at a time - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - __m128d r = _mm_sqrt_pd(v); - _mm_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - // Process 2 doubles at a time - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - float64x2_t r = vsqrtq_f64(v); - vst1q_f64(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::sqrt(input[i]); -} - -// --------------------- -// Absolute Value: abs(x) -// --------------------- -inline void abs_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - // Create sign mask (all bits except sign bit) - __m256d sign_mask = _mm256_set1_pd(-0.0); - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - __m256d r = _mm256_andnot_pd(sign_mask, v); // Clear sign bit - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_SSE42) - __m128d sign_mask = _mm_set1_pd(-0.0); - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - __m128d r = _mm_andnot_pd(sign_mask, v); - _mm_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - float64x2_t r = vabsq_f64(v); - vst1q_f64(&output[i], r); - } -#endif - - for (; i < count; i++) - output[i] = std::fabs(input[i]); -} - -// --------------------- -// Floor: floor(x) -// --------------------- -inline void floor_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - __m256d r = _mm256_floor_pd(v); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_SSE42) - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - __m128d r = _mm_floor_pd(v); - _mm_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - float64x2_t r = vrndmq_f64(v); // Round toward minus infinity (floor) - vst1q_f64(&output[i], r); - } -#endif - - for (; i < count; i++) - output[i] = std::floor(input[i]); -} - -// --------------------- -// Ceil: ceil(x) -// --------------------- -inline void ceil_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - __m256d r = _mm256_ceil_pd(v); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_SSE42) - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - __m128d r = _mm_ceil_pd(v); - _mm_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - float64x2_t r = vrndpq_f64(v); // Round toward plus infinity (ceil) - vst1q_f64(&output[i], r); - } -#endif - - for (; i < count; i++) - output[i] = std::ceil(input[i]); -} - -// --------------------- -// Truncate: trunc(x) -// --------------------- -inline void trunc_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_SSE42) - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); - _mm_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - float64x2_t r = vrndq_f64(v); // Round toward zero (truncate) - vst1q_f64(&output[i], r); - } -#endif - - for (; i < count; i++) - output[i] = std::trunc(input[i]); -} - -// --------------------- -// Round: round(x) -// --------------------- -inline void round_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_SSE42) - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - _mm_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - float64x2_t r = vrndaq_f64(v); // Round to nearest, ties away from zero - vst1q_f64(&output[i], r); - } -#endif - - for (; i < count; i++) - output[i] = std::round(input[i]); -} - -// --------------------- -// Exponential: exp(x) -// --------------------- -inline void exp_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_EXP_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::exp(input[i]); -} - -// --------------------- -// Natural Log: log(x) -// --------------------- -inline void log_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::log(input[i]); -} - -// --------------------- -// Log base 10: log10(x) -// --------------------- -inline void log10_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG10_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::log10(input[i]); -} - -// --------------------- -// Log base 2: log2(x) -// --------------------- -inline void log2_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG2_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::log2(input[i]); -} - -// --------------------- -// Sine: sin(x) -// --------------------- -inline void sin_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_SIN_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::sin(input[i]); -} - -// --------------------- -// Cosine: cos(x) -// --------------------- -inline void cos_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_COS_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::cos(input[i]); -} - -// --------------------- -// Tangent: tan(x) -// --------------------- -inline void tan_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_TAN_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::tan(input[i]); -} - -// --------------------- -// Arc Sine: asin(x) -// --------------------- -inline void asin_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ASIN_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::asin(input[i]); -} - -// --------------------- -// Arc Cosine: acos(x) -// --------------------- -inline void acos_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ACOS_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::acos(input[i]); -} - -// --------------------- -// Arc Tangent: atan(x) -// --------------------- -inline void atan_float64(const double *input, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN_D(v); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::atan(input[i]); -} - -// --------------------- -// Arc Tangent 2: atan2(y, x) -// --------------------- -inline void atan2_float64(const double *y, const double *x, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D vy = EIDOS_SLEEF_LOAD_D(&y[i]); - EIDOS_SLEEF_TYPE_D vx = EIDOS_SLEEF_LOAD_D(&x[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN2_D(vy, vx); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::atan2(y[i], x[i]); -} - -// --------------------- -// Power: pow(x, y) = x^y -// --------------------- -inline void pow_float64(const double *base, const double *exp, double *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) - { - EIDOS_SLEEF_TYPE_D vb = EIDOS_SLEEF_LOAD_D(&base[i]); - EIDOS_SLEEF_TYPE_D ve = EIDOS_SLEEF_LOAD_D(&exp[i]); - EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_POW_D(vb, ve); - EIDOS_SLEEF_STORE_D(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::pow(base[i], exp[i]); -} - -// Broadcast version: all elements raised to same power (base_array ^ scalar_exp) -inline void pow_float64_scalar_exp(const double *base, double exp_scalar, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) - __m256d ve_broadcast = _mm256_set1_pd(exp_scalar); - for (; i + 4 <= count; i += 4) - { - __m256d vb = _mm256_loadu_pd(&base[i]); - __m256d r = Sleef_powd4_u10avx2(vb, ve_broadcast); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - float64x2_t ve_broadcast = vdupq_n_f64(exp_scalar); - for (; i + 2 <= count; i += 2) - { - float64x2_t vb = vld1q_f64(&base[i]); - float64x2_t r = Sleef_powd2_u10advsimd(vb, ve_broadcast); - vst1q_f64(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::pow(base[i], exp_scalar); -} - -// Broadcast version: scalar base raised to array of powers (scalar_base ^ exp_array) -inline void pow_float64_scalar_base(double base_scalar, const double *exp, double *output, int64_t count) -{ - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) - __m256d vb_broadcast = _mm256_set1_pd(base_scalar); - for (; i + 4 <= count; i += 4) - { - __m256d ve = _mm256_loadu_pd(&exp[i]); - __m256d r = Sleef_powd4_u10avx2(vb_broadcast, ve); - _mm256_storeu_pd(&output[i], r); - } -#elif defined(EIDOS_HAS_NEON) - float64x2_t vb_broadcast = vdupq_n_f64(base_scalar); - for (; i + 2 <= count; i += 2) - { - float64x2_t ve = vld1q_f64(&exp[i]); - float64x2_t r = Sleef_powd2_u10advsimd(vb_broadcast, ve); - vst1q_f64(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::pow(base_scalar, exp[i]); -} - -// ================================ -// Reductions -// ================================ - -// --------------------- -// Sum: sum(x) -// --------------------- -inline double sum_float64(const double *input, int64_t count) -{ - double sum = 0.0; - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - __m256d vsum = _mm256_setzero_pd(); - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - vsum = _mm256_add_pd(vsum, v); - } - // Horizontal sum of 4 doubles - __m128d vlow = _mm256_castpd256_pd128(vsum); - __m128d vhigh = _mm256_extractf128_pd(vsum, 1); - vlow = _mm_add_pd(vlow, vhigh); // 2 doubles - __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1); - vlow = _mm_add_sd(vlow, shuf); // 1 double - sum = _mm_cvtsd_f64(vlow); -#elif defined(EIDOS_HAS_SSE42) - __m128d vsum = _mm_setzero_pd(); - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - vsum = _mm_add_pd(vsum, v); - } - // Horizontal sum of 2 doubles - __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1); - vsum = _mm_add_sd(vsum, shuf); - sum = _mm_cvtsd_f64(vsum); -#elif defined(EIDOS_HAS_NEON) - float64x2_t vsum = vdupq_n_f64(0.0); - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - vsum = vaddq_f64(vsum, v); - } - // Horizontal sum of 2 doubles - sum = vaddvq_f64(vsum); -#endif - - // Scalar remainder - for (; i < count; i++) - sum += input[i]; - - return sum; -} - -// --------------------- -// Product: product(x) -// --------------------- -inline double product_float64(const double *input, int64_t count) -{ - double prod = 1.0; - int64_t i = 0; - -#if defined(EIDOS_HAS_AVX2) - __m256d vprod = _mm256_set1_pd(1.0); - for (; i + 4 <= count; i += 4) - { - __m256d v = _mm256_loadu_pd(&input[i]); - vprod = _mm256_mul_pd(vprod, v); - } - // Horizontal product of 4 doubles - __m128d vlow = _mm256_castpd256_pd128(vprod); - __m128d vhigh = _mm256_extractf128_pd(vprod, 1); - vlow = _mm_mul_pd(vlow, vhigh); - __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1); - vlow = _mm_mul_sd(vlow, shuf); - prod = _mm_cvtsd_f64(vlow); -#elif defined(EIDOS_HAS_SSE42) - __m128d vprod = _mm_set1_pd(1.0); - for (; i + 2 <= count; i += 2) - { - __m128d v = _mm_loadu_pd(&input[i]); - vprod = _mm_mul_pd(vprod, v); - } - __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1); - vprod = _mm_mul_sd(vprod, shuf); - prod = _mm_cvtsd_f64(vprod); -#elif defined(EIDOS_HAS_NEON) - float64x2_t vprod = vdupq_n_f64(1.0); - for (; i + 2 <= count; i += 2) - { - float64x2_t v = vld1q_f64(&input[i]); - vprod = vmulq_f64(vprod, v); - } - // Horizontal product of 2 doubles - prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1); -#endif - - for (; i < count; i++) - prod *= input[i]; - - return prod; -} - -// ================================ -// Float (Single-Precision) SIMD Operations -// ================================ -// These functions operate on arrays of floats, used by spatial interaction kernels. - -// --------------------- -// Exponential: exp(x) for floats -// --------------------- -inline void exp_float32(const float *input, float *output, int64_t count) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_FLOAT_AVAILABLE - for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) - { - EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]); - EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v); - EIDOS_SLEEF_STORE_F(&output[i], r); - } -#endif - - // Scalar remainder - for (; i < count; i++) - output[i] = std::exp(input[i]); -} - -// --------------------- -// Exponential Kernel: strength = fmax * exp(-lambda * distance) -// --------------------- -// Operates in-place on a distance array, transforming distances to strengths. -// This is optimized for the spatial interaction kernel calculation. -inline void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_FLOAT_AVAILABLE - // We need to compute: fmax * exp(-lambda * distance) - // First, compute -lambda * distance for all values, then exp, then multiply by fmax - EIDOS_SLEEF_TYPE_F v_fmax = -#if defined(EIDOS_HAS_AVX2) - _mm256_set1_ps(fmax); - EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda); -#elif defined(EIDOS_HAS_NEON) - vdupq_n_f32(fmax); - EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda); -#endif - - for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) - { - EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); - - // Compute -lambda * distance -#if defined(EIDOS_HAS_AVX2) - EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist); -#elif defined(EIDOS_HAS_NEON) - EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist); -#endif - - // Compute exp(-lambda * distance) - EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); - - // Compute fmax * exp(...) -#if defined(EIDOS_HAS_AVX2) - EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); -#elif defined(EIDOS_HAS_NEON) - EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); -#endif - - EIDOS_SLEEF_STORE_F(&distances[i], v_result); - } -#endif - - // Scalar remainder - for (; i < count; i++) - distances[i] = fmax * std::exp(-lambda * distances[i]); -} - -// --------------------- -// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2) -// --------------------- -// Operates in-place on a distance array, transforming distances to strengths. -// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency. -inline void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq) -{ - int64_t i = 0; - -#if EIDOS_SLEEF_FLOAT_AVAILABLE - // We need to compute: fmax * exp(-distance^2 / two_sigma_sq) - EIDOS_SLEEF_TYPE_F v_fmax = -#if defined(EIDOS_HAS_AVX2) - _mm256_set1_ps(fmax); - EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq); -#elif defined(EIDOS_HAS_NEON) - vdupq_n_f32(fmax); - EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq); -#endif - - for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) - { - EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); - - // Compute distance^2 -#if defined(EIDOS_HAS_AVX2) - EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist); - // Compute -distance^2 / 2sigma^2 - EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq); -#elif defined(EIDOS_HAS_NEON) - EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist); - // Compute -distance^2 / 2sigma^2 - EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq); -#endif - - // Compute exp(-distance^2 / 2sigma^2) - EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); - - // Compute fmax * exp(...) -#if defined(EIDOS_HAS_AVX2) - EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); -#elif defined(EIDOS_HAS_NEON) - EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); -#endif - - EIDOS_SLEEF_STORE_F(&distances[i], v_result); - } -#endif - - // Scalar remainder - for (; i < count; i++) - { - float d = distances[i]; - distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq); - } -} - -// --------------------- -// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2) -// --------------------- -// Operates in-place on a distance array, transforming distances to strengths. -// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale -inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau) -{ - int64_t i = 0; - - // Pre-compute constants - float inv_tau = 1.0f / tau; - float inv_nu = 1.0f / nu; - float exponent = (nu + 1.0f) / 2.0f; - -#if EIDOS_SLEEF_FLOAT_AVAILABLE - EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one; -#if defined(EIDOS_HAS_AVX2) - v_fmax = _mm256_set1_ps(fmax); - v_inv_tau = _mm256_set1_ps(inv_tau); - v_inv_nu = _mm256_set1_ps(inv_nu); - v_exponent = _mm256_set1_ps(-exponent); - v_one = _mm256_set1_ps(1.0f); -#elif defined(EIDOS_HAS_NEON) - v_fmax = vdupq_n_f32(fmax); - v_inv_tau = vdupq_n_f32(inv_tau); - v_inv_nu = vdupq_n_f32(inv_nu); - v_exponent = vdupq_n_f32(-exponent); - v_one = vdupq_n_f32(1.0f); -#endif - - for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) - { - EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); - - // Compute (d / tau) -#if defined(EIDOS_HAS_AVX2) - EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau); - // Compute (d/tau)^2 - EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau); - // Compute (d/tau)^2 / nu - EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu); - // Compute 1 + (d/tau)^2 / nu - EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term); -#elif defined(EIDOS_HAS_NEON) - EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau); - EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau); - EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu); - EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term); -#endif - - // Compute pow(base, -exponent) = 1 / pow(base, exponent) - EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent); - - // Compute fmax * pow(...) -#if defined(EIDOS_HAS_AVX2) - EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow); -#elif defined(EIDOS_HAS_NEON) - EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow); -#endif - - EIDOS_SLEEF_STORE_F(&distances[i], v_result); - } -#endif - - // Scalar remainder - for (; i < count; i++) - { - float d = distances[i]; - float d_over_tau = d * inv_tau; - distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent); - } -} - -// --------------------- -// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2) -// --------------------- -// Operates in-place on a distance array, transforming distances to strengths. -// Parameters: fmax = maximum strength, lambda = scale parameter -inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda) -{ - int64_t i = 0; - float inv_lambda = 1.0f / lambda; - -#if defined(EIDOS_HAS_AVX2) - __m256 v_fmax = _mm256_set1_ps(fmax); - __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda); - __m256 v_one = _mm256_set1_ps(1.0f); - - for (; i + 8 <= count; i += 8) - { - __m256 v_dist = _mm256_loadu_ps(&distances[i]); - __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda); // d/lambda - __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp); // (d/lambda)^2 - __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq); // 1 + (d/lambda)^2 - __m256 v_result = _mm256_div_ps(v_fmax, v_denom); // fmax / denom - _mm256_storeu_ps(&distances[i], v_result); - } -#elif defined(EIDOS_HAS_NEON) - float32x4_t v_fmax = vdupq_n_f32(fmax); - float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda); - float32x4_t v_one = vdupq_n_f32(1.0f); - - for (; i + 4 <= count; i += 4) - { - float32x4_t v_dist = vld1q_f32(&distances[i]); - float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda); - float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp); - float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq); - float32x4_t v_result = vdivq_f32(v_fmax, v_denom); - vst1q_f32(&distances[i], v_result); - } -#endif - - // Scalar remainder - for (; i < count; i++) - { - float d = distances[i]; - float temp = d * inv_lambda; - distances[i] = fmax / (1.0f + temp * temp); - } -} - -// --------------------- -// Linear Kernel: strength = fmax * (1 - d / max_distance) -// --------------------- -// Operates in-place on a distance array, transforming distances to strengths. -// Parameters: fmax = maximum strength, max_distance = interaction max distance -inline void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance) -{ - int64_t i = 0; - float fmax_over_maxdist = fmax / max_distance; - -#if defined(EIDOS_HAS_AVX2) - __m256 v_fmax = _mm256_set1_ps(fmax); - __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist); - - for (; i + 8 <= count; i += 8) - { - __m256 v_dist = _mm256_loadu_ps(&distances[i]); - // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance) - __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist); - __m256 v_result = _mm256_sub_ps(v_fmax, v_term); - _mm256_storeu_ps(&distances[i], v_result); - } -#elif defined(EIDOS_HAS_NEON) - float32x4_t v_fmax = vdupq_n_f32(fmax); - float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist); - - for (; i + 4 <= count; i += 4) - { - float32x4_t v_dist = vld1q_f32(&distances[i]); - float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist); - float32x4_t v_result = vsubq_f32(v_fmax, v_term); - vst1q_f32(&distances[i], v_result); - } -#endif - - // Scalar remainder - for (; i < count; i++) - { - distances[i] = fmax - distances[i] * fmax_over_maxdist; - } -} - -// ================================ -// Convolution Helpers for SpatialMap::smooth() -// ================================ -// These functions compute vectorized dot products for convolution operations. -// They accumulate both kernel_sum and conv_sum (weighted sum) in a single pass. - -// --------------------- -// Convolution dot product: accumulates kernel_sum and kernel*values sum -// --------------------- -// Computes: kernel_sum += sum(kernel), conv_sum += sum(kernel * values) -// Used by SpatialMap convolution when all values are valid (no edge handling needed) -inline void convolve_dot_product_float64( - const double *kernel_ptr, const double *pixel_ptr, - int64_t count, double &kernel_sum, double &conv_sum) -{ - int64_t i = 0; - double local_ksum = 0.0; - double local_csum = 0.0; - -#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) - __m256d v_ksum = _mm256_setzero_pd(); - __m256d v_csum = _mm256_setzero_pd(); - - for (; i + 4 <= count; i += 4) - { - __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]); - __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]); - - v_ksum = _mm256_add_pd(v_ksum, v_kernel); - v_csum = _mm256_fmadd_pd(v_kernel, v_pixel, v_csum); - } - - // Horizontal sum - __m128d ksum_low = _mm256_castpd256_pd128(v_ksum); - __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1); - ksum_low = _mm_add_pd(ksum_low, ksum_high); - __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1); - ksum_low = _mm_add_sd(ksum_low, ksum_shuf); - local_ksum = _mm_cvtsd_f64(ksum_low); - - __m128d csum_low = _mm256_castpd256_pd128(v_csum); - __m128d csum_high = _mm256_extractf128_pd(v_csum, 1); - csum_low = _mm_add_pd(csum_low, csum_high); - __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1); - csum_low = _mm_add_sd(csum_low, csum_shuf); - local_csum = _mm_cvtsd_f64(csum_low); - -#elif defined(EIDOS_HAS_SSE42) - __m128d v_ksum = _mm_setzero_pd(); - __m128d v_csum = _mm_setzero_pd(); - - for (; i + 2 <= count; i += 2) - { - __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]); - __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]); - - v_ksum = _mm_add_pd(v_ksum, v_kernel); - __m128d v_prod = _mm_mul_pd(v_kernel, v_pixel); - v_csum = _mm_add_pd(v_csum, v_prod); - } - - // Horizontal sum - __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1); - v_ksum = _mm_add_sd(v_ksum, ksum_shuf); - local_ksum = _mm_cvtsd_f64(v_ksum); - - __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1); - v_csum = _mm_add_sd(v_csum, csum_shuf); - local_csum = _mm_cvtsd_f64(v_csum); - -#elif defined(EIDOS_HAS_NEON) - float64x2_t v_ksum = vdupq_n_f64(0.0); - float64x2_t v_csum = vdupq_n_f64(0.0); - - for (; i + 2 <= count; i += 2) - { - float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]); - float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]); - - v_ksum = vaddq_f64(v_ksum, v_kernel); - v_csum = vfmaq_f64(v_csum, v_kernel, v_pixel); - } - - local_ksum = vaddvq_f64(v_ksum); - local_csum = vaddvq_f64(v_csum); -#endif - - // Scalar remainder - for (; i < count; i++) - { - local_ksum += kernel_ptr[i]; - local_csum += kernel_ptr[i] * pixel_ptr[i]; - } - - kernel_sum += local_ksum; - conv_sum += local_csum; -} - -// --------------------- -// Scaled convolution dot product for edge handling -// --------------------- -// Like convolve_dot_product_float64 but scales kernel values by coverage factor. -// Used when handling edges where some kernel positions are out of bounds. -inline void convolve_dot_product_scaled_float64( - const double *kernel_ptr, const double *pixel_ptr, - int64_t count, double coverage, - double &kernel_sum, double &conv_sum) -{ - int64_t i = 0; - double local_ksum = 0.0; - double local_csum = 0.0; - -#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) - __m256d v_coverage = _mm256_set1_pd(coverage); - __m256d v_ksum = _mm256_setzero_pd(); - __m256d v_csum = _mm256_setzero_pd(); - - for (; i + 4 <= count; i += 4) - { - __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]); - __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]); - __m256d v_scaled_kernel = _mm256_mul_pd(v_kernel, v_coverage); - - v_ksum = _mm256_add_pd(v_ksum, v_scaled_kernel); - v_csum = _mm256_fmadd_pd(v_scaled_kernel, v_pixel, v_csum); - } - - // Horizontal sum - __m128d ksum_low = _mm256_castpd256_pd128(v_ksum); - __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1); - ksum_low = _mm_add_pd(ksum_low, ksum_high); - __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1); - ksum_low = _mm_add_sd(ksum_low, ksum_shuf); - local_ksum = _mm_cvtsd_f64(ksum_low); - - __m128d csum_low = _mm256_castpd256_pd128(v_csum); - __m128d csum_high = _mm256_extractf128_pd(v_csum, 1); - csum_low = _mm_add_pd(csum_low, csum_high); - __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1); - csum_low = _mm_add_sd(csum_low, csum_shuf); - local_csum = _mm_cvtsd_f64(csum_low); - -#elif defined(EIDOS_HAS_SSE42) - __m128d v_coverage = _mm_set1_pd(coverage); - __m128d v_ksum = _mm_setzero_pd(); - __m128d v_csum = _mm_setzero_pd(); - - for (; i + 2 <= count; i += 2) - { - __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]); - __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]); - __m128d v_scaled_kernel = _mm_mul_pd(v_kernel, v_coverage); - - v_ksum = _mm_add_pd(v_ksum, v_scaled_kernel); - __m128d v_prod = _mm_mul_pd(v_scaled_kernel, v_pixel); - v_csum = _mm_add_pd(v_csum, v_prod); - } - - // Horizontal sum - __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1); - v_ksum = _mm_add_sd(v_ksum, ksum_shuf); - local_ksum = _mm_cvtsd_f64(v_ksum); - - __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1); - v_csum = _mm_add_sd(v_csum, csum_shuf); - local_csum = _mm_cvtsd_f64(v_csum); - -#elif defined(EIDOS_HAS_NEON) - float64x2_t v_coverage = vdupq_n_f64(coverage); - float64x2_t v_ksum = vdupq_n_f64(0.0); - float64x2_t v_csum = vdupq_n_f64(0.0); - - for (; i + 2 <= count; i += 2) - { - float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]); - float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]); - float64x2_t v_scaled_kernel = vmulq_f64(v_kernel, v_coverage); - - v_ksum = vaddq_f64(v_ksum, v_scaled_kernel); - v_csum = vfmaq_f64(v_csum, v_scaled_kernel, v_pixel); - } - - local_ksum = vaddvq_f64(v_ksum); - local_csum = vaddvq_f64(v_csum); -#endif - - // Scalar remainder with scaling - for (; i < count; i++) - { - double scaled_k = kernel_ptr[i] * coverage; - local_ksum += scaled_k; - local_csum += scaled_k * pixel_ptr[i]; - } - - kernel_sum += local_ksum; - conv_sum += local_csum; -} - -} // namespace Eidos_SIMD - - -// stop suppressing warnings -#pragma clang diagnostic pop -#pragma GCC diagnostic pop - +#define X(ret, name, params) extern ret (*name) params; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} + +// Declarations of the scalar-tier kernels. These are the universal fallback; +// the function pointers above are statically initialized to point here, so a +// call is always valid even before Eidos_SIMD_Init() has run. The scalar tier +// is defined in eidos_simd_scalar.cpp and built on every platform. +namespace Eidos_SIMD_scalar { +#define X(ret, name, params) ret name params; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} + +// Each tier's translation unit defines a fill function that points the +// Eidos_SIMD pointers at that tier's kernels. Eidos_SIMD_Init() calls the +// appropriate one based on runtime CPU detection. +void Eidos_SIMD_Fill_scalar(void); +#if EIDOS_SIMD_DISPATCH_X86 +void Eidos_SIMD_Fill_sse42(void); +void Eidos_SIMD_Fill_avx2(void); +#endif +#if EIDOS_SIMD_DISPATCH_ARM +void Eidos_SIMD_Fill_neon(void); +#endif + +// Detect the CPU and select the fastest available SIMD tier. This must be +// called once, early in startup, before any Eidos_SIMD kernel is used; +// Eidos_WarmUp() does this. It is idempotent. +void Eidos_SIMD_Init(void); + +// Force a specific SIMD tier by name ("scalar", "SSE4.2", "AVX2+FMA", "NEON") +// instead of the auto-selected best tier. Returns false, leaving the active +// tier unchanged, if the named tier is unavailable on this CPU or in this +// build. Used by the SIMD self-tests to exercise every tier; calling +// Eidos_SIMD_Init() afterward restores the best tier for the CPU. +bool Eidos_SIMD_SelectTier(const char *tier_name); + +// The name of the SIMD tier selected at startup: "scalar", "SSE4.2", +// "AVX2+FMA", or "NEON". Valid only after Eidos_SIMD_Init() has run. +const char *Eidos_SIMD_ActiveTierName(void); + +// True if the active tier uses SLEEF for vectorized transcendental functions +// (exp, log, sin, ...). SLEEF is wired up for the AVX2+FMA and NEON tiers; +// the scalar and SSE4.2 tiers use std:: transcendentals. +bool Eidos_SIMD_SLEEFActive(void); #endif /* eidos_simd_h */ diff --git a/eidos/eidos_simd_avx2.cpp b/eidos/eidos_simd_avx2.cpp new file mode 100644 index 00000000..4084db5f --- /dev/null +++ b/eidos/eidos_simd_avx2.cpp @@ -0,0 +1,50 @@ +// +// eidos_simd_avx2.cpp +// Eidos +// +// Created by Andrew Kern on 5/21/2026. +// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + The AVX2+FMA SIMD tier (x86_64 only). This entire translation unit is built + with -mavx2 -mfma by the build system (see CMakeLists.txt); no other + translation unit gets those flags, so AVX2/FMA instructions (including those + inside the SLEEF transcendental headers pulled in here) are confined to this + file and reached only through the dispatch pointers after the CPU has been + confirmed to support AVX2 and FMA. + + On non-x86 architectures, when USE_SIMD=OFF, and on MSVC, this file compiles + to nothing. + + */ + +#include "eidos_simd.h" + +#if EIDOS_SIMD_DISPATCH_X86 + +#define EIDOS_HAS_AVX2 1 +#define EIDOS_HAS_FMA 1 +#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_avx2 +#include "eidos_simd_impl.h" + +void Eidos_SIMD_Fill_avx2(void) +{ +#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_avx2::name; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} + +#endif // EIDOS_SIMD_DISPATCH_X86 diff --git a/eidos/eidos_simd_impl.h b/eidos/eidos_simd_impl.h new file mode 100644 index 00000000..11734a35 --- /dev/null +++ b/eidos/eidos_simd_impl.h @@ -0,0 +1,1227 @@ +// +// eidos_simd_impl.h +// Eidos +// +// Created by Andrew Kern on 5/21/2026. +// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + Tier-parameterized SIMD kernel implementations for Eidos math operations. + + This file is the shared body of the SIMD runtime-dispatch system. It is + included once per instruction-set tier by eidos_simd_scalar.cpp, + eidos_simd_sse42.cpp, eidos_simd_avx2.cpp, and eidos_simd_neon.cpp. Each + includer first defines EIDOS_SIMD_IMPL_NAMESPACE and the EIDOS_HAS_* macro + for its tier, then includes this file; the preprocessor blocks below select + the matching intrinsics (or the scalar fallback). See eidos_simd.h for the + dispatch design and the rationale (issue #628). + + There is deliberately no include guard: this file is a template body meant + to be expanded into a different namespace in each translation unit that + includes it. Each translation unit must include it exactly once. + + */ + +#ifndef EIDOS_SIMD_IMPL_NAMESPACE +#error "Define EIDOS_SIMD_IMPL_NAMESPACE (and any EIDOS_HAS_* tier macro) before including eidos_simd_impl.h" +#endif + +#include +#include +#include // SLEEF's inline headers use memcpy() without including this themselves + +// Determine SIMD capability level +#if defined(EIDOS_HAS_AVX2) + #include + #define EIDOS_SIMD_WIDTH 4 // 4 doubles per AVX register + #define EIDOS_SIMD_FLOAT_WIDTH 8 // 8 floats per AVX register +#elif defined(EIDOS_HAS_SSE42) + #include + #include + #define EIDOS_SIMD_WIDTH 2 // 2 doubles per SSE register + #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per SSE register +#elif defined(EIDOS_HAS_NEON) + #include + #define EIDOS_SIMD_WIDTH 2 // 2 doubles per NEON register + #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per NEON register +#else + #define EIDOS_SIMD_WIDTH 1 // Scalar fallback + #define EIDOS_SIMD_FLOAT_WIDTH 1 +#endif + +// Include SLEEF for vectorized transcendental functions (exp, log, log10, log2) +// SLEEF is only used when AVX2+FMA or NEON is available +// BCH 12/31/2025: SLEEF generates tons of shadowed variable warnings for some reason; disable them +#if defined(EIDOS_HAS_AVX2) || defined(EIDOS_HAS_NEON) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Wimplicit-float-conversion" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wshadow" +#pragma clang diagnostic ignored "-Wdouble-promotion" +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wimplicit-float-conversion" +#include "sleef/sleef_config.h" +#pragma clang diagnostic pop +#pragma GCC diagnostic pop +#endif + +// Disable certain warnings for the remainder of this file +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Waggressive-loop-optimizations" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Waggressive-loop-optimizations" + + +// ================================ +// SIMD Vector Math Operations +// ================================ +// These functions apply an operation to arrays of doubles. +// They handle the loop, SIMD processing, and scalar remainder. + +namespace EIDOS_SIMD_IMPL_NAMESPACE { + +// --------------------- +// Square Root: sqrt(x) +// --------------------- +void sqrt_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + // Process 4 doubles at a time + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_sqrt_pd(v); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + // Process 2 doubles at a time + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_sqrt_pd(v); + _mm_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + // Process 2 doubles at a time + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vsqrtq_f64(v); + vst1q_f64(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::sqrt(input[i]); +} + +// --------------------- +// Absolute Value: abs(x) +// --------------------- +void abs_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + // Create sign mask (all bits except sign bit) + __m256d sign_mask = _mm256_set1_pd(-0.0); + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_andnot_pd(sign_mask, v); // Clear sign bit + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + __m128d sign_mask = _mm_set1_pd(-0.0); + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_andnot_pd(sign_mask, v); + _mm_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vabsq_f64(v); + vst1q_f64(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::fabs(input[i]); +} + +// --------------------- +// Floor: floor(x) +// --------------------- +void floor_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_floor_pd(v); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_floor_pd(v); + _mm_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndmq_f64(v); // Round toward minus infinity (floor) + vst1q_f64(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::floor(input[i]); +} + +// --------------------- +// Ceil: ceil(x) +// --------------------- +void ceil_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_ceil_pd(v); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_ceil_pd(v); + _mm_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndpq_f64(v); // Round toward plus infinity (ceil) + vst1q_f64(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::ceil(input[i]); +} + +// --------------------- +// Truncate: trunc(x) +// --------------------- +void trunc_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + _mm_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndq_f64(v); // Round toward zero (truncate) + vst1q_f64(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::trunc(input[i]); +} + +// --------------------- +// Round: round(x) +// --------------------- +void round_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndaq_f64(v); // Round to nearest, ties away from zero + vst1q_f64(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::round(input[i]); +} + +// --------------------- +// Exponential: exp(x) +// --------------------- +void exp_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_EXP_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::exp(input[i]); +} + +// --------------------- +// Natural Log: log(x) +// --------------------- +void log_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::log(input[i]); +} + +// --------------------- +// Log base 10: log10(x) +// --------------------- +void log10_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG10_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::log10(input[i]); +} + +// --------------------- +// Log base 2: log2(x) +// --------------------- +void log2_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_LOG2_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::log2(input[i]); +} + +// --------------------- +// Sine: sin(x) +// --------------------- +void sin_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_SIN_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::sin(input[i]); +} + +// --------------------- +// Cosine: cos(x) +// --------------------- +void cos_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_COS_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::cos(input[i]); +} + +// --------------------- +// Tangent: tan(x) +// --------------------- +void tan_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_TAN_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::tan(input[i]); +} + +// --------------------- +// Arc Sine: asin(x) +// --------------------- +void asin_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ASIN_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::asin(input[i]); +} + +// --------------------- +// Arc Cosine: acos(x) +// --------------------- +void acos_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ACOS_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::acos(input[i]); +} + +// --------------------- +// Arc Tangent: atan(x) +// --------------------- +void atan_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D v = EIDOS_SLEEF_LOAD_D(&input[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN_D(v); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::atan(input[i]); +} + +// --------------------- +// Arc Tangent 2: atan2(y, x) +// --------------------- +void atan2_float64(const double *y, const double *x, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D vy = EIDOS_SLEEF_LOAD_D(&y[i]); + EIDOS_SLEEF_TYPE_D vx = EIDOS_SLEEF_LOAD_D(&x[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_ATAN2_D(vy, vx); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::atan2(y[i], x[i]); +} + +// --------------------- +// Power: pow(x, y) = x^y +// --------------------- +void pow_float64(const double *base, const double *exp, double *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE <= count; i += EIDOS_SLEEF_VEC_SIZE) + { + EIDOS_SLEEF_TYPE_D vb = EIDOS_SLEEF_LOAD_D(&base[i]); + EIDOS_SLEEF_TYPE_D ve = EIDOS_SLEEF_LOAD_D(&exp[i]); + EIDOS_SLEEF_TYPE_D r = EIDOS_SLEEF_POW_D(vb, ve); + EIDOS_SLEEF_STORE_D(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::pow(base[i], exp[i]); +} + +// Broadcast version: all elements raised to same power (base_array ^ scalar_exp) +void pow_float64_scalar_exp(const double *base, double exp_scalar, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) + __m256d ve_broadcast = _mm256_set1_pd(exp_scalar); + for (; i + 4 <= count; i += 4) + { + __m256d vb = _mm256_loadu_pd(&base[i]); + __m256d r = Sleef_powd4_u10avx2(vb, ve_broadcast); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + float64x2_t ve_broadcast = vdupq_n_f64(exp_scalar); + for (; i + 2 <= count; i += 2) + { + float64x2_t vb = vld1q_f64(&base[i]); + float64x2_t r = Sleef_powd2_u10advsimd(vb, ve_broadcast); + vst1q_f64(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::pow(base[i], exp_scalar); +} + +// Broadcast version: scalar base raised to array of powers (scalar_base ^ exp_array) +void pow_float64_scalar_base(double base_scalar, const double *exp, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) + __m256d vb_broadcast = _mm256_set1_pd(base_scalar); + for (; i + 4 <= count; i += 4) + { + __m256d ve = _mm256_loadu_pd(&exp[i]); + __m256d r = Sleef_powd4_u10avx2(vb_broadcast, ve); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_NEON) + float64x2_t vb_broadcast = vdupq_n_f64(base_scalar); + for (; i + 2 <= count; i += 2) + { + float64x2_t ve = vld1q_f64(&exp[i]); + float64x2_t r = Sleef_powd2_u10advsimd(vb_broadcast, ve); + vst1q_f64(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::pow(base_scalar, exp[i]); +} + +// ================================ +// Reductions +// ================================ + +// --------------------- +// Sum: sum(x) +// --------------------- +double sum_float64(const double *input, int64_t count) +{ + double sum = 0.0; + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + __m256d vsum = _mm256_setzero_pd(); + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + vsum = _mm256_add_pd(vsum, v); + } + // Horizontal sum of 4 doubles + __m128d vlow = _mm256_castpd256_pd128(vsum); + __m128d vhigh = _mm256_extractf128_pd(vsum, 1); + vlow = _mm_add_pd(vlow, vhigh); // 2 doubles + __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1); + vlow = _mm_add_sd(vlow, shuf); // 1 double + sum = _mm_cvtsd_f64(vlow); +#elif defined(EIDOS_HAS_SSE42) + __m128d vsum = _mm_setzero_pd(); + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + vsum = _mm_add_pd(vsum, v); + } + // Horizontal sum of 2 doubles + __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1); + vsum = _mm_add_sd(vsum, shuf); + sum = _mm_cvtsd_f64(vsum); +#elif defined(EIDOS_HAS_NEON) + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + vsum = vaddq_f64(vsum, v); + } + // Horizontal sum of 2 doubles + sum = vaddvq_f64(vsum); +#endif + + // Scalar remainder + for (; i < count; i++) + sum += input[i]; + + return sum; +} + +// --------------------- +// Product: product(x) +// --------------------- +double product_float64(const double *input, int64_t count) +{ + double prod = 1.0; + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + __m256d vprod = _mm256_set1_pd(1.0); + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + vprod = _mm256_mul_pd(vprod, v); + } + // Horizontal product of 4 doubles + __m128d vlow = _mm256_castpd256_pd128(vprod); + __m128d vhigh = _mm256_extractf128_pd(vprod, 1); + vlow = _mm_mul_pd(vlow, vhigh); + __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1); + vlow = _mm_mul_sd(vlow, shuf); + prod = _mm_cvtsd_f64(vlow); +#elif defined(EIDOS_HAS_SSE42) + __m128d vprod = _mm_set1_pd(1.0); + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + vprod = _mm_mul_pd(vprod, v); + } + __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1); + vprod = _mm_mul_sd(vprod, shuf); + prod = _mm_cvtsd_f64(vprod); +#elif defined(EIDOS_HAS_NEON) + float64x2_t vprod = vdupq_n_f64(1.0); + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + vprod = vmulq_f64(vprod, v); + } + // Horizontal product of 2 doubles + prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1); +#endif + + for (; i < count; i++) + prod *= input[i]; + + return prod; +} + +// ================================ +// Float (Single-Precision) SIMD Operations +// ================================ +// These functions operate on arrays of floats, used by spatial interaction kernels. + +// --------------------- +// Exponential: exp(x) for floats +// --------------------- +void exp_float32(const float *input, float *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]); + EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v); + EIDOS_SLEEF_STORE_F(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::exp(input[i]); +} + +// --------------------- +// Exponential Kernel: strength = fmax * exp(-lambda * distance) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// This is optimized for the spatial interaction kernel calculation. +void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + // We need to compute: fmax * exp(-lambda * distance) + // First, compute -lambda * distance for all values, then exp, then multiply by fmax + EIDOS_SLEEF_TYPE_F v_fmax = +#if defined(EIDOS_HAS_AVX2) + _mm256_set1_ps(fmax); + EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda); +#elif defined(EIDOS_HAS_NEON) + vdupq_n_f32(fmax); + EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute -lambda * distance +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist); +#endif + + // Compute exp(-lambda * distance) + EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); + + // Compute fmax * exp(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + distances[i] = fmax * std::exp(-lambda * distances[i]); +} + +// --------------------- +// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency. +void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + // We need to compute: fmax * exp(-distance^2 / two_sigma_sq) + EIDOS_SLEEF_TYPE_F v_fmax = +#if defined(EIDOS_HAS_AVX2) + _mm256_set1_ps(fmax); + EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq); +#elif defined(EIDOS_HAS_NEON) + vdupq_n_f32(fmax); + EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute distance^2 +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist); + // Compute -distance^2 / 2sigma^2 + EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist); + // Compute -distance^2 / 2sigma^2 + EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq); +#endif + + // Compute exp(-distance^2 / 2sigma^2) + EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); + + // Compute fmax * exp(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq); + } +} + +// --------------------- +// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale +void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau) +{ + int64_t i = 0; + + // Pre-compute constants + float inv_tau = 1.0f / tau; + float inv_nu = 1.0f / nu; + float exponent = (nu + 1.0f) / 2.0f; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one; +#if defined(EIDOS_HAS_AVX2) + v_fmax = _mm256_set1_ps(fmax); + v_inv_tau = _mm256_set1_ps(inv_tau); + v_inv_nu = _mm256_set1_ps(inv_nu); + v_exponent = _mm256_set1_ps(-exponent); + v_one = _mm256_set1_ps(1.0f); +#elif defined(EIDOS_HAS_NEON) + v_fmax = vdupq_n_f32(fmax); + v_inv_tau = vdupq_n_f32(inv_tau); + v_inv_nu = vdupq_n_f32(inv_nu); + v_exponent = vdupq_n_f32(-exponent); + v_one = vdupq_n_f32(1.0f); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute (d / tau) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau); + // Compute (d/tau)^2 + EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau); + // Compute (d/tau)^2 / nu + EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu); + // Compute 1 + (d/tau)^2 / nu + EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau); + EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau); + EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu); + EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term); +#endif + + // Compute pow(base, -exponent) = 1 / pow(base, exponent) + EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent); + + // Compute fmax * pow(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + float d_over_tau = d * inv_tau; + distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent); + } +} + +// --------------------- +// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, lambda = scale parameter +void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda) +{ + int64_t i = 0; + float inv_lambda = 1.0f / lambda; + +#if defined(EIDOS_HAS_AVX2) + __m256 v_fmax = _mm256_set1_ps(fmax); + __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda); + __m256 v_one = _mm256_set1_ps(1.0f); + + for (; i + 8 <= count; i += 8) + { + __m256 v_dist = _mm256_loadu_ps(&distances[i]); + __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda); // d/lambda + __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp); // (d/lambda)^2 + __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq); // 1 + (d/lambda)^2 + __m256 v_result = _mm256_div_ps(v_fmax, v_denom); // fmax / denom + _mm256_storeu_ps(&distances[i], v_result); + } +#elif defined(EIDOS_HAS_NEON) + float32x4_t v_fmax = vdupq_n_f32(fmax); + float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda); + float32x4_t v_one = vdupq_n_f32(1.0f); + + for (; i + 4 <= count; i += 4) + { + float32x4_t v_dist = vld1q_f32(&distances[i]); + float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda); + float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp); + float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq); + float32x4_t v_result = vdivq_f32(v_fmax, v_denom); + vst1q_f32(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + float temp = d * inv_lambda; + distances[i] = fmax / (1.0f + temp * temp); + } +} + +// --------------------- +// Linear Kernel: strength = fmax * (1 - d / max_distance) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, max_distance = interaction max distance +void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance) +{ + int64_t i = 0; + float fmax_over_maxdist = fmax / max_distance; + +#if defined(EIDOS_HAS_AVX2) + __m256 v_fmax = _mm256_set1_ps(fmax); + __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist); + + for (; i + 8 <= count; i += 8) + { + __m256 v_dist = _mm256_loadu_ps(&distances[i]); + // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance) + __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist); + __m256 v_result = _mm256_sub_ps(v_fmax, v_term); + _mm256_storeu_ps(&distances[i], v_result); + } +#elif defined(EIDOS_HAS_NEON) + float32x4_t v_fmax = vdupq_n_f32(fmax); + float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist); + + for (; i + 4 <= count; i += 4) + { + float32x4_t v_dist = vld1q_f32(&distances[i]); + float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist); + float32x4_t v_result = vsubq_f32(v_fmax, v_term); + vst1q_f32(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + distances[i] = fmax - distances[i] * fmax_over_maxdist; + } +} + +// ================================ +// Convolution Helpers for SpatialMap::smooth() +// ================================ +// These functions compute vectorized dot products for convolution operations. +// They accumulate both kernel_sum and conv_sum (weighted sum) in a single pass. + +// --------------------- +// Convolution dot product: accumulates kernel_sum and kernel*values sum +// --------------------- +// Computes: kernel_sum += sum(kernel), conv_sum += sum(kernel * values) +// Used by SpatialMap convolution when all values are valid (no edge handling needed) +void convolve_dot_product_float64( + const double *kernel_ptr, const double *pixel_ptr, + int64_t count, double &kernel_sum, double &conv_sum) +{ + int64_t i = 0; + double local_ksum = 0.0; + double local_csum = 0.0; + +#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) + __m256d v_ksum = _mm256_setzero_pd(); + __m256d v_csum = _mm256_setzero_pd(); + + for (; i + 4 <= count; i += 4) + { + __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]); + __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]); + + v_ksum = _mm256_add_pd(v_ksum, v_kernel); + v_csum = _mm256_fmadd_pd(v_kernel, v_pixel, v_csum); + } + + // Horizontal sum + __m128d ksum_low = _mm256_castpd256_pd128(v_ksum); + __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1); + ksum_low = _mm_add_pd(ksum_low, ksum_high); + __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1); + ksum_low = _mm_add_sd(ksum_low, ksum_shuf); + local_ksum = _mm_cvtsd_f64(ksum_low); + + __m128d csum_low = _mm256_castpd256_pd128(v_csum); + __m128d csum_high = _mm256_extractf128_pd(v_csum, 1); + csum_low = _mm_add_pd(csum_low, csum_high); + __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1); + csum_low = _mm_add_sd(csum_low, csum_shuf); + local_csum = _mm_cvtsd_f64(csum_low); + +#elif defined(EIDOS_HAS_SSE42) + __m128d v_ksum = _mm_setzero_pd(); + __m128d v_csum = _mm_setzero_pd(); + + for (; i + 2 <= count; i += 2) + { + __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]); + __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]); + + v_ksum = _mm_add_pd(v_ksum, v_kernel); + __m128d v_prod = _mm_mul_pd(v_kernel, v_pixel); + v_csum = _mm_add_pd(v_csum, v_prod); + } + + // Horizontal sum + __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1); + v_ksum = _mm_add_sd(v_ksum, ksum_shuf); + local_ksum = _mm_cvtsd_f64(v_ksum); + + __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1); + v_csum = _mm_add_sd(v_csum, csum_shuf); + local_csum = _mm_cvtsd_f64(v_csum); + +#elif defined(EIDOS_HAS_NEON) + float64x2_t v_ksum = vdupq_n_f64(0.0); + float64x2_t v_csum = vdupq_n_f64(0.0); + + for (; i + 2 <= count; i += 2) + { + float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]); + float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]); + + v_ksum = vaddq_f64(v_ksum, v_kernel); + v_csum = vfmaq_f64(v_csum, v_kernel, v_pixel); + } + + local_ksum = vaddvq_f64(v_ksum); + local_csum = vaddvq_f64(v_csum); +#endif + + // Scalar remainder + for (; i < count; i++) + { + local_ksum += kernel_ptr[i]; + local_csum += kernel_ptr[i] * pixel_ptr[i]; + } + + kernel_sum += local_ksum; + conv_sum += local_csum; +} + +// --------------------- +// Scaled convolution dot product for edge handling +// --------------------- +// Like convolve_dot_product_float64 but scales kernel values by coverage factor. +// Used when handling edges where some kernel positions are out of bounds. +void convolve_dot_product_scaled_float64( + const double *kernel_ptr, const double *pixel_ptr, + int64_t count, double coverage, + double &kernel_sum, double &conv_sum) +{ + int64_t i = 0; + double local_ksum = 0.0; + double local_csum = 0.0; + +#if defined(EIDOS_HAS_AVX2) && defined(EIDOS_HAS_FMA) + __m256d v_coverage = _mm256_set1_pd(coverage); + __m256d v_ksum = _mm256_setzero_pd(); + __m256d v_csum = _mm256_setzero_pd(); + + for (; i + 4 <= count; i += 4) + { + __m256d v_kernel = _mm256_loadu_pd(&kernel_ptr[i]); + __m256d v_pixel = _mm256_loadu_pd(&pixel_ptr[i]); + __m256d v_scaled_kernel = _mm256_mul_pd(v_kernel, v_coverage); + + v_ksum = _mm256_add_pd(v_ksum, v_scaled_kernel); + v_csum = _mm256_fmadd_pd(v_scaled_kernel, v_pixel, v_csum); + } + + // Horizontal sum + __m128d ksum_low = _mm256_castpd256_pd128(v_ksum); + __m128d ksum_high = _mm256_extractf128_pd(v_ksum, 1); + ksum_low = _mm_add_pd(ksum_low, ksum_high); + __m128d ksum_shuf = _mm_shuffle_pd(ksum_low, ksum_low, 1); + ksum_low = _mm_add_sd(ksum_low, ksum_shuf); + local_ksum = _mm_cvtsd_f64(ksum_low); + + __m128d csum_low = _mm256_castpd256_pd128(v_csum); + __m128d csum_high = _mm256_extractf128_pd(v_csum, 1); + csum_low = _mm_add_pd(csum_low, csum_high); + __m128d csum_shuf = _mm_shuffle_pd(csum_low, csum_low, 1); + csum_low = _mm_add_sd(csum_low, csum_shuf); + local_csum = _mm_cvtsd_f64(csum_low); + +#elif defined(EIDOS_HAS_SSE42) + __m128d v_coverage = _mm_set1_pd(coverage); + __m128d v_ksum = _mm_setzero_pd(); + __m128d v_csum = _mm_setzero_pd(); + + for (; i + 2 <= count; i += 2) + { + __m128d v_kernel = _mm_loadu_pd(&kernel_ptr[i]); + __m128d v_pixel = _mm_loadu_pd(&pixel_ptr[i]); + __m128d v_scaled_kernel = _mm_mul_pd(v_kernel, v_coverage); + + v_ksum = _mm_add_pd(v_ksum, v_scaled_kernel); + __m128d v_prod = _mm_mul_pd(v_scaled_kernel, v_pixel); + v_csum = _mm_add_pd(v_csum, v_prod); + } + + // Horizontal sum + __m128d ksum_shuf = _mm_shuffle_pd(v_ksum, v_ksum, 1); + v_ksum = _mm_add_sd(v_ksum, ksum_shuf); + local_ksum = _mm_cvtsd_f64(v_ksum); + + __m128d csum_shuf = _mm_shuffle_pd(v_csum, v_csum, 1); + v_csum = _mm_add_sd(v_csum, csum_shuf); + local_csum = _mm_cvtsd_f64(v_csum); + +#elif defined(EIDOS_HAS_NEON) + float64x2_t v_coverage = vdupq_n_f64(coverage); + float64x2_t v_ksum = vdupq_n_f64(0.0); + float64x2_t v_csum = vdupq_n_f64(0.0); + + for (; i + 2 <= count; i += 2) + { + float64x2_t v_kernel = vld1q_f64(&kernel_ptr[i]); + float64x2_t v_pixel = vld1q_f64(&pixel_ptr[i]); + float64x2_t v_scaled_kernel = vmulq_f64(v_kernel, v_coverage); + + v_ksum = vaddq_f64(v_ksum, v_scaled_kernel); + v_csum = vfmaq_f64(v_csum, v_scaled_kernel, v_pixel); + } + + local_ksum = vaddvq_f64(v_ksum); + local_csum = vaddvq_f64(v_csum); +#endif + + // Scalar remainder with scaling + for (; i < count; i++) + { + double scaled_k = kernel_ptr[i] * coverage; + local_ksum += scaled_k; + local_csum += scaled_k * pixel_ptr[i]; + } + + kernel_sum += local_ksum; + conv_sum += local_csum; +} + +} // namespace EIDOS_SIMD_IMPL_NAMESPACE + + +// stop suppressing warnings +#pragma clang diagnostic pop +#pragma GCC diagnostic pop diff --git a/eidos/eidos_simd_neon.cpp b/eidos/eidos_simd_neon.cpp new file mode 100644 index 00000000..297117b4 --- /dev/null +++ b/eidos/eidos_simd_neon.cpp @@ -0,0 +1,44 @@ +// +// eidos_simd_neon.cpp +// Eidos +// +// Created by Andrew Kern on 5/21/2026. +// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + The NEON SIMD tier (ARM64 only). NEON is part of the baseline ARM64 + architecture, so this translation unit needs no special compiler flags; it is + simply compiled normally on ARM64. On x86 architectures, when USE_SIMD=OFF, + and on MSVC, this file compiles to nothing. + + */ + +#include "eidos_simd.h" + +#if EIDOS_SIMD_DISPATCH_ARM + +#define EIDOS_HAS_NEON 1 +#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_neon +#include "eidos_simd_impl.h" + +void Eidos_SIMD_Fill_neon(void) +{ +#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_neon::name; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} + +#endif // EIDOS_SIMD_DISPATCH_ARM diff --git a/eidos/eidos_simd_scalar.cpp b/eidos/eidos_simd_scalar.cpp new file mode 100644 index 00000000..260097ec --- /dev/null +++ b/eidos/eidos_simd_scalar.cpp @@ -0,0 +1,40 @@ +// +// eidos_simd_scalar.cpp +// Eidos +// +// Created by Andrew Kern on 5/21/2026. +// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + The scalar SIMD tier: portable C++ with no instruction-set intrinsics, built + on every platform and compiled at the plain baseline ABI. This is the + universal fallback and the static default for the Eidos_SIMD pointers. + + */ + +#include "eidos_simd.h" + +// No EIDOS_HAS_* macro is defined, so eidos_simd_impl.h expands to the scalar +// code paths. +#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_scalar +#include "eidos_simd_impl.h" + +void Eidos_SIMD_Fill_scalar(void) +{ +#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_scalar::name; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} diff --git a/eidos/eidos_simd_sse42.cpp b/eidos/eidos_simd_sse42.cpp new file mode 100644 index 00000000..3680cee6 --- /dev/null +++ b/eidos/eidos_simd_sse42.cpp @@ -0,0 +1,47 @@ +// +// eidos_simd_sse42.cpp +// Eidos +// +// Created by Andrew Kern on 5/21/2026. +// Copyright (c) 2024-2025 Benjamin C. Haller. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + The SSE4.2 SIMD tier (x86_64 only). This entire translation unit is built + with -msse4.2 by the build system (see CMakeLists.txt); no other translation + unit gets that flag, so the SSE4.2 code is confined here and reached only + through the dispatch pointers after the CPU has been confirmed to support it. + + On non-x86 architectures, when USE_SIMD=OFF, and on MSVC, this file compiles + to nothing. + + */ + +#include "eidos_simd.h" + +#if EIDOS_SIMD_DISPATCH_X86 + +#define EIDOS_HAS_SSE42 1 +#define EIDOS_SIMD_IMPL_NAMESPACE Eidos_SIMD_sse42 +#include "eidos_simd_impl.h" + +void Eidos_SIMD_Fill_sse42(void) +{ +#define X(ret, name, params) Eidos_SIMD::name = &Eidos_SIMD_sse42::name; +EIDOS_SIMD_FUNCTION_TABLE +#undef X +} + +#endif // EIDOS_SIMD_DISPATCH_X86 diff --git a/eidos/eidos_test_functions_math.cpp b/eidos/eidos_test_functions_math.cpp index a062793d..52847fb2 100644 --- a/eidos/eidos_test_functions_math.cpp +++ b/eidos/eidos_test_functions_math.cpp @@ -1303,12 +1303,34 @@ static void _TestAtan2SimdFunction(const double *y_values, const double *x_value } } -void _RunSIMDMathTests(void) +// Test a reduction SIMD function (sum, product) against a scalar reference +template +static void _TestReductionSimdFunction(const char *name, ScalarReduce scalar_reduce, SimdReduce simd_reduce, + const double *test_values, int num_values, double tolerance = 1e-12) { -#if !defined(EIDOS_SLEEF_AVAILABLE) || !EIDOS_SLEEF_AVAILABLE - std::cout << "NOTE: SLEEF is not available in this build; SIMD math tests will compare scalar fallback against std:: (trivially identical)." << std::endl; -#endif + double scalar_result = scalar_reduce(test_values, num_values); + double simd_result = simd_reduce(test_values, num_values); + double diff = std::abs(simd_result - scalar_result); + double max_val = std::max(std::abs(scalar_result), std::abs(simd_result)); + if (max_val > 1.0) + diff /= max_val; + + if (diff <= tolerance) + { + gEidosTestSuccessCount++; + } + else + { + gEidosTestFailureCount++; + std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD " << name << "() test failed (scalar=" << scalar_result << ", simd=" << simd_result << ")" << std::endl; + } +} +// Run the full SIMD math test battery against whichever tier is currently +// installed in the Eidos_SIMD dispatch table. _RunSIMDMathTests() calls this +// once for each instruction-set tier the CPU supports. +static void _RunSIMDMathTests_CurrentTier(void) +{ // Test values for trigonometric functions (in radians) // Include various edge cases and values that span multiple SIMD vector widths const double trig_test_values[] = { @@ -1530,6 +1552,179 @@ void _RunSIMDMathTests(void) std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD pow_scalar_base() test failed" << std::endl; } } + + // Non-transcendental kernels: sqrt, abs, the rounding family, the reductions, + // and the convolution helpers each have distinct SSE4.2 / AVX2 / NEON code, + // so cycling the tiers in _RunSIMDMathTests() is what covers all of them. + _TestUnarySimdFunction("sqrt", [](double x) { return std::sqrt(x); }, + Eidos_SIMD::sqrt_float64, trig_test_values, num_trig_values); + _TestUnarySimdFunction("abs", [](double x) { return std::fabs(x); }, + Eidos_SIMD::abs_float64, trig_test_values, num_trig_values); + _TestUnarySimdFunction("floor", [](double x) { return std::floor(x); }, + Eidos_SIMD::floor_float64, trig_test_values, num_trig_values); + _TestUnarySimdFunction("ceil", [](double x) { return std::ceil(x); }, + Eidos_SIMD::ceil_float64, trig_test_values, num_trig_values); + _TestUnarySimdFunction("trunc", [](double x) { return std::trunc(x); }, + Eidos_SIMD::trunc_float64, trig_test_values, num_trig_values); + + // round() test values deliberately avoid exact halves: the SIMD path rounds + // half-to-even while std::round() rounds half-away-from-zero, so the two + // agree only away from the .5 tie point. + const double round_test_values[] = { + 0.0, 7.0, -7.0, + 0.3, 0.7, 1.2, 1.8, 2.4, 2.9, 3.1, 3.6, + -0.3, -0.7, -1.2, -1.8, -2.4, -2.9, -3.1, -3.6, + 100.4, 100.7, -100.4, -100.7 + }; + const int num_round_values = (int)(sizeof(round_test_values) / sizeof(double)); + _TestUnarySimdFunction("round", [](double x) { return std::round(x); }, + Eidos_SIMD::round_float64, round_test_values, num_round_values); + + // Reductions: sum and product + const double reduction_test_values[] = { + 0.5, 1.5, 2.0, 0.25, 1.0, 3.0, 0.8, 1.2, 2.5, 0.4, 1.1, 0.9, 2.0, 0.6 + }; + const int num_reduction_values = (int)(sizeof(reduction_test_values) / sizeof(double)); + _TestReductionSimdFunction("sum", + [](const double *v, int n) { double s = 0.0; for (int i = 0; i < n; i++) s += v[i]; return s; }, + Eidos_SIMD::sum_float64, reduction_test_values, num_reduction_values); + _TestReductionSimdFunction("product", + [](const double *v, int n) { double p = 1.0; for (int i = 0; i < n; i++) p *= v[i]; return p; }, + Eidos_SIMD::product_float64, reduction_test_values, num_reduction_values); + + // Convolution dot products (used by SpatialMap::smooth()) + { + const double conv_kernel[] = { 0.1, 0.2, 0.15, 0.05, 0.25, 0.1, 0.3, 0.2, 0.12, 0.18, 0.22, 0.08, 0.14, 0.16 }; + const double conv_pixel[] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; + const int num_conv = (int)(sizeof(conv_kernel) / sizeof(double)); + double ref_ksum = 0.0, ref_csum = 0.0; + for (int i = 0; i < num_conv; i++) { ref_ksum += conv_kernel[i]; ref_csum += conv_kernel[i] * conv_pixel[i]; } + double simd_ksum = 0.0, simd_csum = 0.0; + Eidos_SIMD::convolve_dot_product_float64(conv_kernel, conv_pixel, num_conv, simd_ksum, simd_csum); + if ((std::abs(simd_ksum - ref_ksum) <= 1e-9) && (std::abs(simd_csum - ref_csum) <= 1e-9)) + gEidosTestSuccessCount++; + else + { + gEidosTestFailureCount++; + std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD convolve_dot_product() test failed" << std::endl; + } + const double coverage = 0.75; + double ref_sksum = 0.0, ref_scsum = 0.0; + for (int i = 0; i < num_conv; i++) { double sk = conv_kernel[i] * coverage; ref_sksum += sk; ref_scsum += sk * conv_pixel[i]; } + double simd_sksum = 0.0, simd_scsum = 0.0; + Eidos_SIMD::convolve_dot_product_scaled_float64(conv_kernel, conv_pixel, num_conv, coverage, simd_sksum, simd_scsum); + if ((std::abs(simd_sksum - ref_sksum) <= 1e-9) && (std::abs(simd_scsum - ref_scsum) <= 1e-9)) + gEidosTestSuccessCount++; + else + { + gEidosTestFailureCount++; + std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD convolve_dot_product_scaled() test failed" << std::endl; + } + } + + // Single-precision exponential + { + const float exp_f_values[] = { + 0.0f, 0.1f, 0.5f, 1.0f, 2.0f, 5.0f, -0.1f, -0.5f, -1.0f, -2.0f, -5.0f, 0.01f, 3.3f, -3.3f + }; + const int num_exp_f = (int)(sizeof(exp_f_values) / sizeof(float)); + std::vector exp_f_out(num_exp_f); + Eidos_SIMD::exp_float32(exp_f_values, exp_f_out.data(), num_exp_f); + bool all_match = true; + for (int i = 0; i < num_exp_f; i++) + { + float ref = std::exp(exp_f_values[i]); + float fdiff = std::fabs(exp_f_out[i] - ref); + float fmax = std::max(std::fabs(ref), std::fabs(exp_f_out[i])); + if (fmax > 1.0f) + fdiff /= fmax; + if (fdiff > 1e-5f) + all_match = false; + } + if (all_match) + gEidosTestSuccessCount++; + else + { + gEidosTestFailureCount++; + std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD exp_float32() test failed" << std::endl; + } + } + + // Single-precision spatial-interaction kernels (in-place transforms) + { + const float kdist[] = { + 0.0f, 0.1f, 0.25f, 0.5f, 0.8f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 0.33f, 0.66f, 1.1f, 1.7f + }; + const int num_k = (int)(sizeof(kdist) / sizeof(float)); + const float kfmax = 1.5f; + struct KernelCase { const char *name; int which; float p1, p2; }; + const KernelCase kernels[] = { + { "exp_kernel_float32", 0, 0.7f, 0.0f }, + { "normal_kernel_float32", 1, 2.0f, 0.0f }, + { "tdist_kernel_float32", 2, 3.0f, 1.2f }, + { "cauchy_kernel_float32", 3, 0.9f, 0.0f }, + { "linear_kernel_float32", 4, 5.0f, 0.0f } + }; + for (const KernelCase &k : kernels) + { + std::vector ref(kdist, kdist + num_k); + std::vector simd(kdist, kdist + num_k); + for (int i = 0; i < num_k; i++) + { + float d = ref[i]; + if (k.which == 0) ref[i] = kfmax * std::exp(-k.p1 * d); + else if (k.which == 1) ref[i] = kfmax * std::exp(-(d * d) / k.p1); + else if (k.which == 2) { float dt = d / k.p2; ref[i] = kfmax * std::pow(1.0f + dt * dt / k.p1, -(k.p1 + 1.0f) / 2.0f); } + else if (k.which == 3) { float dl = d / k.p1; ref[i] = kfmax / (1.0f + dl * dl); } + else ref[i] = kfmax - d * (kfmax / k.p1); + } + if (k.which == 0) Eidos_SIMD::exp_kernel_float32(simd.data(), num_k, kfmax, k.p1); + else if (k.which == 1) Eidos_SIMD::normal_kernel_float32(simd.data(), num_k, kfmax, k.p1); + else if (k.which == 2) Eidos_SIMD::tdist_kernel_float32(simd.data(), num_k, kfmax, k.p1, k.p2); + else if (k.which == 3) Eidos_SIMD::cauchy_kernel_float32(simd.data(), num_k, kfmax, k.p1); + else Eidos_SIMD::linear_kernel_float32(simd.data(), num_k, kfmax, k.p1); + bool all_match = true; + for (int i = 0; i < num_k; i++) + { + float diff = std::fabs(simd[i] - ref[i]); + float mx = std::max(std::fabs(ref[i]), std::fabs(simd[i])); + if (mx > 1.0f) + diff /= mx; + if (diff > 1e-4f) + all_match = false; + } + if (all_match) + gEidosTestSuccessCount++; + else + { + gEidosTestFailureCount++; + std::cerr << EIDOS_OUTPUT_FAILURE_TAG << " : SIMD " << k.name << "() test failed" << std::endl; + } + } + } +} + +void _RunSIMDMathTests(void) +{ + // The SIMD kernels are compiled once per instruction-set tier (scalar, + // SSE4.2, AVX2+FMA, NEON); normal execution only runs the single tier the + // CPU selected at startup. To test, and cover, every tier's kernels, cycle + // through all tiers available on this CPU, then restore the best one. + static const char * const tier_names[] = { "scalar", "SSE4.2", "AVX2+FMA", "NEON" }; + + for (const char * const tier_name : tier_names) + { + if (!Eidos_SIMD_SelectTier(tier_name)) + continue; + std::cout << "Testing SIMD kernels: " << Eidos_SIMD_ActiveTierName() << " tier"; + if (!Eidos_SIMD_SLEEFActive()) + std::cout << " (transcendentals use the std:: fallback)"; + std::cout << std::endl; + _RunSIMDMathTests_CurrentTier(); + } + + // Restore the fastest tier for this CPU, undoing the cycling above. + Eidos_SIMD_Init(); }