From a04bbb7330844510be2fbb23dc9d5065cc60a60f Mon Sep 17 00:00:00 2001 From: hadeer Date: Fri, 20 Feb 2026 04:46:05 +0200 Subject: [PATCH 1/7] mhnsw: compiler-independent choice of CPU-specific optimizations Replace GCC function multiversioning (__attribute__((target(...)))) with runtime CPU detection using function pointers. At startup, cpuid selects the optimal SIMD implementation (AVX512, AVX2) via a Vector_ops struct. Move x86 CPU detection into a separate vector_mhnsw_x86.cc, compiled with appropriate -mavx2/-mavx512 flags. This makes the dispatching work with musl libc and MSVC, which do not support GCC function multiversioning. Benchmark (1M iterations, 1024 dimensions, AVX512): 0.039602 s total, 0.039602 us per call --- sql/CMakeLists.txt | 7 ++ sql/bloom_filters.h | 34 +++--- sql/vector_mhnsw.cc | 219 +++++++++++++++++++++++++++----------- sql/vector_mhnsw_bench.cc | 10 ++ sql/vector_mhnsw_x86.cc | 89 ++++++++++++++++ 5 files changed, 281 insertions(+), 78 deletions(-) create mode 100644 sql/vector_mhnsw_bench.cc create mode 100644 sql/vector_mhnsw_x86.cc diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 32af809849be1..51f1528506f8f 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -207,6 +207,10 @@ SET (SQL_SOURCE ${MYSYS_LIBWRAP_SOURCE} ) +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86") + LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc) +ENDIF() + MY_CHECK_CXX_COMPILER_FLAG(-Wno-unused-but-set-variable) IF(have_CXX__Wno_unused_but_set_variable) IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" @@ -553,6 +557,9 @@ IF(WIN32) TARGET_LINK_LIBRARIES(mariadb-upgrade-service mysys winservice) ENDIF(WIN32) +ADD_EXECUTABLE(vector_bench vector_mhnsw_bench.cc) +TARGET_LINK_LIBRARIES(vector_bench sql sql_builtins) + IF(NOT WITH_WSREP) SET(EXCL_WSREP "wsrep_[a-np-z]*.h") ENDIF() diff --git a/sql/bloom_filters.h b/sql/bloom_filters.h index 55ae72d09231f..0bed6c56e51c5 100644 --- a/sql/bloom_filters.h +++ b/sql/bloom_filters.h @@ -29,35 +29,35 @@ SOFTWARE. #include #include -/* - Use gcc function multiversioning to optimize for a specific CPU with run-time - detection. Works only for x86, for other architectures we provide only one - implementation for now. -*/ -#define DEFAULT_IMPLEMENTATION -#if __GNUC__ > 7 && defined(__GLIBC__) -#ifdef __x86_64__ -#ifdef HAVE_IMMINTRIN_H -#include -#undef DEFAULT_IMPLEMENTATION -#define DEFAULT_IMPLEMENTATION __attribute__ ((target ("default"))) +#undef AVX2_IMPLEMENTATION +#undef AVX512_IMPLEMENTATION + +#if defined __GNUC__ && (defined __i386__||defined __x86_64__) #define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma"))) -#if __GNUC__ > 9 +#if __GNUC__ >= 5 #define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw"))) #endif -#endif -#endif +#elif defined _MSC_VER && (defined _M_X64 || defined _M_IX86) +#include +#define AVX2_IMPLEMENTATION +#define AVX512_IMPLEMENTATION +#endif + #ifdef __aarch64__ #include -#undef DEFAULT_IMPLEMENTATION #define NEON_IMPLEMENTATION #endif -#endif + #if defined __powerpc64__ && defined __VSX__ #include #define POWER_IMPLEMENTATION #endif +#if !defined(AVX2_IMPLEMENTATION) && !defined(NEON_IMPLEMENTATION) && !defined(POWER_IMPLEMENTATION) +#define DEFAULT_IMPLEMENTATION +#endif + + template struct PatternedSimdBloomFilter { diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index d640363b6e76a..69723c7fd2610 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -148,11 +148,15 @@ struct FVector { return (data_size - data_header)*2; } static const FVector *create(const MHNSW_Share *ctx, void *mem, const void *src); + static float dot_product(const int16_t *v1, const int16_t *v2, size_t len); + static void fix_tail(int16_t *dims, size_t vec_len); + static size_t alloc_size(size_t n); + static FVector *align_ptr(void *ptr); void postprocess(bool use_subdist, size_t vec_len) { int16_t *d= dims; - fix_tail(vec_len); + fix_tail(d,vec_len); if (use_subdist) { subabs2= scale * scale * dot_product(d, d, subdist_part) / 2; @@ -164,14 +168,49 @@ struct FVector abs2= subabs2 + scale * scale * dot_product(d, d, vec_len) / 2; } + + + float distance_to(const FVector *other, size_t vec_len) const + { + return abs2 + other->abs2 - scale * other->scale * + dot_product(dims, other->dims, vec_len); + } + + float distance_greater_than(const FVector *other, size_t vec_len, float than, + Stats *stats) const + { + float k = scale * other->scale; + float dp= dot_product(dims, other->dims, subdist_part); + float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len; + if (subdist > than) + return subdist; + dp+= dot_product(dims+subdist_part, other->dims+subdist_part, + vec_len - subdist_part); + float dist= abs2 + other->abs2 - k * dp; + stats->subdist.add(subdist/dist); + return dist; + } +}; +#pragma pack(pop) + +struct Vector_ops +{ + float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); + size_t (*alloc_size)(size_t n); + FVector * (*align_ptr)(void *ptr); + void (*fix_tail)(int16_t *dims, size_t vec_len); +}; + + #ifdef AVX2_IMPLEMENTATION /************* AVX2 *****************************************************/ static constexpr size_t AVX2_bytes= 256/8; static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t); static_assert(subdist_part % AVX2_dims == 0); + extern "C" AVX2_IMPLEMENTATION - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len) { typedef float v8f __attribute__((vector_size(AVX2_bytes))); union { v8f v; __m256 i; } tmp; @@ -186,17 +225,20 @@ struct FVector return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7]; } + extern "C" AVX2_IMPLEMENTATION - static size_t alloc_size(size_t n) - { return alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; } + size_t alloc_size_avx2(size_t n) + { return FVector::alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; } + extern "C" AVX2_IMPLEMENTATION - static FVector *align_ptr(void *ptr) - { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX2_bytes) - - alloc_header); } + FVector *align_ptr_avx2(void *ptr) + { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes) + - FVector::alloc_header); } + extern "C" AVX2_IMPLEMENTATION - void fix_tail(size_t vec_len) + void fix_tail_avx2(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2); } @@ -208,8 +250,9 @@ struct FVector static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t); static_assert(subdist_part % AVX512_dims == 0); + extern "C" AVX512_IMPLEMENTATION - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len) { __m512i *p1= (__m512i*)v1; __m512i *p2= (__m512i*)v2; @@ -219,17 +262,20 @@ struct FVector return _mm512_reduce_add_ps(d); } + extern "C" AVX512_IMPLEMENTATION - static size_t alloc_size(size_t n) - { return alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; } + size_t alloc_size_avx512(size_t n) + { return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; } + extern "C" AVX512_IMPLEMENTATION - static FVector *align_ptr(void *ptr) - { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX512_bytes) - - alloc_header); } + FVector *align_ptr_avx512(void *ptr) + { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes) + - FVector::alloc_header); } + extern "C" AVX512_IMPLEMENTATION - void fix_tail(size_t vec_len) + void fix_tail_avx512(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2); } @@ -250,7 +296,7 @@ struct FVector static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t); static_assert(subdist_part % NEON_dims == 0); - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + float dot_product_neon(const int16_t *v1, const int16_t *v2, size_t len) { int64_t d= 0; for (size_t i= 0; i < (len + NEON_dims - 1) / NEON_dims; i++) @@ -265,14 +311,14 @@ struct FVector return static_cast(d); } - static size_t alloc_size(size_t n) - { return alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; } + size_t alloc_size_neon(size_t n) + { return FVector::alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; } - static FVector *align_ptr(void *ptr) - { return (FVector*) (MY_ALIGN(((intptr) ptr) + alloc_header, NEON_bytes) - - alloc_header); } + FVector *align_ptr_neon(void *ptr) + { return (FVector*) (MY_ALIGN(((intptr) ptr) + FVector::alloc_header, NEON_bytes) + - FVector::alloc_header); } - void fix_tail(size_t vec_len) + void fix_tail_neon(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, NEON_dims) - vec_len) * 2); } @@ -284,7 +330,7 @@ struct FVector static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t); static_assert(subdist_part % POWER_dims == 0); - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + float dot_product_power(const int16_t *v1, const int16_t *v2, size_t len) { // Using vector long long for int64_t accumulation vector long long ll_sum= {0, 0}; @@ -314,28 +360,25 @@ struct FVector static_cast(ll_sum[1])); } - static size_t alloc_size(size_t n) + size_t alloc_size_power(size_t n) { - return alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1; + return FVector::alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1; } - static FVector *align_ptr(void *ptr) + FVector *align_ptr_power(void *ptr) { - return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, POWER_bytes) - - alloc_header); + return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, POWER_bytes) + - FVector::alloc_header); } - void fix_tail(size_t vec_len) + void fix_tail_power(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, POWER_dims) - vec_len) * 2); } -#undef DEFAULT_IMPLEMENTATION #endif - /************* no-SIMD default ******************************************/ -#ifdef DEFAULT_IMPLEMENTATION - DEFAULT_IMPLEMENTATION - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + /************* no-SIMD ******************************************/ + static float dot_product_default(const int16_t *v1, const int16_t *v2, size_t len) { int64_t d= 0; for (size_t i= 0; i < len; i++) @@ -343,38 +386,92 @@ struct FVector return static_cast(d); } - DEFAULT_IMPLEMENTATION - static size_t alloc_size(size_t n) { return alloc_header + n*2; } + size_t alloc_size_default(size_t n) { return FVector::alloc_header + n*2; } - DEFAULT_IMPLEMENTATION - static FVector *align_ptr(void *ptr) { return (FVector*)ptr; } + FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; } + void fix_tail_default(int16_t *dims, size_t) { } - DEFAULT_IMPLEMENTATION - void fix_tail(size_t) { } +#if defined __x86_64__ || defined _M_X64 +extern "C" Vector_ops vector_ops_x86_available(void); #endif - float distance_to(const FVector *other, size_t vec_len) const - { - return abs2 + other->abs2 - scale * other->scale * - dot_product(dims, other->dims, vec_len); - } +static Vector_ops choose_vector_ops_impl() +{ +#if defined __x86_64__ || defined _M_X64 + auto ops = vector_ops_x86_available(); + if (ops.dot_product) + return ops; +#elif defined __aarch64__ + return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon}; +#elif defined __powerpc64__ + return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power}; +#endif + return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default}; +} - float distance_greater_than(const FVector *other, size_t vec_len, float than, - Stats *stats) const - { - float k = scale * other->scale; - float dp= dot_product(dims, other->dims, subdist_part); - float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len; - if (subdist > than) - return subdist; - dp+= dot_product(dims+subdist_part, other->dims+subdist_part, - vec_len - subdist_part); - float dist= abs2 + other->abs2 - k * dp; - stats->subdist.add(subdist/dist); - return dist; - } -}; -#pragma pack(pop) +static const Vector_ops chosen_vector_ops= choose_vector_ops_impl(); + +float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len) +{ + return chosen_vector_ops.dot_product(v1, v2, len); +} +size_t FVector::alloc_size(size_t n) +{ + return chosen_vector_ops.alloc_size(n); +} +FVector *FVector::align_ptr(void *ptr) +{ + return chosen_vector_ops.align_ptr(ptr); +} +void FVector::fix_tail(int16_t *dims, size_t vec_len) +{ + chosen_vector_ops.fix_tail(dims, vec_len); +} + +/* + A simple benchmark to test the performance of the dot product function. +*/ +#include +#include + +void mhnsw_run_benchmark() +{ + const size_t vec_len= 1024; + const size_t iterations= 1000000; + + int16_t *vec1= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t)); + int16_t *vec2= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t)); + std::mt19937 rng(42); + std::uniform_int_distribution dist(-100, 100); + + for (size_t i= 0; i < vec_len; i++) vec1[i]= dist(rng); + for (size_t i= 0; i < vec_len; i++) vec2[i]= dist(rng); + + std::cout << "Warm up....\n"; + for (int i= 0; i < 1000; i++) + FVector::dot_product(vec1, vec2, vec_len); + + std::cout << "Running benchmark...\n"; + ulonglong start= my_timer_microseconds(); + + volatile float result= 0; + for (size_t i= 0; i < iterations; i++) + result+= FVector::dot_product(vec1, vec2, vec_len); + + ulonglong end= my_timer_microseconds(); + ulonglong duration= end - start; + + double per_call= duration / (double)iterations; + double total_time_sec= duration / 1e6; + + std::cout << "Total time in seconds: " << total_time_sec << "\n"; + std::cout << "Total time in microseconds: " << duration << "\n"; + std::cout << "Average time per call: " << per_call << "\n"; + std::cout << "Result: " << result << "\n"; + + free(vec1); + free(vec2); +} /* An array of pointers to graph nodes diff --git a/sql/vector_mhnsw_bench.cc b/sql/vector_mhnsw_bench.cc new file mode 100644 index 0000000000000..dc3f800a72a21 --- /dev/null +++ b/sql/vector_mhnsw_bench.cc @@ -0,0 +1,10 @@ +void mhnsw_run_benchmark(); +struct st_mysql_plugin; +st_mysql_plugin *mysql_mandatory_plugins[] = {nullptr}; +st_mysql_plugin *mysql_optional_plugins[] = {nullptr}; + +int main() +{ + mhnsw_run_benchmark(); + return 0; +} \ No newline at end of file diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc new file mode 100644 index 0000000000000..e45ff50952b04 --- /dev/null +++ b/sql/vector_mhnsw_x86.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2024, MariaDB plc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include +#include +#include + +#ifdef _MSC_VER +# include +#else +# include +#endif + +constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27; +constexpr uint32_t cpuid_ebx_AVX2= 1U << 5; +constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30; // AVX512F and AVX512BW + + +static uint32_t cpuid_ecx() +{ +#ifdef __GNUC__ + uint32_t eax, ebx, ecx, edx; + __cpuid(1, eax, ebx, ecx, edx); + return ecx; +#elif defined _MSC_VER + int regs[4]; + __cpuid(regs, 1); + return regs[2]; +#endif +} + +static uint32_t cpuid_ebx_7() +{ +#ifdef __GNUC__ + uint32_t eax, ebx, ecx, edx; + __cpuid_count(7, 0, eax, ebx, ecx, edx); + return ebx; +#elif defined _MSC_VER + int regs[4]; + __cpuidex(regs, 7, 0); + return regs[1]; +#endif +} + +struct FVector; +typedef struct Vector_ops +{ + float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); + size_t (*alloc_size)(size_t n); + FVector * (*align_ptr)(void *ptr); + void (*fix_tail)(int16_t *dims, size_t vec_len); +} Vector_ops; + +extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t); +extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t); +extern "C" size_t alloc_size_avx2(size_t); +extern "C" FVector *align_ptr_avx2(void*); +extern "C" void fix_tail_avx2(int16_t*, size_t); +extern "C" size_t alloc_size_avx512(size_t); +extern "C" FVector *align_ptr_avx512(void*); +extern "C" void fix_tail_avx512(int16_t*, size_t); + +extern "C" Vector_ops vector_ops_x86_available(void) +{ + const uint32_t ecx = cpuid_ecx(); + if (~ecx & cpuid_ecx_AVX_AND_XSAVE) + return {nullptr, nullptr, nullptr, nullptr}; + + const uint32_t ebx = cpuid_ebx_7(); + if ((ebx & cpuid_ebx_AVX512) == cpuid_ebx_AVX512) + return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; + + if (ebx & cpuid_ebx_AVX2) + return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; + + return {nullptr, nullptr, nullptr, nullptr}; +} From 4162e01f68dd1a70191b0ff8e2e7b6b9d8984cf2 Mon Sep 17 00:00:00 2001 From: hadeer Date: Thu, 5 Mar 2026 07:52:03 +0200 Subject: [PATCH 2/7] Vector Search: Add OS support verification Use xgetbv to check before enabling the right paths. --- sql/vector_mhnsw_x86.cc | 49 +++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc index e45ff50952b04..9d7c2e35c378e 100644 --- a/sql/vector_mhnsw_x86.cc +++ b/sql/vector_mhnsw_x86.cc @@ -25,35 +25,71 @@ constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27; constexpr uint32_t cpuid_ebx_AVX2= 1U << 5; -constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30; // AVX512F and AVX512BW - +constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30; static uint32_t cpuid_ecx() { #ifdef __GNUC__ - uint32_t eax, ebx, ecx, edx; + uint32_t eax=0, ebx=0, ecx=0, edx=0; __cpuid(1, eax, ebx, ecx, edx); return ecx; #elif defined _MSC_VER int regs[4]; __cpuid(regs, 1); return regs[2]; +#else +# error "unknown compiler" #endif } static uint32_t cpuid_ebx_7() { #ifdef __GNUC__ - uint32_t eax, ebx, ecx, edx; + uint32_t eax=0, ebx=0, ecx=0, edx=0; __cpuid_count(7, 0, eax, ebx, ecx, edx); return ebx; #elif defined _MSC_VER int regs[4]; __cpuidex(regs, 7, 0); return regs[1]; +#else +# error "unknown compiler" +#endif +} + + +static uint64_t xgetbv() { +#ifdef _MSC_VER + return _xgetbv(0); +#else +/* builtin xgetbv is only supported in clang 9+, so use inline assembly directly*/ + uint32_t eax, edx; + __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; #endif } +static bool os_have_avx2() +{ + return (xgetbv() & 0x06) == 0x06; +} +static bool os_have_avx512() +{ + return (xgetbv() & 0xe6) == 0xe6; +} +static bool cpu_has_avx2() +{ + uint32_t ebx7 = cpuid_ebx_7(); + return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2(); +} + +static bool cpu_has_avx512() +{ + uint32_t ebx7 = cpuid_ebx_7(); + return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512(); +} + + struct FVector; typedef struct Vector_ops { @@ -78,11 +114,10 @@ extern "C" Vector_ops vector_ops_x86_available(void) if (~ecx & cpuid_ecx_AVX_AND_XSAVE) return {nullptr, nullptr, nullptr, nullptr}; - const uint32_t ebx = cpuid_ebx_7(); - if ((ebx & cpuid_ebx_AVX512) == cpuid_ebx_AVX512) + if (cpu_has_avx512()) return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; - if (ebx & cpuid_ebx_AVX2) + if (cpu_has_avx2()) return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; return {nullptr, nullptr, nullptr, nullptr}; From e9ae69f1c3db074b1c4a470774912c9d5f6292f2 Mon Sep 17 00:00:00 2001 From: hadeer Date: Thu, 5 Mar 2026 11:10:53 +0200 Subject: [PATCH 3/7] mhnsw: compiler-independent choice using if/else to branch every call, but detection cpu level done once --- libmysqld/CMakeLists.txt | 3 + sql/vector_mhnsw.cc | 134 +++++++++++++++++++++++++++++++++------ sql/vector_mhnsw_x86.cc | 69 ++++++++++---------- 3 files changed, 153 insertions(+), 53 deletions(-) diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt index eb5e38427f7f6..4c56a40ca0fcd 100644 --- a/libmysqld/CMakeLists.txt +++ b/libmysqld/CMakeLists.txt @@ -170,6 +170,9 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc ${MYSYS_LIBWRAP_SOURCE} ) +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86") + LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc) +ENDIF() ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES}) DTRACE_INSTRUMENT(sql_embedded) diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index 69723c7fd2610..d6dfca8316cc5 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -193,13 +193,13 @@ struct FVector }; #pragma pack(pop) -struct Vector_ops -{ - float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); - size_t (*alloc_size)(size_t n); - FVector * (*align_ptr)(void *ptr); - void (*fix_tail)(int16_t *dims, size_t vec_len); -}; +// struct Vector_ops +// { +// float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); +// size_t (*alloc_size)(size_t n); +// FVector * (*align_ptr)(void *ptr); +// void (*fix_tail)(int16_t *dims, size_t vec_len); +// }; #ifdef AVX2_IMPLEMENTATION @@ -391,45 +391,137 @@ struct Vector_ops FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; } void fix_tail_default(int16_t *dims, size_t) { } + +/*******************/ #if defined __x86_64__ || defined _M_X64 -extern "C" Vector_ops vector_ops_x86_available(void); +// extern "C" Vector_ops vector_ops_x86_available(void); +extern "C" bool cpu_has_avx2(); +extern "C" bool cpu_has_avx512(); +extern "C" bool cpu_has_avx_and_xsave(); #endif -static Vector_ops choose_vector_ops_impl() +enum class CpuLevel { + AVX2, + AVX512, + NEON, + POWERPC, + DEFAULT +}; + +static CpuLevel detect_cpu_level() { #if defined __x86_64__ || defined _M_X64 - auto ops = vector_ops_x86_available(); - if (ops.dot_product) - return ops; + if (!cpu_has_avx_and_xsave()) + return CpuLevel::DEFAULT; + if (cpu_has_avx512()) + return CpuLevel::AVX512; + else if (cpu_has_avx2()) + return CpuLevel::AVX2; + else + return CpuLevel::DEFAULT; #elif defined __aarch64__ - return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon}; + return CpuLevel::NEON; #elif defined __powerpc64__ - return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power}; + return CpuLevel::POWERPC; +#else + return CpuLevel::DEFAULT; #endif - return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default}; } -static const Vector_ops chosen_vector_ops= choose_vector_ops_impl(); +static const CpuLevel cpu_level = detect_cpu_level(); + +// static Vector_ops choose_vector_ops_impl() +// { +// #if defined __x86_64__ || defined _M_X64 +// auto ops = vector_ops_x86_available(); +// if (ops.dot_product) +// return ops; +// #elif defined __aarch64__ +// return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon}; +// #elif defined __powerpc64__ +// return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power}; +// #endif +// return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default}; +// } + +// static const Vector_ops chosen_vector_ops= choose_vector_ops_impl(); float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len) { - return chosen_vector_ops.dot_product(v1, v2, len); +#ifdef AVX512_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX512) + return dot_product_avx512(v1, v2, len); +#elif AVX2_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX2) + return dot_product_avx2(v1, v2, len); +#elif NEON_IMPLEMENTATION + if (cpu_level == CpuLevel::NEON) + return dot_product_neon(v1, v2, len); +#elif POWER_IMPLEMENTATION + if (cpu_level == CpuLevel::POWERPC) + return dot_product_power(v1, v2, len); +#else + return dot_product_default(v1, v2, len); +#endif } size_t FVector::alloc_size(size_t n) { - return chosen_vector_ops.alloc_size(n); +#ifdef AVX512_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX512) + return alloc_size_avx512(n); +#elif AVX2_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX2) + return alloc_size_avx2(n); +#elif NEON_IMPLEMENTATION + if (cpu_level == CpuLevel::NEON) + return alloc_size_neon(n); +#elif POWER_IMPLEMENTATION + if (cpu_level == CpuLevel::POWERPC) + return alloc_size_power(n); +#else + return alloc_size_default(n); +#endif + } FVector *FVector::align_ptr(void *ptr) { - return chosen_vector_ops.align_ptr(ptr); +#ifdef AVX512_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX512) + return align_ptr_avx512(ptr); +#elif AVX2_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX2) + return align_ptr_avx2(ptr); +#elif NEON_IMPLEMENTATION + if (cpu_level == CpuLevel::NEON) + return align_ptr_neon(ptr); +#elif POWER_IMPLEMENTATION + if (cpu_level == CpuLevel::POWERPC) + return align_ptr_power(ptr); +#else + return align_ptr_default(ptr); +#endif } void FVector::fix_tail(int16_t *dims, size_t vec_len) { - chosen_vector_ops.fix_tail(dims, vec_len); +#ifdef AVX512_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX512) + return fix_tail_avx512(dims, vec_len); +#elif AVX2_IMPLEMENTATION + if (cpu_level == CpuLevel::AVX2) + return fix_tail_avx2(dims, vec_len); +#elif NEON_IMPLEMENTATION + if (cpu_level == CpuLevel::NEON) + return fix_tail_neon(dims, vec_len); +#elif POWER_IMPLEMENTATION + if (cpu_level == CpuLevel::POWERPC) + return fix_tail_power(dims, vec_len); +#else + return fix_tail_default(dims, vec_len); +#endif } /* - A simple benchmark to test the performance of the dot product function. + A temporary benchmark to test the performance of the dot product function. */ #include #include diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc index 9d7c2e35c378e..7c9c1f4863bc8 100644 --- a/sql/vector_mhnsw_x86.cc +++ b/sql/vector_mhnsw_x86.cc @@ -77,48 +77,53 @@ static bool os_have_avx512() { return (xgetbv() & 0xe6) == 0xe6; } -static bool cpu_has_avx2() + +extern "C" bool cpu_has_avx2() { uint32_t ebx7 = cpuid_ebx_7(); return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2(); } - -static bool cpu_has_avx512() +extern "C" bool cpu_has_avx512() { uint32_t ebx7 = cpuid_ebx_7(); return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512(); } +extern "C" bool cpu_has_avx_and_xsave() { + uint32_t ecx = cpuid_ecx(); + return (ecx & cpuid_ecx_AVX_AND_XSAVE) == cpuid_ecx_AVX_AND_XSAVE; +} -struct FVector; -typedef struct Vector_ops -{ - float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); - size_t (*alloc_size)(size_t n); - FVector * (*align_ptr)(void *ptr); - void (*fix_tail)(int16_t *dims, size_t vec_len); -} Vector_ops; - -extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t); -extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t); -extern "C" size_t alloc_size_avx2(size_t); -extern "C" FVector *align_ptr_avx2(void*); -extern "C" void fix_tail_avx2(int16_t*, size_t); -extern "C" size_t alloc_size_avx512(size_t); -extern "C" FVector *align_ptr_avx512(void*); -extern "C" void fix_tail_avx512(int16_t*, size_t); - -extern "C" Vector_ops vector_ops_x86_available(void) -{ - const uint32_t ecx = cpuid_ecx(); - if (~ecx & cpuid_ecx_AVX_AND_XSAVE) - return {nullptr, nullptr, nullptr, nullptr}; - if (cpu_has_avx512()) - return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; +// struct FVector; +// typedef struct Vector_ops +// { +// float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); +// size_t (*alloc_size)(size_t n); +// FVector * (*align_ptr)(void *ptr); +// void (*fix_tail)(int16_t *dims, size_t vec_len); +// } Vector_ops; - if (cpu_has_avx2()) - return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; +// extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t); +// extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t); +// extern "C" size_t alloc_size_avx2(size_t); +// extern "C" FVector *align_ptr_avx2(void*); +// extern "C" void fix_tail_avx2(int16_t*, size_t); +// extern "C" size_t alloc_size_avx512(size_t); +// extern "C" FVector *align_ptr_avx512(void*); +// extern "C" void fix_tail_avx512(int16_t*, size_t); - return {nullptr, nullptr, nullptr, nullptr}; -} +// extern "C" Vector_ops vector_ops_x86_available(void) +// { +// const uint32_t ecx = cpuid_ecx(); +// if (~ecx & cpuid_ecx_AVX_AND_XSAVE) +// return {nullptr, nullptr, nullptr, nullptr}; + +// if (cpu_has_avx512()) +// return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; + +// if (cpu_has_avx2()) +// return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; + +// return {nullptr, nullptr, nullptr, nullptr}; +// } \ No newline at end of file From 5c39550a2c6a8b073c774ed7984d0ddc4fa40a89 Mon Sep 17 00:00:00 2001 From: hadeer Date: Thu, 5 Mar 2026 13:13:31 +0200 Subject: [PATCH 4/7] Fix build errors: add missing returns and include --- sql/bloom_filters.h | 3 +++ sql/vector_mhnsw.cc | 52 +++++++++++++++++++++++++++------------------ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/sql/bloom_filters.h b/sql/bloom_filters.h index 0bed6c56e51c5..815140d516ca6 100644 --- a/sql/bloom_filters.h +++ b/sql/bloom_filters.h @@ -37,6 +37,9 @@ SOFTWARE. #if __GNUC__ >= 5 #define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw"))) #endif +#ifdef HAVE_IMMINTRIN_H +#include +#endif #elif defined _MSC_VER && (defined _M_X64 || defined _M_IX86) #include #define AVX2_IMPLEMENTATION diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index d6dfca8316cc5..fe3b165e3bf07 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -451,78 +451,87 @@ float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len) #ifdef AVX512_IMPLEMENTATION if (cpu_level == CpuLevel::AVX512) return dot_product_avx512(v1, v2, len); -#elif AVX2_IMPLEMENTATION +#endif +#ifdef AVX2_IMPLEMENTATION if (cpu_level == CpuLevel::AVX2) return dot_product_avx2(v1, v2, len); -#elif NEON_IMPLEMENTATION +#endif +#ifdef NEON_IMPLEMENTATION if (cpu_level == CpuLevel::NEON) return dot_product_neon(v1, v2, len); -#elif POWER_IMPLEMENTATION +#endif +#ifdef POWER_IMPLEMENTATION if (cpu_level == CpuLevel::POWERPC) return dot_product_power(v1, v2, len); -#else - return dot_product_default(v1, v2, len); #endif + return dot_product_default(v1, v2, len); } size_t FVector::alloc_size(size_t n) { #ifdef AVX512_IMPLEMENTATION if (cpu_level == CpuLevel::AVX512) return alloc_size_avx512(n); -#elif AVX2_IMPLEMENTATION +#endif +#ifdef AVX2_IMPLEMENTATION if (cpu_level == CpuLevel::AVX2) return alloc_size_avx2(n); -#elif NEON_IMPLEMENTATION +#endif +#ifdef NEON_IMPLEMENTATION if (cpu_level == CpuLevel::NEON) return alloc_size_neon(n); -#elif POWER_IMPLEMENTATION +#endif +#ifdef POWER_IMPLEMENTATION if (cpu_level == CpuLevel::POWERPC) return alloc_size_power(n); -#else - return alloc_size_default(n); #endif - + return alloc_size_default(n); } FVector *FVector::align_ptr(void *ptr) { #ifdef AVX512_IMPLEMENTATION if (cpu_level == CpuLevel::AVX512) return align_ptr_avx512(ptr); -#elif AVX2_IMPLEMENTATION +#endif +#ifdef AVX2_IMPLEMENTATION if (cpu_level == CpuLevel::AVX2) return align_ptr_avx2(ptr); -#elif NEON_IMPLEMENTATION +#endif +#ifdef NEON_IMPLEMENTATION if (cpu_level == CpuLevel::NEON) return align_ptr_neon(ptr); -#elif POWER_IMPLEMENTATION +#endif +#ifdef POWER_IMPLEMENTATION if (cpu_level == CpuLevel::POWERPC) return align_ptr_power(ptr); -#else - return align_ptr_default(ptr); #endif + return align_ptr_default(ptr); } + void FVector::fix_tail(int16_t *dims, size_t vec_len) { #ifdef AVX512_IMPLEMENTATION if (cpu_level == CpuLevel::AVX512) return fix_tail_avx512(dims, vec_len); -#elif AVX2_IMPLEMENTATION +#endif +#ifdef AVX2_IMPLEMENTATION if (cpu_level == CpuLevel::AVX2) return fix_tail_avx2(dims, vec_len); -#elif NEON_IMPLEMENTATION +#endif +#ifdef NEON_IMPLEMENTATION if (cpu_level == CpuLevel::NEON) return fix_tail_neon(dims, vec_len); -#elif POWER_IMPLEMENTATION +#endif +#ifdef POWER_IMPLEMENTATION if (cpu_level == CpuLevel::POWERPC) return fix_tail_power(dims, vec_len); -#else - return fix_tail_default(dims, vec_len); #endif + return fix_tail_default(dims, vec_len); } /* A temporary benchmark to test the performance of the dot product function. */ +#ifdef __x86_64__ #include #include @@ -564,6 +573,7 @@ void mhnsw_run_benchmark() free(vec1); free(vec2); } +#endif /* An array of pointers to graph nodes From 6d6becd48def1188d1ae5890a82adb8dd95ea2ac Mon Sep 17 00:00:00 2001 From: hadeer Date: Mon, 9 Mar 2026 13:00:39 +0200 Subject: [PATCH 5/7] MDEV-34804 mhnsw: compiler-independent choice of CPU-specific optimizations Replace GCC-specific __attribute__((target(...))) function multi-versioning with a function pointer dispatch mechanism for dot_product, alloc_size, align_ptr, and fix_tail. A Vector_ops struct holds function pointers for all four operations. At startup, choose_vector_ops_impl() probes CPU capabilities and selects the best available implementation (AVX-512, AVX2, NEON, POWER, or fallback). This approach works on GCC, Clang, MSVC, and musl libc, matching the pattern used in mysys/crc32/crc32c.cc. Other changes: - Replace GCC vector_size extension in dot_product_avx2 with portable AVX2 intrinsics (_mm256_add_ps, _mm256_extractf128_ps, etc.) - Decouple bloom_filters.h macros from vector dispatch macros. --- libmysqld/CMakeLists.txt | 2 +- sql/CMakeLists.txt | 3 - sql/bloom_filters.h | 35 +++--- sql/vector_mhnsw.cc | 251 ++++++++++---------------------------- sql/vector_mhnsw_bench.cc | 10 -- sql/vector_mhnsw_x86.cc | 61 ++++----- sql/vector_mhnsw_x86.h | 37 ++++++ 7 files changed, 143 insertions(+), 256 deletions(-) delete mode 100644 sql/vector_mhnsw_bench.cc create mode 100644 sql/vector_mhnsw_x86.h diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt index 4c56a40ca0fcd..cc9990b2dee64 100644 --- a/libmysqld/CMakeLists.txt +++ b/libmysqld/CMakeLists.txt @@ -171,7 +171,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc ) IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86") - LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc) + LIST(APPEND SQL_EMBEDDED_SOURCES ../sql/vector_mhnsw_x86.cc) ENDIF() ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES}) diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 51f1528506f8f..b2d61993cd9a0 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -557,9 +557,6 @@ IF(WIN32) TARGET_LINK_LIBRARIES(mariadb-upgrade-service mysys winservice) ENDIF(WIN32) -ADD_EXECUTABLE(vector_bench vector_mhnsw_bench.cc) -TARGET_LINK_LIBRARIES(vector_bench sql sql_builtins) - IF(NOT WITH_WSREP) SET(EXCL_WSREP "wsrep_[a-np-z]*.h") ENDIF() diff --git a/sql/bloom_filters.h b/sql/bloom_filters.h index 815140d516ca6..55ae72d09231f 100644 --- a/sql/bloom_filters.h +++ b/sql/bloom_filters.h @@ -29,38 +29,35 @@ SOFTWARE. #include #include -#undef AVX2_IMPLEMENTATION -#undef AVX512_IMPLEMENTATION - -#if defined __GNUC__ && (defined __i386__||defined __x86_64__) +/* + Use gcc function multiversioning to optimize for a specific CPU with run-time + detection. Works only for x86, for other architectures we provide only one + implementation for now. +*/ +#define DEFAULT_IMPLEMENTATION +#if __GNUC__ > 7 && defined(__GLIBC__) +#ifdef __x86_64__ +#ifdef HAVE_IMMINTRIN_H +#include +#undef DEFAULT_IMPLEMENTATION +#define DEFAULT_IMPLEMENTATION __attribute__ ((target ("default"))) #define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma"))) -#if __GNUC__ >= 5 +#if __GNUC__ > 9 #define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw"))) #endif -#ifdef HAVE_IMMINTRIN_H -#include #endif -#elif defined _MSC_VER && (defined _M_X64 || defined _M_IX86) -#include -#define AVX2_IMPLEMENTATION -#define AVX512_IMPLEMENTATION -#endif - +#endif #ifdef __aarch64__ #include +#undef DEFAULT_IMPLEMENTATION #define NEON_IMPLEMENTATION #endif - +#endif #if defined __powerpc64__ && defined __VSX__ #include #define POWER_IMPLEMENTATION #endif -#if !defined(AVX2_IMPLEMENTATION) && !defined(NEON_IMPLEMENTATION) && !defined(POWER_IMPLEMENTATION) -#define DEFAULT_IMPLEMENTATION -#endif - - template struct PatternedSimdBloomFilter { diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index fe3b165e3bf07..86ae5a68c5a96 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -23,6 +23,32 @@ #include #include #include "bloom_filters.h" +#include "vector_mhnsw_x86.h" + +/* + Independent SIMD macros for mhnsw vector operations. + These are separate from bloom_filters.h macros, which + the bloom filter continues to use for its own purposes. +*/ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +# ifdef HAVE_IMMINTRIN_H +# include +# define MHNSW_AVX2 __attribute__((target("avx2,avx,fma"))) +# if __GNUC__ >= 5 +# define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw"))) +# endif +# endif +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) +# include +# define MHNSW_AVX2 +# define MHNSW_AVX512 +#elif defined(__aarch64__) || defined(_M_ARM64) +# include +# define MHNSW_HAVE_NEON +#elif defined(__powerpc64__) && defined(__VSX__) +# include +# define MHNSW_HAVE_POWER +#endif // distance can be a little bit < 0 because of fast math static constexpr float NEAREST = -1.0f; @@ -193,65 +219,57 @@ struct FVector }; #pragma pack(pop) -// struct Vector_ops -// { -// float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); -// size_t (*alloc_size)(size_t n); -// FVector * (*align_ptr)(void *ptr); -// void (*fix_tail)(int16_t *dims, size_t vec_len); -// }; - -#ifdef AVX2_IMPLEMENTATION +#ifdef MHNSW_AVX2 /************* AVX2 *****************************************************/ static constexpr size_t AVX2_bytes= 256/8; static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t); static_assert(subdist_part % AVX2_dims == 0); extern "C" - AVX2_IMPLEMENTATION + MHNSW_AVX2 float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len) { - typedef float v8f __attribute__((vector_size(AVX2_bytes))); - union { v8f v; __m256 i; } tmp; __m256i *p1= (__m256i*)v1; __m256i *p2= (__m256i*)v2; - v8f d= {0}; + __m256 d= _mm256_setzero_ps(); for (size_t i= 0; i < (len + AVX2_dims-1)/AVX2_dims; p1++, p2++, i++) - { - tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2)); - d+= tmp.v; - } - return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7]; + d= _mm256_add_ps(d, _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2))); + __m128 hi= _mm256_extractf128_ps(d, 1); + __m128 lo= _mm256_castps256_ps128(d); + __m128 sum= _mm_add_ps(lo, hi); + sum= _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum= _mm_add_ss(sum, _mm_movehdup_ps(sum)); + return _mm_cvtss_f32(sum); } extern "C" - AVX2_IMPLEMENTATION + MHNSW_AVX2 size_t alloc_size_avx2(size_t n) { return FVector::alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; } extern "C" - AVX2_IMPLEMENTATION + MHNSW_AVX2 FVector *align_ptr_avx2(void *ptr) { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes) - FVector::alloc_header); } extern "C" - AVX2_IMPLEMENTATION + MHNSW_AVX2 void fix_tail_avx2(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2); } #endif -#ifdef AVX512_IMPLEMENTATION +#ifdef MHNSW_AVX512 /************* AVX512 ****************************************************/ static constexpr size_t AVX512_bytes= 512/8; static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t); static_assert(subdist_part % AVX512_dims == 0); extern "C" - AVX512_IMPLEMENTATION + MHNSW_AVX512 float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len) { __m512i *p1= (__m512i*)v1; @@ -263,18 +281,18 @@ struct FVector } extern "C" - AVX512_IMPLEMENTATION + MHNSW_AVX512 size_t alloc_size_avx512(size_t n) { return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; } extern "C" - AVX512_IMPLEMENTATION + MHNSW_AVX512 FVector *align_ptr_avx512(void *ptr) { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes) - FVector::alloc_header); } extern "C" - AVX512_IMPLEMENTATION + MHNSW_AVX512 void fix_tail_avx512(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2); @@ -291,7 +309,7 @@ struct FVector vmull+vmlal2_high implementations. */ -#ifdef NEON_IMPLEMENTATION +#ifdef MHNSW_HAVE_NEON static constexpr size_t NEON_bytes= 128 / 8; static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t); static_assert(subdist_part % NEON_dims == 0); @@ -324,7 +342,7 @@ struct FVector } #endif -#ifdef POWER_IMPLEMENTATION +#ifdef MHNSW_HAVE_POWER /************* POWERPC *****************************************************/ static constexpr size_t POWER_bytes= 128 / 8; // Assume 128-bit vector width static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t); @@ -392,188 +410,45 @@ struct FVector void fix_tail_default(int16_t *dims, size_t) { } -/*******************/ -#if defined __x86_64__ || defined _M_X64 -// extern "C" Vector_ops vector_ops_x86_available(void); -extern "C" bool cpu_has_avx2(); -extern "C" bool cpu_has_avx512(); -extern "C" bool cpu_has_avx_and_xsave(); -#endif +/*******************************CPU Dispatching*******************************/ -enum class CpuLevel -{ - AVX2, - AVX512, - NEON, - POWERPC, - DEFAULT -}; -static CpuLevel detect_cpu_level() { -#if defined __x86_64__ || defined _M_X64 - if (!cpu_has_avx_and_xsave()) - return CpuLevel::DEFAULT; - if (cpu_has_avx512()) - return CpuLevel::AVX512; - else if (cpu_has_avx2()) - return CpuLevel::AVX2; - else - return CpuLevel::DEFAULT; + +static Vector_ops choose_vector_ops_impl() +{ +#if defined __x86_64__ || defined _M_X64 + auto ops = vector_ops_x86_available(); + if (ops.dot_product) + return ops; #elif defined __aarch64__ - return CpuLevel::NEON; + return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon}; #elif defined __powerpc64__ - return CpuLevel::POWERPC; -#else - return CpuLevel::DEFAULT; + return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power}; #endif + return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default}; } -static const CpuLevel cpu_level = detect_cpu_level(); - -// static Vector_ops choose_vector_ops_impl() -// { -// #if defined __x86_64__ || defined _M_X64 -// auto ops = vector_ops_x86_available(); -// if (ops.dot_product) -// return ops; -// #elif defined __aarch64__ -// return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon}; -// #elif defined __powerpc64__ -// return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power}; -// #endif -// return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default}; -// } - -// static const Vector_ops chosen_vector_ops= choose_vector_ops_impl(); +static const Vector_ops chosen_vector_ops= choose_vector_ops_impl(); float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len) { -#ifdef AVX512_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX512) - return dot_product_avx512(v1, v2, len); -#endif -#ifdef AVX2_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX2) - return dot_product_avx2(v1, v2, len); -#endif -#ifdef NEON_IMPLEMENTATION - if (cpu_level == CpuLevel::NEON) - return dot_product_neon(v1, v2, len); -#endif -#ifdef POWER_IMPLEMENTATION - if (cpu_level == CpuLevel::POWERPC) - return dot_product_power(v1, v2, len); -#endif - return dot_product_default(v1, v2, len); + return chosen_vector_ops.dot_product(v1, v2, len); } size_t FVector::alloc_size(size_t n) { -#ifdef AVX512_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX512) - return alloc_size_avx512(n); -#endif -#ifdef AVX2_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX2) - return alloc_size_avx2(n); -#endif -#ifdef NEON_IMPLEMENTATION - if (cpu_level == CpuLevel::NEON) - return alloc_size_neon(n); -#endif -#ifdef POWER_IMPLEMENTATION - if (cpu_level == CpuLevel::POWERPC) - return alloc_size_power(n); -#endif - return alloc_size_default(n); + return chosen_vector_ops.alloc_size(n); } FVector *FVector::align_ptr(void *ptr) { -#ifdef AVX512_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX512) - return align_ptr_avx512(ptr); -#endif -#ifdef AVX2_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX2) - return align_ptr_avx2(ptr); -#endif -#ifdef NEON_IMPLEMENTATION - if (cpu_level == CpuLevel::NEON) - return align_ptr_neon(ptr); -#endif -#ifdef POWER_IMPLEMENTATION - if (cpu_level == CpuLevel::POWERPC) - return align_ptr_power(ptr); -#endif - return align_ptr_default(ptr); + return chosen_vector_ops.align_ptr(ptr); } void FVector::fix_tail(int16_t *dims, size_t vec_len) { -#ifdef AVX512_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX512) - return fix_tail_avx512(dims, vec_len); -#endif -#ifdef AVX2_IMPLEMENTATION - if (cpu_level == CpuLevel::AVX2) - return fix_tail_avx2(dims, vec_len); -#endif -#ifdef NEON_IMPLEMENTATION - if (cpu_level == CpuLevel::NEON) - return fix_tail_neon(dims, vec_len); -#endif -#ifdef POWER_IMPLEMENTATION - if (cpu_level == CpuLevel::POWERPC) - return fix_tail_power(dims, vec_len); -#endif - return fix_tail_default(dims, vec_len); + chosen_vector_ops.fix_tail(dims, vec_len); } -/* - A temporary benchmark to test the performance of the dot product function. -*/ -#ifdef __x86_64__ -#include -#include - -void mhnsw_run_benchmark() -{ - const size_t vec_len= 1024; - const size_t iterations= 1000000; - - int16_t *vec1= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t)); - int16_t *vec2= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t)); - std::mt19937 rng(42); - std::uniform_int_distribution dist(-100, 100); - - for (size_t i= 0; i < vec_len; i++) vec1[i]= dist(rng); - for (size_t i= 0; i < vec_len; i++) vec2[i]= dist(rng); - - std::cout << "Warm up....\n"; - for (int i= 0; i < 1000; i++) - FVector::dot_product(vec1, vec2, vec_len); - - std::cout << "Running benchmark...\n"; - ulonglong start= my_timer_microseconds(); - - volatile float result= 0; - for (size_t i= 0; i < iterations; i++) - result+= FVector::dot_product(vec1, vec2, vec_len); - - ulonglong end= my_timer_microseconds(); - ulonglong duration= end - start; - - double per_call= duration / (double)iterations; - double total_time_sec= duration / 1e6; - - std::cout << "Total time in seconds: " << total_time_sec << "\n"; - std::cout << "Total time in microseconds: " << duration << "\n"; - std::cout << "Average time per call: " << per_call << "\n"; - std::cout << "Result: " << result << "\n"; - - free(vec1); - free(vec2); -} -#endif +/**************************************************************/ /* An array of pointers to graph nodes diff --git a/sql/vector_mhnsw_bench.cc b/sql/vector_mhnsw_bench.cc deleted file mode 100644 index dc3f800a72a21..0000000000000 --- a/sql/vector_mhnsw_bench.cc +++ /dev/null @@ -1,10 +0,0 @@ -void mhnsw_run_benchmark(); -struct st_mysql_plugin; -st_mysql_plugin *mysql_mandatory_plugins[] = {nullptr}; -st_mysql_plugin *mysql_optional_plugins[] = {nullptr}; - -int main() -{ - mhnsw_run_benchmark(); - return 0; -} \ No newline at end of file diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc index 7c9c1f4863bc8..97080cc1b6995 100644 --- a/sql/vector_mhnsw_x86.cc +++ b/sql/vector_mhnsw_x86.cc @@ -14,8 +14,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ #include -#include -#include +#include "vector_mhnsw_x86.h" #ifdef _MSC_VER # include @@ -69,61 +68,53 @@ static uint64_t xgetbv() { #endif } -static bool os_have_avx2() +bool os_have_avx2() { return (xgetbv() & 0x06) == 0x06; } -static bool os_have_avx512() +bool os_have_avx512() { return (xgetbv() & 0xe6) == 0xe6; } -extern "C" bool cpu_has_avx2() +bool cpu_has_avx2() { uint32_t ebx7 = cpuid_ebx_7(); return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2(); } -extern "C" bool cpu_has_avx512() +bool cpu_has_avx512() { uint32_t ebx7 = cpuid_ebx_7(); return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512(); } -extern "C" bool cpu_has_avx_and_xsave() { +bool cpu_has_avx_and_xsave() { uint32_t ecx = cpuid_ecx(); return (ecx & cpuid_ecx_AVX_AND_XSAVE) == cpuid_ecx_AVX_AND_XSAVE; } -// struct FVector; -// typedef struct Vector_ops -// { -// float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); -// size_t (*alloc_size)(size_t n); -// FVector * (*align_ptr)(void *ptr); -// void (*fix_tail)(int16_t *dims, size_t vec_len); -// } Vector_ops; +struct FVector; +extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t); +extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t); +extern "C" size_t alloc_size_avx2(size_t); +extern "C" FVector *align_ptr_avx2(void*); +extern "C" void fix_tail_avx2(int16_t*, size_t); +extern "C" size_t alloc_size_avx512(size_t); +extern "C" FVector *align_ptr_avx512(void*); +extern "C" void fix_tail_avx512(int16_t*, size_t); -// extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t); -// extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t); -// extern "C" size_t alloc_size_avx2(size_t); -// extern "C" FVector *align_ptr_avx2(void*); -// extern "C" void fix_tail_avx2(int16_t*, size_t); -// extern "C" size_t alloc_size_avx512(size_t); -// extern "C" FVector *align_ptr_avx512(void*); -// extern "C" void fix_tail_avx512(int16_t*, size_t); - -// extern "C" Vector_ops vector_ops_x86_available(void) -// { -// const uint32_t ecx = cpuid_ecx(); -// if (~ecx & cpuid_ecx_AVX_AND_XSAVE) -// return {nullptr, nullptr, nullptr, nullptr}; +Vector_ops vector_ops_x86_available(void) +{ + const uint32_t ecx = cpuid_ecx(); + if (~ecx & cpuid_ecx_AVX_AND_XSAVE) + return {nullptr, nullptr, nullptr, nullptr}; -// if (cpu_has_avx512()) -// return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; + if (cpu_has_avx512()) + return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; -// if (cpu_has_avx2()) -// return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; + if (cpu_has_avx2()) + return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; -// return {nullptr, nullptr, nullptr, nullptr}; -// } \ No newline at end of file + return {nullptr, nullptr, nullptr, nullptr}; +} \ No newline at end of file diff --git a/sql/vector_mhnsw_x86.h b/sql/vector_mhnsw_x86.h new file mode 100644 index 0000000000000..841fa8e3c07aa --- /dev/null +++ b/sql/vector_mhnsw_x86.h @@ -0,0 +1,37 @@ +/* + Copyright (c) 2024, MariaDB plc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA +*/ + + +#pragma once +#include +#include + +/* + Function pointer table for SIMD-accelerated vector operations. + Shared between vector_mhnsw.cc and vector_mhnsw_x86.cc. +*/ +struct FVector; +struct Vector_ops +{ + float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); + size_t (*alloc_size)(size_t n); + FVector * (*align_ptr)(void *ptr); + void (*fix_tail)(int16_t *dims, size_t vec_len); +}; +#if defined(__x86_64__) || defined(_M_X64) +extern "C" Vector_ops vector_ops_x86_available(void); +#endif \ No newline at end of file From 2b36fe037bf756c146a0706eb4c434a28f0e4330 Mon Sep 17 00:00:00 2001 From: hadeer Date: Mon, 9 Mar 2026 14:19:28 +0200 Subject: [PATCH 6/7] MDEV-34804 fix Windows build: add AMD64 to CMake processor check --- sql/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index b2d61993cd9a0..4cfdcf40660de 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -207,7 +207,7 @@ SET (SQL_SOURCE ${MYSYS_LIBWRAP_SOURCE} ) -IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86") +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86") LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc) ENDIF() From 6bc5027e00391639da93337ef83b079ae21e314c Mon Sep 17 00:00:00 2001 From: hadeer Date: Mon, 9 Mar 2026 14:45:04 +0200 Subject: [PATCH 7/7] MDEV-34804 fix clang build: include __clang__ in AVX512 check --- sql/vector_mhnsw.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index 86ae5a68c5a96..cb4ebf4572301 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -34,7 +34,7 @@ # ifdef HAVE_IMMINTRIN_H # include # define MHNSW_AVX2 __attribute__((target("avx2,avx,fma"))) -# if __GNUC__ >= 5 +# if __GNUC__ >= 5 || defined(__clang__) # define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw"))) # endif # endif