From a04bbb7330844510be2fbb23dc9d5065cc60a60f Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Fri, 20 Feb 2026 04:46:05 +0200
Subject: [PATCH 1/7] mhnsw: compiler-independent choice of CPU-specific
 optimizations

Replace GCC function multiversioning (__attribute__((target(...)))) with
runtime CPU detection using function pointers. At startup, cpuid selects
the optimal SIMD implementation (AVX512, AVX2) via a Vector_ops struct.

Move x86 CPU detection into a separate vector_mhnsw_x86.cc, compiled
with appropriate -mavx2/-mavx512 flags.

This makes the dispatching work with musl libc and MSVC, which do not
support GCC function multiversioning.

Benchmark (1M iterations, 1024 dimensions, AVX512):
   0.039602 s total, 0.039602 us per call
---
 sql/CMakeLists.txt        |   7 ++
 sql/bloom_filters.h       |  34 +++---
 sql/vector_mhnsw.cc       | 219 +++++++++++++++++++++++++++-----------
 sql/vector_mhnsw_bench.cc |  10 ++
 sql/vector_mhnsw_x86.cc   |  89 ++++++++++++++++
 5 files changed, 281 insertions(+), 78 deletions(-)
 create mode 100644 sql/vector_mhnsw_bench.cc
 create mode 100644 sql/vector_mhnsw_x86.cc
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 32af809849be1..51f1528506f8f 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -207,6 +207,10 @@ SET (SQL_SOURCE
                ${MYSYS_LIBWRAP_SOURCE}
 )
 
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86")
+  LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc)
+ENDIF()
+
 MY_CHECK_CXX_COMPILER_FLAG(-Wno-unused-but-set-variable)
 IF(have_CXX__Wno_unused_but_set_variable)
   IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64"
@@ -553,6 +557,9 @@ IF(WIN32)
   TARGET_LINK_LIBRARIES(mariadb-upgrade-service mysys winservice)
 ENDIF(WIN32)
 
+ADD_EXECUTABLE(vector_bench vector_mhnsw_bench.cc)
+TARGET_LINK_LIBRARIES(vector_bench sql sql_builtins)
+
 IF(NOT WITH_WSREP)
   SET(EXCL_WSREP "wsrep_[a-np-z]*.h")
 ENDIF()
diff --git a/sql/bloom_filters.h b/sql/bloom_filters.h
index 55ae72d09231f..0bed6c56e51c5 100644
--- a/sql/bloom_filters.h
+++ b/sql/bloom_filters.h
@@ -29,35 +29,35 @@ SOFTWARE.
 #include <vector>
 #include <algorithm>
 
-/*
-  Use gcc function multiversioning to optimize for a specific CPU with run-time
-  detection. Works only for x86, for other architectures we provide only one
-  implementation for now.
-*/
-#define DEFAULT_IMPLEMENTATION
-#if __GNUC__ > 7 && defined(__GLIBC__)
-#ifdef __x86_64__
-#ifdef HAVE_IMMINTRIN_H
-#include <immintrin.h>
-#undef DEFAULT_IMPLEMENTATION
-#define DEFAULT_IMPLEMENTATION    __attribute__ ((target ("default")))
+#undef AVX2_IMPLEMENTATION
+#undef AVX512_IMPLEMENTATION
+
+#if defined __GNUC__ && (defined __i386__||defined __x86_64__)
 #define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma")))
-#if __GNUC__ > 9
+#if __GNUC__ >= 5 
 #define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw")))
 #endif
-#endif
-#endif
+#elif defined _MSC_VER && (defined _M_X64 || defined _M_IX86)
+#include <immintrin.h>
+#define AVX2_IMPLEMENTATION
+#define AVX512_IMPLEMENTATION
+#endif 
+
 #ifdef __aarch64__
 #include <arm_neon.h>
-#undef DEFAULT_IMPLEMENTATION
 #define NEON_IMPLEMENTATION
 #endif
-#endif
+
 #if defined __powerpc64__ && defined __VSX__
 #include <altivec.h>
 #define POWER_IMPLEMENTATION
 #endif
 
+#if !defined(AVX2_IMPLEMENTATION) && !defined(NEON_IMPLEMENTATION) && !defined(POWER_IMPLEMENTATION)
+#define DEFAULT_IMPLEMENTATION
+#endif
+
+
 template <typename T>
 struct PatternedSimdBloomFilter
 {
diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc
index d640363b6e76a..69723c7fd2610 100644
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@@ -148,11 +148,15 @@ struct FVector
   { return (data_size - data_header)*2; }
 
   static const FVector *create(const MHNSW_Share *ctx, void *mem, const void *src);
+  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len);
+  static void fix_tail(int16_t *dims, size_t vec_len);
+  static size_t alloc_size(size_t n);
+  static FVector *align_ptr(void *ptr);
 
   void postprocess(bool use_subdist, size_t vec_len)
   {
     int16_t *d= dims;
-    fix_tail(vec_len);
+    fix_tail(d,vec_len);
     if (use_subdist)
     {
       subabs2= scale * scale * dot_product(d, d, subdist_part) / 2;
@@ -164,14 +168,49 @@ struct FVector
     abs2= subabs2 + scale * scale * dot_product(d, d, vec_len) / 2;
   }
 
+
+
+  float distance_to(const FVector *other, size_t vec_len) const
+  {
+    return abs2 + other->abs2 - scale * other->scale *
+           dot_product(dims, other->dims, vec_len);
+  }
+
+  float distance_greater_than(const FVector *other, size_t vec_len, float than,
+                              Stats *stats) const
+  {
+    float k = scale * other->scale;
+    float dp= dot_product(dims, other->dims, subdist_part);
+    float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len;
+    if (subdist > than)
+      return subdist;
+    dp+= dot_product(dims+subdist_part, other->dims+subdist_part,
+                     vec_len - subdist_part);
+    float dist= abs2 + other->abs2 - k * dp;
+    stats->subdist.add(subdist/dist);
+    return dist;
+  }
+};
+#pragma pack(pop)
+
+struct Vector_ops 
+{
+  float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
+  size_t (*alloc_size)(size_t n);
+  FVector * (*align_ptr)(void *ptr);
+  void (*fix_tail)(int16_t *dims, size_t vec_len);
+};
+
+
 #ifdef AVX2_IMPLEMENTATION
   /************* AVX2 *****************************************************/
   static constexpr size_t AVX2_bytes= 256/8;
   static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t);
   static_assert(subdist_part % AVX2_dims == 0);
 
+  extern "C"
   AVX2_IMPLEMENTATION
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len)
   {
     typedef float v8f __attribute__((vector_size(AVX2_bytes)));
     union { v8f v; __m256 i; } tmp;
@@ -186,17 +225,20 @@ struct FVector
     return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
   }
 
+  extern "C"
   AVX2_IMPLEMENTATION
-  static size_t alloc_size(size_t n)
-  { return alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; }
+  size_t alloc_size_avx2(size_t n)
+  { return FVector::alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; }
 
+  extern "C"
   AVX2_IMPLEMENTATION
-  static FVector *align_ptr(void *ptr)
-  { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX2_bytes)
-                      - alloc_header); }
+  FVector *align_ptr_avx2(void *ptr)
+  { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes)
+                      - FVector::alloc_header); }
 
+  extern "C"
   AVX2_IMPLEMENTATION
-  void fix_tail(size_t vec_len)
+  void fix_tail_avx2(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2);
   }
@@ -208,8 +250,9 @@ struct FVector
   static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t);
   static_assert(subdist_part % AVX512_dims == 0);
 
+  extern "C"
   AVX512_IMPLEMENTATION
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len)
   {
     __m512i *p1= (__m512i*)v1;
     __m512i *p2= (__m512i*)v2;
@@ -219,17 +262,20 @@ struct FVector
     return _mm512_reduce_add_ps(d);
   }
 
+  extern "C"
   AVX512_IMPLEMENTATION
-  static size_t alloc_size(size_t n)
-  { return alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; }
+  size_t alloc_size_avx512(size_t n)
+  { return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; }
 
+  extern "C"
   AVX512_IMPLEMENTATION
-  static FVector *align_ptr(void *ptr)
-  { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX512_bytes)
-                      - alloc_header); }
+  FVector *align_ptr_avx512(void *ptr)
+  { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes)
+                      - FVector::alloc_header); }
 
+  extern "C"
   AVX512_IMPLEMENTATION
-  void fix_tail(size_t vec_len)
+  void fix_tail_avx512(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2);
   }
@@ -250,7 +296,7 @@ struct FVector
   static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t);
   static_assert(subdist_part % NEON_dims == 0);
 
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  float dot_product_neon(const int16_t *v1, const int16_t *v2, size_t len)
   {
     int64_t d= 0;
     for (size_t i= 0; i < (len + NEON_dims - 1) / NEON_dims; i++)
@@ -265,14 +311,14 @@ struct FVector
     return static_cast<float>(d);
   }
 
-  static size_t alloc_size(size_t n)
-  { return alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; }
+  size_t alloc_size_neon(size_t n)
+  { return FVector::alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; }
 
-  static FVector *align_ptr(void *ptr)
-  { return (FVector*) (MY_ALIGN(((intptr) ptr) + alloc_header, NEON_bytes)
-                       - alloc_header); }
+  FVector *align_ptr_neon(void *ptr)
+  { return (FVector*) (MY_ALIGN(((intptr) ptr) + FVector::alloc_header, NEON_bytes)
+                       - FVector::alloc_header); }
 
-  void fix_tail(size_t vec_len)
+  void fix_tail_neon(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, NEON_dims) - vec_len) * 2);
   }
@@ -284,7 +330,7 @@ struct FVector
   static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t);
   static_assert(subdist_part % POWER_dims == 0);
 
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  float dot_product_power(const int16_t *v1, const int16_t *v2, size_t len)
   {
     // Using vector long long for int64_t accumulation
     vector long long ll_sum= {0, 0};
@@ -314,28 +360,25 @@ struct FVector
                               static_cast<int64_t>(ll_sum[1]));
   }
 
-  static size_t alloc_size(size_t n)
+  size_t alloc_size_power(size_t n)
   {
-    return alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
+    return FVector::alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
   }
 
-  static FVector *align_ptr(void *ptr)
+  FVector *align_ptr_power(void *ptr)
   {
-    return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, POWER_bytes)
-                    - alloc_header);
+    return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, POWER_bytes)
+                    - FVector::alloc_header);
   }
 
-  void fix_tail(size_t vec_len)
+  void fix_tail_power(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, POWER_dims) - vec_len) * 2);
   }
-#undef DEFAULT_IMPLEMENTATION
 #endif
 
-  /************* no-SIMD default ******************************************/
-#ifdef DEFAULT_IMPLEMENTATION
-  DEFAULT_IMPLEMENTATION
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  /************* no-SIMD ******************************************/
+  static float dot_product_default(const int16_t *v1, const int16_t *v2, size_t len)
   {
     int64_t d= 0;
     for (size_t i= 0; i < len; i++)
@@ -343,38 +386,92 @@ struct FVector
     return static_cast<float>(d);
   }
 
-  DEFAULT_IMPLEMENTATION
-  static size_t alloc_size(size_t n) { return alloc_header + n*2; }
+  size_t alloc_size_default(size_t n) { return FVector::alloc_header + n*2; }
 
-  DEFAULT_IMPLEMENTATION
-  static FVector *align_ptr(void *ptr) { return (FVector*)ptr; }
+  FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; }
+  void fix_tail_default(int16_t *dims, size_t) { }
 
-  DEFAULT_IMPLEMENTATION
-  void fix_tail(size_t) { }
+#if defined __x86_64__ || defined _M_X64
+extern "C" Vector_ops vector_ops_x86_available(void);
 #endif
 
-  float distance_to(const FVector *other, size_t vec_len) const
-  {
-    return abs2 + other->abs2 - scale * other->scale *
-           dot_product(dims, other->dims, vec_len);
-  }
+static Vector_ops choose_vector_ops_impl()
+{
+#if defined __x86_64__ || defined _M_X64
+  auto ops = vector_ops_x86_available();
+  if (ops.dot_product)
+    return ops;
+#elif defined __aarch64__
+  return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
+#elif defined __powerpc64__
+  return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
+#endif
+  return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
+}
 
-  float distance_greater_than(const FVector *other, size_t vec_len, float than,
-                              Stats *stats) const
-  {
-    float k = scale * other->scale;
-    float dp= dot_product(dims, other->dims, subdist_part);
-    float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len;
-    if (subdist > than)
-      return subdist;
-    dp+= dot_product(dims+subdist_part, other->dims+subdist_part,
-                     vec_len - subdist_part);
-    float dist= abs2 + other->abs2 - k * dp;
-    stats->subdist.add(subdist/dist);
-    return dist;
-  }
-};
-#pragma pack(pop)
+static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();
+
+float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+{
+  return chosen_vector_ops.dot_product(v1, v2, len);
+}
+size_t FVector::alloc_size(size_t n)
+{
+  return chosen_vector_ops.alloc_size(n);
+}
+FVector *FVector::align_ptr(void *ptr)
+{
+  return chosen_vector_ops.align_ptr(ptr);
+}
+void FVector::fix_tail(int16_t *dims, size_t vec_len)
+{
+  chosen_vector_ops.fix_tail(dims, vec_len);
+}
+
+/*
+  A simple benchmark to test the performance of the dot product function.
+*/
+#include <iostream>
+#include <random>
+
+void mhnsw_run_benchmark()
+{
+  const size_t vec_len= 1024;
+  const size_t iterations= 1000000;
+
+  int16_t *vec1= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t));
+  int16_t *vec2= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t));
+  std::mt19937 rng(42);
+  std::uniform_int_distribution<int16_t> dist(-100, 100);
+
+  for (size_t i= 0; i < vec_len; i++) vec1[i]= dist(rng);
+  for (size_t i= 0; i < vec_len; i++) vec2[i]= dist(rng);
+
+  std::cout << "Warm up....\n";
+  for (int i= 0; i < 1000; i++)
+    FVector::dot_product(vec1, vec2, vec_len);
+
+  std::cout << "Running benchmark...\n";
+  ulonglong start= my_timer_microseconds();
+
+  volatile float result= 0;
+  for (size_t i= 0; i < iterations; i++)
+    result+= FVector::dot_product(vec1, vec2, vec_len);
+
+  ulonglong end= my_timer_microseconds();
+  ulonglong duration= end - start;
+
+  double per_call= duration / (double)iterations;
+  double total_time_sec= duration / 1e6;
+
+  std::cout << "Total time in seconds: " << total_time_sec << "\n";
+  std::cout << "Total time in microseconds: " << duration << "\n";
+  std::cout << "Average time per call: " << per_call << "\n";
+  std::cout << "Result: " << result << "\n";
+
+  free(vec1);
+  free(vec2);
+}
 
 /*
   An array of pointers to graph nodes
diff --git a/sql/vector_mhnsw_bench.cc b/sql/vector_mhnsw_bench.cc
new file mode 100644
index 0000000000000..dc3f800a72a21
--- /dev/null
+++ b/sql/vector_mhnsw_bench.cc
@@ -0,0 +1,10 @@
+void mhnsw_run_benchmark();
+struct st_mysql_plugin;
+st_mysql_plugin *mysql_mandatory_plugins[] = {nullptr};
+st_mysql_plugin *mysql_optional_plugins[] = {nullptr};
+
+int main()
+{
+  mhnsw_run_benchmark();
+  return 0;
+}
\ No newline at end of file
diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc
new file mode 100644
index 0000000000000..e45ff50952b04
--- /dev/null
+++ b/sql/vector_mhnsw_x86.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2024, MariaDB plc
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
+
+#include <my_global.h>
+#include <cstddef>
+#include <cstdint>
+
+#ifdef _MSC_VER
+# include <intrin.h>
+#else
+# include <cpuid.h>
+#endif
+
+constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27;
+constexpr uint32_t cpuid_ebx_AVX2= 1U << 5;
+constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30; // AVX512F and AVX512BW
+
+
+static uint32_t cpuid_ecx()
+{
+#ifdef __GNUC__
+  uint32_t eax, ebx, ecx, edx;
+  __cpuid(1, eax, ebx, ecx, edx);
+  return ecx;
+#elif defined _MSC_VER
+  int regs[4];
+  __cpuid(regs, 1);
+  return regs[2];
+#endif
+}
+
+static uint32_t cpuid_ebx_7()
+{
+#ifdef __GNUC__
+  uint32_t eax, ebx, ecx, edx;
+  __cpuid_count(7, 0, eax, ebx, ecx, edx);
+  return ebx;
+#elif defined _MSC_VER
+  int regs[4];
+  __cpuidex(regs, 7, 0);
+  return regs[1];
+#endif
+}
+
+struct FVector;
+typedef struct Vector_ops
+{
+  float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
+  size_t (*alloc_size)(size_t n);
+  FVector * (*align_ptr)(void *ptr);
+  void (*fix_tail)(int16_t *dims, size_t vec_len);
+} Vector_ops;
+
+extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t);
+extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t);
+extern "C" size_t alloc_size_avx2(size_t);
+extern "C" FVector *align_ptr_avx2(void*);
+extern "C" void fix_tail_avx2(int16_t*, size_t);
+extern "C" size_t alloc_size_avx512(size_t);
+extern "C" FVector *align_ptr_avx512(void*);
+extern "C" void fix_tail_avx512(int16_t*, size_t);
+
+extern "C" Vector_ops vector_ops_x86_available(void)
+{
+  const uint32_t ecx = cpuid_ecx();
+  if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
+    return {nullptr, nullptr, nullptr, nullptr};
+
+  const uint32_t ebx = cpuid_ebx_7();
+  if ((ebx & cpuid_ebx_AVX512) == cpuid_ebx_AVX512)
+    return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
+
+  if (ebx & cpuid_ebx_AVX2)
+    return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
+
+  return {nullptr, nullptr, nullptr, nullptr};
+}

From 4162e01f68dd1a70191b0ff8e2e7b6b9d8984cf2 Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Thu, 5 Mar 2026 07:52:03 +0200
Subject: [PATCH 2/7] Vector Search: Add OS support verification

Use xgetbv to check before enabling the right paths.
---
 sql/vector_mhnsw_x86.cc | 49 +++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc
index e45ff50952b04..9d7c2e35c378e 100644
--- a/sql/vector_mhnsw_x86.cc
+++ b/sql/vector_mhnsw_x86.cc
@@ -25,35 +25,71 @@
 
 constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27;
 constexpr uint32_t cpuid_ebx_AVX2= 1U << 5;
-constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30; // AVX512F and AVX512BW
-
+constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30;
 
 static uint32_t cpuid_ecx()
 {
 #ifdef __GNUC__
-  uint32_t eax, ebx, ecx, edx;
+  uint32_t eax=0, ebx=0, ecx=0, edx=0;
   __cpuid(1, eax, ebx, ecx, edx);
   return ecx;
 #elif defined _MSC_VER
   int regs[4];
   __cpuid(regs, 1);
   return regs[2];
+#else
+# error "unknown compiler"
 #endif
 }
 
 static uint32_t cpuid_ebx_7()
 {
 #ifdef __GNUC__
-  uint32_t eax, ebx, ecx, edx;
+  uint32_t eax=0, ebx=0, ecx=0, edx=0;
   __cpuid_count(7, 0, eax, ebx, ecx, edx);
   return ebx;
 #elif defined _MSC_VER
   int regs[4];
   __cpuidex(regs, 7, 0);
   return regs[1];
+#else
+# error "unknown compiler"
+#endif
+}
+
+
+static uint64_t xgetbv() {
+#ifdef _MSC_VER
+    return _xgetbv(0);
+#else
+/* builtin xgetbv is only supported in clang 9+, so use inline assembly directly*/
+    uint32_t eax, edx;
+    __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+    return ((uint64_t)edx << 32) | eax;
 #endif
 }
 
+static bool os_have_avx2()
+{
+  return (xgetbv() & 0x06) == 0x06;
+}
+static bool os_have_avx512()
+{
+  return (xgetbv() & 0xe6) == 0xe6;
+}
+static bool cpu_has_avx2()
+{
+  uint32_t ebx7 = cpuid_ebx_7();
+  return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2();
+}
+
+static bool cpu_has_avx512()
+{
+  uint32_t ebx7 = cpuid_ebx_7();
+  return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512();
+}
+
+
 struct FVector;
 typedef struct Vector_ops
 {
@@ -78,11 +114,10 @@ extern "C" Vector_ops vector_ops_x86_available(void)
   if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
     return {nullptr, nullptr, nullptr, nullptr};
 
-  const uint32_t ebx = cpuid_ebx_7();
-  if ((ebx & cpuid_ebx_AVX512) == cpuid_ebx_AVX512)
+  if (cpu_has_avx512())
     return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
 
-  if (ebx & cpuid_ebx_AVX2)
+  if (cpu_has_avx2())
     return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
 
   return {nullptr, nullptr, nullptr, nullptr};

From e9ae69f1c3db074b1c4a470774912c9d5f6292f2 Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Thu, 5 Mar 2026 11:10:53 +0200
Subject: [PATCH 3/7] mhnsw: compiler-independent choice

using if/else to branch every call, but
detection cpu level done once
---
 libmysqld/CMakeLists.txt |   3 +
 sql/vector_mhnsw.cc      | 134 +++++++++++++++++++++++++++++++++------
 sql/vector_mhnsw_x86.cc  |  69 ++++++++++----------
 3 files changed, 153 insertions(+), 53 deletions(-)

diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index eb5e38427f7f6..4c56a40ca0fcd 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -170,6 +170,9 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
            ${MYSYS_LIBWRAP_SOURCE}
 )
 
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86")
+  LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc)
+ENDIF()
 
 ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES})
 DTRACE_INSTRUMENT(sql_embedded)
diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc
index 69723c7fd2610..d6dfca8316cc5 100644
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@@ -193,13 +193,13 @@ struct FVector
 };
 #pragma pack(pop)
 
-struct Vector_ops 
-{
-  float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
-  size_t (*alloc_size)(size_t n);
-  FVector * (*align_ptr)(void *ptr);
-  void (*fix_tail)(int16_t *dims, size_t vec_len);
-};
+// struct Vector_ops 
+// {
+//   float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
+//   size_t (*alloc_size)(size_t n);
+//   FVector * (*align_ptr)(void *ptr);
+//   void (*fix_tail)(int16_t *dims, size_t vec_len);
+// };
 
 
 #ifdef AVX2_IMPLEMENTATION
@@ -391,45 +391,137 @@ struct Vector_ops
   FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; }
   void fix_tail_default(int16_t *dims, size_t) { }
 
+
+/*******************/
 #if defined __x86_64__ || defined _M_X64
-extern "C" Vector_ops vector_ops_x86_available(void);
+// extern "C" Vector_ops vector_ops_x86_available(void);
+extern "C" bool cpu_has_avx2();
+extern "C" bool cpu_has_avx512();
+extern "C" bool cpu_has_avx_and_xsave();
 #endif
 
-static Vector_ops choose_vector_ops_impl()
+enum class CpuLevel 
 {
+  AVX2, 
+  AVX512,
+  NEON,
+  POWERPC,
+  DEFAULT
+};
+
+static CpuLevel detect_cpu_level() {
 #if defined __x86_64__ || defined _M_X64
-  auto ops = vector_ops_x86_available();
-  if (ops.dot_product)
-    return ops;
+    if (!cpu_has_avx_and_xsave())
+        return CpuLevel::DEFAULT;
+    if (cpu_has_avx512())
+        return CpuLevel::AVX512;
+    else if (cpu_has_avx2())
+        return CpuLevel::AVX2;
+    else
+        return CpuLevel::DEFAULT;
 #elif defined __aarch64__
-  return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
+    return CpuLevel::NEON;
 #elif defined __powerpc64__
-  return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
+    return CpuLevel::POWERPC;
+#else
+    return CpuLevel::DEFAULT;
 #endif
-  return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
 }
 
-static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();
+static const CpuLevel cpu_level = detect_cpu_level();
+
+// static Vector_ops choose_vector_ops_impl()
+// {
+// #if defined __x86_64__ || defined _M_X64 
+//   auto ops = vector_ops_x86_available();
+//   if (ops.dot_product)
+//     return ops;
+// #elif defined __aarch64__
+//   return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
+// #elif defined __powerpc64__
+//   return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
+// #endif
+//   return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
+// }
+
+// static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();
 
 float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len)
 {
-  return chosen_vector_ops.dot_product(v1, v2, len);
+#ifdef AVX512_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX512)
+    return dot_product_avx512(v1, v2, len);
+#elif AVX2_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX2)
+    return dot_product_avx2(v1, v2, len);
+#elif NEON_IMPLEMENTATION
+  if (cpu_level == CpuLevel::NEON)
+    return dot_product_neon(v1, v2, len);
+#elif POWER_IMPLEMENTATION
+  if (cpu_level == CpuLevel::POWERPC)
+    return dot_product_power(v1, v2, len);
+#else
+    return dot_product_default(v1, v2, len);
+#endif
 }
 size_t FVector::alloc_size(size_t n)
 {
-  return chosen_vector_ops.alloc_size(n);
+#ifdef AVX512_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX512)
+    return alloc_size_avx512(n);
+#elif AVX2_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX2)
+    return alloc_size_avx2(n);
+#elif NEON_IMPLEMENTATION
+  if (cpu_level == CpuLevel::NEON)
+    return alloc_size_neon(n);
+#elif POWER_IMPLEMENTATION
+  if (cpu_level == CpuLevel::POWERPC)
+    return alloc_size_power(n);
+#else
+    return alloc_size_default(n);
+#endif
+
 }
 FVector *FVector::align_ptr(void *ptr)
 {
-  return chosen_vector_ops.align_ptr(ptr);
+#ifdef AVX512_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX512)
+    return align_ptr_avx512(ptr);
+#elif AVX2_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX2)
+    return align_ptr_avx2(ptr);
+#elif NEON_IMPLEMENTATION
+  if (cpu_level == CpuLevel::NEON)
+    return align_ptr_neon(ptr);
+#elif POWER_IMPLEMENTATION
+  if (cpu_level == CpuLevel::POWERPC)
+    return align_ptr_power(ptr);
+#else
+    return align_ptr_default(ptr);
+#endif
 }
 void FVector::fix_tail(int16_t *dims, size_t vec_len)
 {
-  chosen_vector_ops.fix_tail(dims, vec_len);
+#ifdef AVX512_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX512)
+    return fix_tail_avx512(dims, vec_len);
+#elif AVX2_IMPLEMENTATION
+  if (cpu_level == CpuLevel::AVX2)
+    return fix_tail_avx2(dims, vec_len);
+#elif NEON_IMPLEMENTATION
+  if (cpu_level == CpuLevel::NEON)
+    return fix_tail_neon(dims, vec_len);
+#elif POWER_IMPLEMENTATION
+  if (cpu_level == CpuLevel::POWERPC)
+    return fix_tail_power(dims, vec_len);
+#else
+    return fix_tail_default(dims, vec_len);
+#endif
 }
 
 /*
-  A simple benchmark to test the performance of the dot product function.
+  A temporary benchmark to test the performance of the dot product function.
 */
 #include <iostream>
 #include <random>
diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc
index 9d7c2e35c378e..7c9c1f4863bc8 100644
--- a/sql/vector_mhnsw_x86.cc
+++ b/sql/vector_mhnsw_x86.cc
@@ -77,48 +77,53 @@ static bool os_have_avx512()
 {
   return (xgetbv() & 0xe6) == 0xe6;
 }
-static bool cpu_has_avx2()
+
+extern "C" bool cpu_has_avx2()
 {
   uint32_t ebx7 = cpuid_ebx_7();
   return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2();
 }
-
-static bool cpu_has_avx512()
+extern "C" bool cpu_has_avx512()
 {
   uint32_t ebx7 = cpuid_ebx_7();
   return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512();
 }
+extern "C" bool cpu_has_avx_and_xsave() {
+    uint32_t ecx = cpuid_ecx();
+    return (ecx & cpuid_ecx_AVX_AND_XSAVE) == cpuid_ecx_AVX_AND_XSAVE;
+}
 
 
-struct FVector;
-typedef struct Vector_ops
-{
-  float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
-  size_t (*alloc_size)(size_t n);
-  FVector * (*align_ptr)(void *ptr);
-  void (*fix_tail)(int16_t *dims, size_t vec_len);
-} Vector_ops;
-
-extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t);
-extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t);
-extern "C" size_t alloc_size_avx2(size_t);
-extern "C" FVector *align_ptr_avx2(void*);
-extern "C" void fix_tail_avx2(int16_t*, size_t);
-extern "C" size_t alloc_size_avx512(size_t);
-extern "C" FVector *align_ptr_avx512(void*);
-extern "C" void fix_tail_avx512(int16_t*, size_t);
-
-extern "C" Vector_ops vector_ops_x86_available(void)
-{
-  const uint32_t ecx = cpuid_ecx();
-  if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
-    return {nullptr, nullptr, nullptr, nullptr};
 
-  if (cpu_has_avx512())
-    return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
+// struct FVector;
+// typedef struct Vector_ops
+// {
+//   float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
+//   size_t (*alloc_size)(size_t n);
+//   FVector * (*align_ptr)(void *ptr);
+//   void (*fix_tail)(int16_t *dims, size_t vec_len);
+// } Vector_ops;
 
-  if (cpu_has_avx2())
-    return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
+// extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t);
+// extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t);
+// extern "C" size_t alloc_size_avx2(size_t);
+// extern "C" FVector *align_ptr_avx2(void*);
+// extern "C" void fix_tail_avx2(int16_t*, size_t);
+// extern "C" size_t alloc_size_avx512(size_t);
+// extern "C" FVector *align_ptr_avx512(void*);
+// extern "C" void fix_tail_avx512(int16_t*, size_t);
 
-  return {nullptr, nullptr, nullptr, nullptr};
-}
+// extern "C" Vector_ops vector_ops_x86_available(void)
+// {
+//   const uint32_t ecx = cpuid_ecx();
+//   if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
+//     return {nullptr, nullptr, nullptr, nullptr};
+
+//   if (cpu_has_avx512())
+//     return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
+
+//   if (cpu_has_avx2())
+//     return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
+
+//   return {nullptr, nullptr, nullptr, nullptr};
+// }              
\ No newline at end of file

From 5c39550a2c6a8b073c774ed7984d0ddc4fa40a89 Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Thu, 5 Mar 2026 13:13:31 +0200
Subject: [PATCH 4/7] Fix build errors: add missing returns and include
 <immintrin.h>

---
 sql/bloom_filters.h |  3 +++
 sql/vector_mhnsw.cc | 52 +++++++++++++++++++++++++++------------------
 2 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/sql/bloom_filters.h b/sql/bloom_filters.h
index 0bed6c56e51c5..815140d516ca6 100644
--- a/sql/bloom_filters.h
+++ b/sql/bloom_filters.h
@@ -37,6 +37,9 @@ SOFTWARE.
 #if __GNUC__ >= 5 
 #define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw")))
 #endif
+#ifdef HAVE_IMMINTRIN_H
+#include <immintrin.h>
+#endif
 #elif defined _MSC_VER && (defined _M_X64 || defined _M_IX86)
 #include <immintrin.h>
 #define AVX2_IMPLEMENTATION
diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc
index d6dfca8316cc5..fe3b165e3bf07 100644
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@@ -451,78 +451,87 @@ float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len)
 #ifdef AVX512_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX512)
     return dot_product_avx512(v1, v2, len);
-#elif AVX2_IMPLEMENTATION
+#endif
+#ifdef AVX2_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX2)
     return dot_product_avx2(v1, v2, len);
-#elif NEON_IMPLEMENTATION
+#endif
+#ifdef NEON_IMPLEMENTATION
   if (cpu_level == CpuLevel::NEON)
     return dot_product_neon(v1, v2, len);
-#elif POWER_IMPLEMENTATION
+#endif
+#ifdef POWER_IMPLEMENTATION
   if (cpu_level == CpuLevel::POWERPC)
     return dot_product_power(v1, v2, len);
-#else
-    return dot_product_default(v1, v2, len);
 #endif
+    return dot_product_default(v1, v2, len);
 }
 size_t FVector::alloc_size(size_t n)
 {
 #ifdef AVX512_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX512)
     return alloc_size_avx512(n);
-#elif AVX2_IMPLEMENTATION
+#endif
+#ifdef AVX2_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX2)
     return alloc_size_avx2(n);
-#elif NEON_IMPLEMENTATION
+#endif
+#ifdef NEON_IMPLEMENTATION
   if (cpu_level == CpuLevel::NEON)
     return alloc_size_neon(n);
-#elif POWER_IMPLEMENTATION
+#endif
+#ifdef POWER_IMPLEMENTATION
   if (cpu_level == CpuLevel::POWERPC)
     return alloc_size_power(n);
-#else
-    return alloc_size_default(n);
 #endif
-
+    return alloc_size_default(n);
 }
 FVector *FVector::align_ptr(void *ptr)
 {
 #ifdef AVX512_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX512)
     return align_ptr_avx512(ptr);
-#elif AVX2_IMPLEMENTATION
+#endif
+#ifdef AVX2_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX2)
     return align_ptr_avx2(ptr);
-#elif NEON_IMPLEMENTATION
+#endif
+#ifdef NEON_IMPLEMENTATION
   if (cpu_level == CpuLevel::NEON)
     return align_ptr_neon(ptr);
-#elif POWER_IMPLEMENTATION
+#endif
+#ifdef POWER_IMPLEMENTATION
   if (cpu_level == CpuLevel::POWERPC)
     return align_ptr_power(ptr);
-#else
-    return align_ptr_default(ptr);
 #endif
+    return align_ptr_default(ptr);
 }
+
 void FVector::fix_tail(int16_t *dims, size_t vec_len)
 {
 #ifdef AVX512_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX512)
     return fix_tail_avx512(dims, vec_len);
-#elif AVX2_IMPLEMENTATION
+#endif
+#ifdef AVX2_IMPLEMENTATION
   if (cpu_level == CpuLevel::AVX2)
     return fix_tail_avx2(dims, vec_len);
-#elif NEON_IMPLEMENTATION
+#endif
+#ifdef NEON_IMPLEMENTATION
   if (cpu_level == CpuLevel::NEON)
     return fix_tail_neon(dims, vec_len);
-#elif POWER_IMPLEMENTATION
+#endif
+#ifdef POWER_IMPLEMENTATION
   if (cpu_level == CpuLevel::POWERPC)
     return fix_tail_power(dims, vec_len);
-#else
-    return fix_tail_default(dims, vec_len);
 #endif
+    return fix_tail_default(dims, vec_len);
 }
 
 /*
   A temporary benchmark to test the performance of the dot product function.
 */
+#ifdef __x86_64__ 
 #include <iostream>
 #include <random>
 
@@ -564,6 +573,7 @@ void mhnsw_run_benchmark()
   free(vec1);
   free(vec2);
 }
+#endif
 
 /*
   An array of pointers to graph nodes

From 6d6becd48def1188d1ae5890a82adb8dd95ea2ac Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Mon, 9 Mar 2026 13:00:39 +0200
Subject: [PATCH 5/7] MDEV-34804 mhnsw: compiler-independent choice of
 CPU-specific optimizations

Replace GCC-specific __attribute__((target(...))) function multi-versioning
with a function pointer dispatch mechanism for dot_product, alloc_size,
align_ptr, and fix_tail.

A Vector_ops struct holds function pointers for all four operations.
At startup, choose_vector_ops_impl() probes CPU capabilities and selects
the best available implementation (AVX-512, AVX2, NEON, POWER, or fallback).

This approach works on GCC, Clang, MSVC, and musl libc, matching
the pattern used in mysys/crc32/crc32c.cc.

Other changes:
- Replace GCC vector_size extension in dot_product_avx2 with portable
  AVX2 intrinsics (_mm256_add_ps, _mm256_extractf128_ps, etc.)
- Decouple bloom_filters.h macros from vector dispatch macros.
---
 libmysqld/CMakeLists.txt  |   2 +-
 sql/CMakeLists.txt        |   3 -
 sql/bloom_filters.h       |  35 +++---
 sql/vector_mhnsw.cc       | 251 ++++++++++----------------------------
 sql/vector_mhnsw_bench.cc |  10 --
 sql/vector_mhnsw_x86.cc   |  61 ++++-----
 sql/vector_mhnsw_x86.h    |  37 ++++++
 7 files changed, 143 insertions(+), 256 deletions(-)
 delete mode 100644 sql/vector_mhnsw_bench.cc
 create mode 100644 sql/vector_mhnsw_x86.h

diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index 4c56a40ca0fcd..cc9990b2dee64 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -171,7 +171,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
 )
 
 IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86")
-  LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc)
+  LIST(APPEND SQL_EMBEDDED_SOURCES ../sql/vector_mhnsw_x86.cc)
 ENDIF()
 
 ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES})
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 51f1528506f8f..b2d61993cd9a0 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -557,9 +557,6 @@ IF(WIN32)
   TARGET_LINK_LIBRARIES(mariadb-upgrade-service mysys winservice)
 ENDIF(WIN32)
 
-ADD_EXECUTABLE(vector_bench vector_mhnsw_bench.cc)
-TARGET_LINK_LIBRARIES(vector_bench sql sql_builtins)
-
 IF(NOT WITH_WSREP)
   SET(EXCL_WSREP "wsrep_[a-np-z]*.h")
 ENDIF()
diff --git a/sql/bloom_filters.h b/sql/bloom_filters.h
index 815140d516ca6..55ae72d09231f 100644
--- a/sql/bloom_filters.h
+++ b/sql/bloom_filters.h
@@ -29,38 +29,35 @@ SOFTWARE.
 #include <vector>
 #include <algorithm>
 
-#undef AVX2_IMPLEMENTATION
-#undef AVX512_IMPLEMENTATION
-
-#if defined __GNUC__ && (defined __i386__||defined __x86_64__)
+/*
+  Use gcc function multiversioning to optimize for a specific CPU with run-time
+  detection. Works only for x86, for other architectures we provide only one
+  implementation for now.
+*/
+#define DEFAULT_IMPLEMENTATION
+#if __GNUC__ > 7 && defined(__GLIBC__)
+#ifdef __x86_64__
+#ifdef HAVE_IMMINTRIN_H
+#include <immintrin.h>
+#undef DEFAULT_IMPLEMENTATION
+#define DEFAULT_IMPLEMENTATION    __attribute__ ((target ("default")))
 #define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma")))
-#if __GNUC__ >= 5 
+#if __GNUC__ > 9
 #define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw")))
 #endif
-#ifdef HAVE_IMMINTRIN_H
-#include <immintrin.h>
 #endif
-#elif defined _MSC_VER && (defined _M_X64 || defined _M_IX86)
-#include <immintrin.h>
-#define AVX2_IMPLEMENTATION
-#define AVX512_IMPLEMENTATION
-#endif 
-
+#endif
 #ifdef __aarch64__
 #include <arm_neon.h>
+#undef DEFAULT_IMPLEMENTATION
 #define NEON_IMPLEMENTATION
 #endif
-
+#endif
 #if defined __powerpc64__ && defined __VSX__
 #include <altivec.h>
 #define POWER_IMPLEMENTATION
 #endif
 
-#if !defined(AVX2_IMPLEMENTATION) && !defined(NEON_IMPLEMENTATION) && !defined(POWER_IMPLEMENTATION)
-#define DEFAULT_IMPLEMENTATION
-#endif
-
-
 template <typename T>
 struct PatternedSimdBloomFilter
 {
diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc
index fe3b165e3bf07..86ae5a68c5a96 100644
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@@ -23,6 +23,32 @@
 #include <scope.h>
 #include <my_atomic_wrapper.h>
 #include "bloom_filters.h"
+#include "vector_mhnsw_x86.h"
+
+/*
+  Independent SIMD macros for mhnsw vector operations.
+  These are separate from bloom_filters.h macros, which
+  the bloom filter continues to use for its own purposes.
+*/
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+# ifdef HAVE_IMMINTRIN_H
+#   include <immintrin.h>
+#   define MHNSW_AVX2   __attribute__((target("avx2,avx,fma")))
+#   if __GNUC__ >= 5
+#     define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw")))
+#   endif
+# endif
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+# include <immintrin.h>
+# define MHNSW_AVX2
+# define MHNSW_AVX512
+#elif defined(__aarch64__) || defined(_M_ARM64)
+# include <arm_neon.h>
+# define MHNSW_HAVE_NEON
+#elif defined(__powerpc64__) && defined(__VSX__)
+# include <altivec.h>
+# define MHNSW_HAVE_POWER
+#endif
 
 // distance can be a little bit < 0 because of fast math
 static constexpr float NEAREST = -1.0f;
@@ -193,65 +219,57 @@ struct FVector
 };
 #pragma pack(pop)
 
-// struct Vector_ops 
-// {
-//   float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
-//   size_t (*alloc_size)(size_t n);
-//   FVector * (*align_ptr)(void *ptr);
-//   void (*fix_tail)(int16_t *dims, size_t vec_len);
-// };
-
 
-#ifdef AVX2_IMPLEMENTATION
+#ifdef MHNSW_AVX2
   /************* AVX2 *****************************************************/
   static constexpr size_t AVX2_bytes= 256/8;
   static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t);
   static_assert(subdist_part % AVX2_dims == 0);
 
   extern "C"
-  AVX2_IMPLEMENTATION
+  MHNSW_AVX2
   float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len)
   {
-    typedef float v8f __attribute__((vector_size(AVX2_bytes)));
-    union { v8f v; __m256 i; } tmp;
     __m256i *p1= (__m256i*)v1;
     __m256i *p2= (__m256i*)v2;
-    v8f d= {0};
+    __m256 d= _mm256_setzero_ps();
     for (size_t i= 0; i < (len + AVX2_dims-1)/AVX2_dims; p1++, p2++, i++)
-    {
-      tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2));
-      d+= tmp.v;
-    }
-    return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
+      d= _mm256_add_ps(d, _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2)));
+    __m128 hi= _mm256_extractf128_ps(d, 1);
+    __m128 lo= _mm256_castps256_ps128(d);
+    __m128 sum= _mm_add_ps(lo, hi);
+    sum= _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+    sum= _mm_add_ss(sum, _mm_movehdup_ps(sum));
+    return _mm_cvtss_f32(sum);
   }
 
   extern "C"
-  AVX2_IMPLEMENTATION
+  MHNSW_AVX2
   size_t alloc_size_avx2(size_t n)
   { return FVector::alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; }
 
   extern "C"
-  AVX2_IMPLEMENTATION
+  MHNSW_AVX2
   FVector *align_ptr_avx2(void *ptr)
   { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes)
                       - FVector::alloc_header); }
 
   extern "C"
-  AVX2_IMPLEMENTATION
+  MHNSW_AVX2
   void fix_tail_avx2(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2);
   }
 #endif
 
-#ifdef AVX512_IMPLEMENTATION
+#ifdef MHNSW_AVX512
   /************* AVX512 ****************************************************/
   static constexpr size_t AVX512_bytes= 512/8;
   static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t);
   static_assert(subdist_part % AVX512_dims == 0);
 
   extern "C"
-  AVX512_IMPLEMENTATION
+  MHNSW_AVX512
   float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len)
   {
     __m512i *p1= (__m512i*)v1;
@@ -263,18 +281,18 @@ struct FVector
   }
 
   extern "C"
-  AVX512_IMPLEMENTATION
+  MHNSW_AVX512
   size_t alloc_size_avx512(size_t n)
   { return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; }
 
   extern "C"
-  AVX512_IMPLEMENTATION
+  MHNSW_AVX512
   FVector *align_ptr_avx512(void *ptr)
   { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes)
                       - FVector::alloc_header); }
 
   extern "C"
-  AVX512_IMPLEMENTATION
+  MHNSW_AVX512
   void fix_tail_avx512(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2);
@@ -291,7 +309,7 @@ struct FVector
   vmull+vmlal2_high implementations.
 */
 
-#ifdef NEON_IMPLEMENTATION
+#ifdef MHNSW_HAVE_NEON
   static constexpr size_t NEON_bytes= 128 / 8;
   static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t);
   static_assert(subdist_part % NEON_dims == 0);
@@ -324,7 +342,7 @@ struct FVector
   }
 #endif
 
-#ifdef POWER_IMPLEMENTATION
+#ifdef MHNSW_HAVE_POWER
   /************* POWERPC *****************************************************/
   static constexpr size_t POWER_bytes= 128 / 8; // Assume 128-bit vector width
   static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t);
@@ -392,188 +410,45 @@ struct FVector
   void fix_tail_default(int16_t *dims, size_t) { }
 
 
-/*******************/
-#if defined __x86_64__ || defined _M_X64
-// extern "C" Vector_ops vector_ops_x86_available(void);
-extern "C" bool cpu_has_avx2();
-extern "C" bool cpu_has_avx512();
-extern "C" bool cpu_has_avx_and_xsave();
-#endif
+/*******************************CPU Dispatching*******************************/
 
-enum class CpuLevel 
-{
-  AVX2, 
-  AVX512,
-  NEON,
-  POWERPC,
-  DEFAULT
-};
 
-static CpuLevel detect_cpu_level() {
-#if defined __x86_64__ || defined _M_X64
-    if (!cpu_has_avx_and_xsave())
-        return CpuLevel::DEFAULT;
-    if (cpu_has_avx512())
-        return CpuLevel::AVX512;
-    else if (cpu_has_avx2())
-        return CpuLevel::AVX2;
-    else
-        return CpuLevel::DEFAULT;
+
+static Vector_ops choose_vector_ops_impl()
+{
+#if defined __x86_64__ || defined _M_X64 
+  auto ops = vector_ops_x86_available();
+  if (ops.dot_product)
+    return ops;
 #elif defined __aarch64__
-    return CpuLevel::NEON;
+  return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
 #elif defined __powerpc64__
-    return CpuLevel::POWERPC;
-#else
-    return CpuLevel::DEFAULT;
+  return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
 #endif
+  return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
 }
 
-static const CpuLevel cpu_level = detect_cpu_level();
-
-// static Vector_ops choose_vector_ops_impl()
-// {
-// #if defined __x86_64__ || defined _M_X64 
-//   auto ops = vector_ops_x86_available();
-//   if (ops.dot_product)
-//     return ops;
-// #elif defined __aarch64__
-//   return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
-// #elif defined __powerpc64__
-//   return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
-// #endif
-//   return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
-// }
-
-// static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();
+static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();
 
 float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len)
 {
-#ifdef AVX512_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX512)
-    return dot_product_avx512(v1, v2, len);
-#endif
-#ifdef AVX2_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX2)
-    return dot_product_avx2(v1, v2, len);
-#endif
-#ifdef NEON_IMPLEMENTATION
-  if (cpu_level == CpuLevel::NEON)
-    return dot_product_neon(v1, v2, len);
-#endif
-#ifdef POWER_IMPLEMENTATION
-  if (cpu_level == CpuLevel::POWERPC)
-    return dot_product_power(v1, v2, len);
-#endif
-    return dot_product_default(v1, v2, len);
+  return chosen_vector_ops.dot_product(v1, v2, len);
 }
 size_t FVector::alloc_size(size_t n)
 {
-#ifdef AVX512_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX512)
-    return alloc_size_avx512(n);
-#endif
-#ifdef AVX2_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX2)
-    return alloc_size_avx2(n);
-#endif
-#ifdef NEON_IMPLEMENTATION
-  if (cpu_level == CpuLevel::NEON)
-    return alloc_size_neon(n);
-#endif
-#ifdef POWER_IMPLEMENTATION
-  if (cpu_level == CpuLevel::POWERPC)
-    return alloc_size_power(n);
-#endif
-    return alloc_size_default(n);
+  return chosen_vector_ops.alloc_size(n);
 }
 FVector *FVector::align_ptr(void *ptr)
 {
-#ifdef AVX512_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX512)
-    return align_ptr_avx512(ptr);
-#endif
-#ifdef AVX2_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX2)
-    return align_ptr_avx2(ptr);
-#endif
-#ifdef NEON_IMPLEMENTATION
-  if (cpu_level == CpuLevel::NEON)
-    return align_ptr_neon(ptr);
-#endif
-#ifdef POWER_IMPLEMENTATION
-  if (cpu_level == CpuLevel::POWERPC)
-    return align_ptr_power(ptr);
-#endif
-    return align_ptr_default(ptr);
+  return chosen_vector_ops.align_ptr(ptr);
 }
 
 void FVector::fix_tail(int16_t *dims, size_t vec_len)
 {
-#ifdef AVX512_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX512)
-    return fix_tail_avx512(dims, vec_len);
-#endif
-#ifdef AVX2_IMPLEMENTATION
-  if (cpu_level == CpuLevel::AVX2)
-    return fix_tail_avx2(dims, vec_len);
-#endif
-#ifdef NEON_IMPLEMENTATION
-  if (cpu_level == CpuLevel::NEON)
-    return fix_tail_neon(dims, vec_len);
-#endif
-#ifdef POWER_IMPLEMENTATION
-  if (cpu_level == CpuLevel::POWERPC)
-    return fix_tail_power(dims, vec_len);
-#endif
-    return fix_tail_default(dims, vec_len);
+  chosen_vector_ops.fix_tail(dims, vec_len);
 }
 
-/*
-  A temporary benchmark to test the performance of the dot product function.
-*/
-#ifdef __x86_64__ 
-#include <iostream>
-#include <random>
-
-void mhnsw_run_benchmark()
-{
-  const size_t vec_len= 1024;
-  const size_t iterations= 1000000;
-
-  int16_t *vec1= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t));
-  int16_t *vec2= (int16_t*)aligned_alloc(64, vec_len * sizeof(int16_t));
-  std::mt19937 rng(42);
-  std::uniform_int_distribution<int16_t> dist(-100, 100);
-
-  for (size_t i= 0; i < vec_len; i++) vec1[i]= dist(rng);
-  for (size_t i= 0; i < vec_len; i++) vec2[i]= dist(rng);
-
-  std::cout << "Warm up....\n";
-  for (int i= 0; i < 1000; i++)
-    FVector::dot_product(vec1, vec2, vec_len);
-
-  std::cout << "Running benchmark...\n";
-  ulonglong start= my_timer_microseconds();
-
-  volatile float result= 0;
-  for (size_t i= 0; i < iterations; i++)
-    result+= FVector::dot_product(vec1, vec2, vec_len);
-
-  ulonglong end= my_timer_microseconds();
-  ulonglong duration= end - start;
-
-  double per_call= duration / (double)iterations;
-  double total_time_sec= duration / 1e6;
-
-  std::cout << "Total time in seconds: " << total_time_sec << "\n";
-  std::cout << "Total time in microseconds: " << duration << "\n";
-  std::cout << "Average time per call: " << per_call << "\n";
-  std::cout << "Result: " << result << "\n";
-
-  free(vec1);
-  free(vec2);
-}
-#endif
+/**************************************************************/
 
 /*
   An array of pointers to graph nodes
diff --git a/sql/vector_mhnsw_bench.cc b/sql/vector_mhnsw_bench.cc
deleted file mode 100644
index dc3f800a72a21..0000000000000
--- a/sql/vector_mhnsw_bench.cc
+++ /dev/null
@@ -1,10 +0,0 @@
-void mhnsw_run_benchmark();
-struct st_mysql_plugin;
-st_mysql_plugin *mysql_mandatory_plugins[] = {nullptr};
-st_mysql_plugin *mysql_optional_plugins[] = {nullptr};
-
-int main()
-{
-  mhnsw_run_benchmark();
-  return 0;
-}
\ No newline at end of file
diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc
index 7c9c1f4863bc8..97080cc1b6995 100644
--- a/sql/vector_mhnsw_x86.cc
+++ b/sql/vector_mhnsw_x86.cc
@@ -14,8 +14,7 @@
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
 
 #include <my_global.h>
-#include <cstddef>
-#include <cstdint>
+#include "vector_mhnsw_x86.h"
 
 #ifdef _MSC_VER
 # include <intrin.h>
@@ -69,61 +68,53 @@ static uint64_t xgetbv() {
 #endif
 }
 
-static bool os_have_avx2()
+bool os_have_avx2()
 {
   return (xgetbv() & 0x06) == 0x06;
 }
-static bool os_have_avx512()
+bool os_have_avx512()
 {
   return (xgetbv() & 0xe6) == 0xe6;
 }
 
-extern "C" bool cpu_has_avx2()
+bool cpu_has_avx2()
 {
   uint32_t ebx7 = cpuid_ebx_7();
   return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2();
 }
-extern "C" bool cpu_has_avx512()
+bool cpu_has_avx512()
 {
   uint32_t ebx7 = cpuid_ebx_7();
   return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512();
 }
-extern "C" bool cpu_has_avx_and_xsave() {
+bool cpu_has_avx_and_xsave() {
     uint32_t ecx = cpuid_ecx();
     return (ecx & cpuid_ecx_AVX_AND_XSAVE) == cpuid_ecx_AVX_AND_XSAVE;
 }
 
 
 
-// struct FVector;
-// typedef struct Vector_ops
-// {
-//   float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
-//   size_t (*alloc_size)(size_t n);
-//   FVector * (*align_ptr)(void *ptr);
-//   void (*fix_tail)(int16_t *dims, size_t vec_len);
-// } Vector_ops;
+struct FVector;
+extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t);
+extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t);
+extern "C" size_t alloc_size_avx2(size_t);
+extern "C" FVector *align_ptr_avx2(void*);
+extern "C" void fix_tail_avx2(int16_t*, size_t);
+extern "C" size_t alloc_size_avx512(size_t);
+extern "C" FVector *align_ptr_avx512(void*);
+extern "C" void fix_tail_avx512(int16_t*, size_t);
 
-// extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t);
-// extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t);
-// extern "C" size_t alloc_size_avx2(size_t);
-// extern "C" FVector *align_ptr_avx2(void*);
-// extern "C" void fix_tail_avx2(int16_t*, size_t);
-// extern "C" size_t alloc_size_avx512(size_t);
-// extern "C" FVector *align_ptr_avx512(void*);
-// extern "C" void fix_tail_avx512(int16_t*, size_t);
-
-// extern "C" Vector_ops vector_ops_x86_available(void)
-// {
-//   const uint32_t ecx = cpuid_ecx();
-//   if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
-//     return {nullptr, nullptr, nullptr, nullptr};
+Vector_ops vector_ops_x86_available(void)
+{
+  const uint32_t ecx = cpuid_ecx();
+  if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
+    return {nullptr, nullptr, nullptr, nullptr};
 
-//   if (cpu_has_avx512())
-//     return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
+  if (cpu_has_avx512())
+    return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
 
-//   if (cpu_has_avx2())
-//     return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
+  if (cpu_has_avx2())
+    return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
 
-//   return {nullptr, nullptr, nullptr, nullptr};
-// }              
\ No newline at end of file
+  return {nullptr, nullptr, nullptr, nullptr};
+}                
\ No newline at end of file
diff --git a/sql/vector_mhnsw_x86.h b/sql/vector_mhnsw_x86.h
new file mode 100644
index 0000000000000..841fa8e3c07aa
--- /dev/null
+++ b/sql/vector_mhnsw_x86.h
@@ -0,0 +1,37 @@
+/*
+   Copyright (c) 2024, MariaDB plc
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA
+*/
+
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+
+/*
+  Function pointer table for SIMD-accelerated vector operations.
+  Shared between vector_mhnsw.cc and vector_mhnsw_x86.cc.
+*/
+struct FVector;
+struct Vector_ops 
+{
+  float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
+  size_t (*alloc_size)(size_t n);
+  FVector * (*align_ptr)(void *ptr);
+  void (*fix_tail)(int16_t *dims, size_t vec_len);
+};
+#if defined(__x86_64__) || defined(_M_X64)
+extern "C" Vector_ops vector_ops_x86_available(void);
+#endif
\ No newline at end of file

From 2b36fe037bf756c146a0706eb4c434a28f0e4330 Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Mon, 9 Mar 2026 14:19:28 +0200
Subject: [PATCH 6/7] MDEV-34804 fix Windows build: add AMD64 to CMake
 processor check

---
 sql/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index b2d61993cd9a0..4cfdcf40660de 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -207,7 +207,7 @@ SET (SQL_SOURCE
                ${MYSYS_LIBWRAP_SOURCE}
 )
 
-IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86")
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86")
   LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc)
 ENDIF()
 

From 6bc5027e00391639da93337ef83b079ae21e314c Mon Sep 17 00:00:00 2001
From: hadeer <hadderramadan7@gmail.com>
Date: Mon, 9 Mar 2026 14:45:04 +0200
Subject: [PATCH 7/7] MDEV-34804 fix clang build: include __clang__ in AVX512
 check

---
 sql/vector_mhnsw.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc
index 86ae5a68c5a96..cb4ebf4572301 100644
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@@ -34,7 +34,7 @@
 # ifdef HAVE_IMMINTRIN_H
 #   include <immintrin.h>
 #   define MHNSW_AVX2   __attribute__((target("avx2,avx,fma")))
-#   if __GNUC__ >= 5
+#   if __GNUC__ >= 5 || defined(__clang__)
 #     define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw")))
 #   endif
 # endif