diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index f59354625b345..6bde9fd06852d 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -170,6 +170,9 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
            ${MYSYS_LIBWRAP_SOURCE}
 )
 
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86")
+  LIST(APPEND SQL_EMBEDDED_SOURCES ../sql/vector_mhnsw_x86.cc)
+ENDIF()
 
 ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES})
 DTRACE_INSTRUMENT(sql_embedded)
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index e54e894e1d0fc..40a8fe2ddbaf6 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -207,6 +207,10 @@ SET (SQL_SOURCE
                ${MYSYS_LIBWRAP_SOURCE}
 )
 
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86")
+  LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc)
+ENDIF()
+
 MY_CHECK_CXX_COMPILER_FLAG(-Wno-unused-but-set-variable)
 IF(have_CXX__Wno_unused_but_set_variable)
   IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64"
diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc
index d640363b6e76a..cb4ebf4572301 100644
--- a/sql/vector_mhnsw.cc
+++ b/sql/vector_mhnsw.cc
@@ -23,6 +23,32 @@
 #include <scope.h>
 #include <my_atomic_wrapper.h>
 #include "bloom_filters.h"
+#include "vector_mhnsw_x86.h"
+
+/*
+  Independent SIMD macros for mhnsw vector operations.
+  These are separate from bloom_filters.h macros, which
+  the bloom filter continues to use for its own purposes.
+*/
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+# ifdef HAVE_IMMINTRIN_H
+#   include <immintrin.h>
+#   define MHNSW_AVX2   __attribute__((target("avx2,avx,fma")))
+#   if __GNUC__ >= 5 || defined(__clang__)
+#     define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw")))
+#   endif
+# endif
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+# include <immintrin.h>
+# define MHNSW_AVX2
+# define MHNSW_AVX512
+#elif defined(__aarch64__) || defined(_M_ARM64)
+# include <arm_neon.h>
+# define MHNSW_HAVE_NEON
+#elif defined(__powerpc64__) && defined(__VSX__)
+# include <altivec.h>
+# define MHNSW_HAVE_POWER
+#endif
 
 // distance can be a little bit < 0 because of fast math
 static constexpr float NEAREST = -1.0f;
@@ -148,11 +174,15 @@ struct FVector
   { return (data_size - data_header)*2; }
 
   static const FVector *create(const MHNSW_Share *ctx, void *mem, const void *src);
+  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len);
+  static void fix_tail(int16_t *dims, size_t vec_len);
+  static size_t alloc_size(size_t n);
+  static FVector *align_ptr(void *ptr);
 
   void postprocess(bool use_subdist, size_t vec_len)
   {
     int16_t *d= dims;
-    fix_tail(vec_len);
+    fix_tail(d,vec_len);
     if (use_subdist)
     {
       subabs2= scale * scale * dot_product(d, d, subdist_part) / 2;
@@ -164,52 +194,83 @@ struct FVector
     abs2= subabs2 + scale * scale * dot_product(d, d, vec_len) / 2;
   }
 
-#ifdef AVX2_IMPLEMENTATION
+
+
+  float distance_to(const FVector *other, size_t vec_len) const
+  {
+    return abs2 + other->abs2 - scale * other->scale *
+           dot_product(dims, other->dims, vec_len);
+  }
+
+  float distance_greater_than(const FVector *other, size_t vec_len, float than,
+                              Stats *stats) const
+  {
+    float k = scale * other->scale;
+    float dp= dot_product(dims, other->dims, subdist_part);
+    float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len;
+    if (subdist > than)
+      return subdist;
+    dp+= dot_product(dims+subdist_part, other->dims+subdist_part,
+                     vec_len - subdist_part);
+    float dist= abs2 + other->abs2 - k * dp;
+    stats->subdist.add(subdist/dist);
+    return dist;
+  }
+};
+#pragma pack(pop)
+
+
+#ifdef MHNSW_AVX2
   /************* AVX2 *****************************************************/
   static constexpr size_t AVX2_bytes= 256/8;
   static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t);
   static_assert(subdist_part % AVX2_dims == 0);
 
-  AVX2_IMPLEMENTATION
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  extern "C"
+  MHNSW_AVX2
+  float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len)
   {
-    typedef float v8f __attribute__((vector_size(AVX2_bytes)));
-    union { v8f v; __m256 i; } tmp;
     __m256i *p1= (__m256i*)v1;
     __m256i *p2= (__m256i*)v2;
-    v8f d= {0};
+    __m256 d= _mm256_setzero_ps();
     for (size_t i= 0; i < (len + AVX2_dims-1)/AVX2_dims; p1++, p2++, i++)
-    {
-      tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2));
-      d+= tmp.v;
-    }
-    return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
-  }
-
-  AVX2_IMPLEMENTATION
-  static size_t alloc_size(size_t n)
-  { return alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; }
-
-  AVX2_IMPLEMENTATION
-  static FVector *align_ptr(void *ptr)
-  { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX2_bytes)
-                      - alloc_header); }
-
-  AVX2_IMPLEMENTATION
-  void fix_tail(size_t vec_len)
+      d= _mm256_add_ps(d, _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2)));
+    __m128 hi= _mm256_extractf128_ps(d, 1);
+    __m128 lo= _mm256_castps256_ps128(d);
+    __m128 sum= _mm_add_ps(lo, hi);
+    sum= _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+    sum= _mm_add_ss(sum, _mm_movehdup_ps(sum));
+    return _mm_cvtss_f32(sum);
+  }
+
+  extern "C"
+  MHNSW_AVX2
+  size_t alloc_size_avx2(size_t n)
+  { return FVector::alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; }
+
+  extern "C"
+  MHNSW_AVX2
+  FVector *align_ptr_avx2(void *ptr)
+  { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes)
+                      - FVector::alloc_header); }
+
+  extern "C"
+  MHNSW_AVX2
+  void fix_tail_avx2(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2);
   }
 #endif
 
-#ifdef AVX512_IMPLEMENTATION
+#ifdef MHNSW_AVX512
   /************* AVX512 ****************************************************/
   static constexpr size_t AVX512_bytes= 512/8;
   static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t);
   static_assert(subdist_part % AVX512_dims == 0);
 
-  AVX512_IMPLEMENTATION
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  extern "C"
+  MHNSW_AVX512
+  float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len)
   {
     __m512i *p1= (__m512i*)v1;
     __m512i *p2= (__m512i*)v2;
@@ -219,17 +280,20 @@ struct FVector
     return _mm512_reduce_add_ps(d);
   }
 
-  AVX512_IMPLEMENTATION
-  static size_t alloc_size(size_t n)
-  { return alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; }
+  extern "C"
+  MHNSW_AVX512
+  size_t alloc_size_avx512(size_t n)
+  { return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; }
 
-  AVX512_IMPLEMENTATION
-  static FVector *align_ptr(void *ptr)
-  { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX512_bytes)
-                      - alloc_header); }
+  extern "C"
+  MHNSW_AVX512
+  FVector *align_ptr_avx512(void *ptr)
+  { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes)
+                      - FVector::alloc_header); }
 
-  AVX512_IMPLEMENTATION
-  void fix_tail(size_t vec_len)
+  extern "C"
+  MHNSW_AVX512
+  void fix_tail_avx512(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2);
   }
@@ -245,12 +309,12 @@ struct FVector
   vmull+vmlal2_high implementations.
 */
 
-#ifdef NEON_IMPLEMENTATION
+#ifdef MHNSW_HAVE_NEON
   static constexpr size_t NEON_bytes= 128 / 8;
   static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t);
   static_assert(subdist_part % NEON_dims == 0);
 
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  float dot_product_neon(const int16_t *v1, const int16_t *v2, size_t len)
   {
     int64_t d= 0;
     for (size_t i= 0; i < (len + NEON_dims - 1) / NEON_dims; i++)
@@ -265,26 +329,26 @@ struct FVector
     return static_cast<float>(d);
   }
 
-  static size_t alloc_size(size_t n)
-  { return alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; }
+  size_t alloc_size_neon(size_t n)
+  { return FVector::alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; }
 
-  static FVector *align_ptr(void *ptr)
-  { return (FVector*) (MY_ALIGN(((intptr) ptr) + alloc_header, NEON_bytes)
-                       - alloc_header); }
+  FVector *align_ptr_neon(void *ptr)
+  { return (FVector*) (MY_ALIGN(((intptr) ptr) + FVector::alloc_header, NEON_bytes)
+                       - FVector::alloc_header); }
 
-  void fix_tail(size_t vec_len)
+  void fix_tail_neon(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, NEON_dims) - vec_len) * 2);
   }
 #endif
 
-#ifdef POWER_IMPLEMENTATION
+#ifdef MHNSW_HAVE_POWER
   /************* POWERPC *****************************************************/
   static constexpr size_t POWER_bytes= 128 / 8; // Assume 128-bit vector width
   static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t);
   static_assert(subdist_part % POWER_dims == 0);
 
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  float dot_product_power(const int16_t *v1, const int16_t *v2, size_t len)
   {
     // Using vector long long for int64_t accumulation
     vector long long ll_sum= {0, 0};
@@ -314,28 +378,25 @@ struct FVector
                               static_cast<int64_t>(ll_sum[1]));
   }
 
-  static size_t alloc_size(size_t n)
+  size_t alloc_size_power(size_t n)
   {
-    return alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
+    return FVector::alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
   }
 
-  static FVector *align_ptr(void *ptr)
+  FVector *align_ptr_power(void *ptr)
   {
-    return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, POWER_bytes)
-                    - alloc_header);
+    return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, POWER_bytes)
+                    - FVector::alloc_header);
   }
 
-  void fix_tail(size_t vec_len)
+  void fix_tail_power(int16_t *dims, size_t vec_len)
   {
     bzero(dims + vec_len, (MY_ALIGN(vec_len, POWER_dims) - vec_len) * 2);
   }
-#undef DEFAULT_IMPLEMENTATION
 #endif
 
-  /************* no-SIMD default ******************************************/
-#ifdef DEFAULT_IMPLEMENTATION
-  DEFAULT_IMPLEMENTATION
-  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+  /************* no-SIMD ******************************************/
+  static float dot_product_default(const int16_t *v1, const int16_t *v2, size_t len)
   {
     int64_t d= 0;
     for (size_t i= 0; i < len; i++)
@@ -343,38 +404,51 @@ struct FVector
     return static_cast<float>(d);
   }
 
-  DEFAULT_IMPLEMENTATION
-  static size_t alloc_size(size_t n) { return alloc_header + n*2; }
+  size_t alloc_size_default(size_t n) { return FVector::alloc_header + n*2; }
+
+  FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; }
+  void fix_tail_default(int16_t *dims, size_t) { }
 
-  DEFAULT_IMPLEMENTATION
-  static FVector *align_ptr(void *ptr) { return (FVector*)ptr; }
 
-  DEFAULT_IMPLEMENTATION
-  void fix_tail(size_t) { }
+/*******************************CPU Dispatching*******************************/
+
+
+
+static Vector_ops choose_vector_ops_impl()
+{
+#if defined __x86_64__ || defined _M_X64 
+  auto ops = vector_ops_x86_available();
+  if (ops.dot_product)
+    return ops;
+#elif defined __aarch64__
+  return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
+#elif defined __powerpc64__
+  return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
 #endif
+  return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
+}
 
-  float distance_to(const FVector *other, size_t vec_len) const
-  {
-    return abs2 + other->abs2 - scale * other->scale *
-           dot_product(dims, other->dims, vec_len);
-  }
+static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();
 
-  float distance_greater_than(const FVector *other, size_t vec_len, float than,
-                              Stats *stats) const
-  {
-    float k = scale * other->scale;
-    float dp= dot_product(dims, other->dims, subdist_part);
-    float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len;
-    if (subdist > than)
-      return subdist;
-    dp+= dot_product(dims+subdist_part, other->dims+subdist_part,
-                     vec_len - subdist_part);
-    float dist= abs2 + other->abs2 - k * dp;
-    stats->subdist.add(subdist/dist);
-    return dist;
-  }
-};
-#pragma pack(pop)
+float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len)
+{
+  return chosen_vector_ops.dot_product(v1, v2, len);
+}
+size_t FVector::alloc_size(size_t n)
+{
+  return chosen_vector_ops.alloc_size(n);
+}
+FVector *FVector::align_ptr(void *ptr)
+{
+  return chosen_vector_ops.align_ptr(ptr);
+}
+
+void FVector::fix_tail(int16_t *dims, size_t vec_len)
+{
+  chosen_vector_ops.fix_tail(dims, vec_len);
+}
+
+/**************************************************************/
 
 /*
   An array of pointers to graph nodes
diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc
new file mode 100644
index 0000000000000..97080cc1b6995
--- /dev/null
+++ b/sql/vector_mhnsw_x86.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2024, MariaDB plc
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
+
+#include <my_global.h>
+#include "vector_mhnsw_x86.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+#else
+# include <cpuid.h>
+#endif
+
+constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27;
+constexpr uint32_t cpuid_ebx_AVX2= 1U << 5;
+constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30;
+
+static uint32_t cpuid_ecx()
+{
+#ifdef __GNUC__
+  uint32_t eax=0, ebx=0, ecx=0, edx=0;
+  __cpuid(1, eax, ebx, ecx, edx);
+  return ecx;
+#elif defined _MSC_VER
+  int regs[4];
+  __cpuid(regs, 1);
+  return regs[2];
+#else
+# error "unknown compiler"
+#endif
+}
+
+static uint32_t cpuid_ebx_7()
+{
+#ifdef __GNUC__
+  uint32_t eax=0, ebx=0, ecx=0, edx=0;
+  __cpuid_count(7, 0, eax, ebx, ecx, edx);
+  return ebx;
+#elif defined _MSC_VER
+  int regs[4];
+  __cpuidex(regs, 7, 0);
+  return regs[1];
+#else
+# error "unknown compiler"
+#endif
+}
+
+
+static uint64_t xgetbv() {
+#ifdef _MSC_VER
+    return _xgetbv(0);
+#else
+/* builtin xgetbv is only supported in clang 9+, so use inline assembly directly*/
+    uint32_t eax, edx;
+    __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+    return ((uint64_t)edx << 32) | eax;
+#endif
+}
+
+bool os_have_avx2()
+{
+  return (xgetbv() & 0x06) == 0x06;
+}
+bool os_have_avx512()
+{
+  return (xgetbv() & 0xe6) == 0xe6;
+}
+
+bool cpu_has_avx2()
+{
+  uint32_t ebx7 = cpuid_ebx_7();
+  return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2();
+}
+bool cpu_has_avx512()
+{
+  uint32_t ebx7 = cpuid_ebx_7();
+  return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512();
+}
+bool cpu_has_avx_and_xsave() {
+    uint32_t ecx = cpuid_ecx();
+    return (ecx & cpuid_ecx_AVX_AND_XSAVE) == cpuid_ecx_AVX_AND_XSAVE;
+}
+
+
+
+struct FVector;
+extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t);
+extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t);
+extern "C" size_t alloc_size_avx2(size_t);
+extern "C" FVector *align_ptr_avx2(void*);
+extern "C" void fix_tail_avx2(int16_t*, size_t);
+extern "C" size_t alloc_size_avx512(size_t);
+extern "C" FVector *align_ptr_avx512(void*);
+extern "C" void fix_tail_avx512(int16_t*, size_t);
+
+Vector_ops vector_ops_x86_available(void)
+{
+  const uint32_t ecx = cpuid_ecx();
+  if (~ecx & cpuid_ecx_AVX_AND_XSAVE)
+    return {nullptr, nullptr, nullptr, nullptr};
+
+  if (cpu_has_avx512())
+    return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512};
+
+  if (cpu_has_avx2())
+    return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2};
+
+  return {nullptr, nullptr, nullptr, nullptr};
+}                
\ No newline at end of file
diff --git a/sql/vector_mhnsw_x86.h b/sql/vector_mhnsw_x86.h
new file mode 100644
index 0000000000000..841fa8e3c07aa
--- /dev/null
+++ b/sql/vector_mhnsw_x86.h
@@ -0,0 +1,37 @@
+/*
+   Copyright (c) 2024, MariaDB plc
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA
+*/
+
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+
+/*
+  Function pointer table for SIMD-accelerated vector operations.
+  Shared between vector_mhnsw.cc and vector_mhnsw_x86.cc.
+*/
+struct FVector;
+struct Vector_ops 
+{
+  float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len);
+  size_t (*alloc_size)(size_t n);
+  FVector * (*align_ptr)(void *ptr);
+  void (*fix_tail)(int16_t *dims, size_t vec_len);
+};
+#if defined(__x86_64__) || defined(_M_X64)
+extern "C" Vector_ops vector_ops_x86_available(void);
+#endif
\ No newline at end of file