diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt index f59354625b345..6bde9fd06852d 100644 --- a/libmysqld/CMakeLists.txt +++ b/libmysqld/CMakeLists.txt @@ -170,6 +170,9 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc ${MYSYS_LIBWRAP_SOURCE} ) +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i[3-6]86") + LIST(APPEND SQL_EMBEDDED_SOURCES ../sql/vector_mhnsw_x86.cc) +ENDIF() ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES}) DTRACE_INSTRUMENT(sql_embedded) diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index e54e894e1d0fc..40a8fe2ddbaf6 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -207,6 +207,10 @@ SET (SQL_SOURCE ${MYSYS_LIBWRAP_SOURCE} ) +IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86") + LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc) +ENDIF() + MY_CHECK_CXX_COMPILER_FLAG(-Wno-unused-but-set-variable) IF(have_CXX__Wno_unused_but_set_variable) IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" diff --git a/sql/vector_mhnsw.cc b/sql/vector_mhnsw.cc index d640363b6e76a..cb4ebf4572301 100644 --- a/sql/vector_mhnsw.cc +++ b/sql/vector_mhnsw.cc @@ -23,6 +23,32 @@ #include #include #include "bloom_filters.h" +#include "vector_mhnsw_x86.h" + +/* + Independent SIMD macros for mhnsw vector operations. + These are separate from bloom_filters.h macros, which + the bloom filter continues to use for its own purposes. +*/ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +# ifdef HAVE_IMMINTRIN_H +# include +# define MHNSW_AVX2 __attribute__((target("avx2,avx,fma"))) +# if __GNUC__ >= 5 || defined(__clang__) +# define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw"))) +# endif +# endif +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) +# include +# define MHNSW_AVX2 +# define MHNSW_AVX512 +#elif defined(__aarch64__) || defined(_M_ARM64) +# include +# define MHNSW_HAVE_NEON +#elif defined(__powerpc64__) && defined(__VSX__) +# include +# define MHNSW_HAVE_POWER +#endif // distance can be a little bit < 0 because of fast math static constexpr float NEAREST = -1.0f; @@ -148,11 +174,15 @@ struct FVector { return (data_size - data_header)*2; } static const FVector *create(const MHNSW_Share *ctx, void *mem, const void *src); + static float dot_product(const int16_t *v1, const int16_t *v2, size_t len); + static void fix_tail(int16_t *dims, size_t vec_len); + static size_t alloc_size(size_t n); + static FVector *align_ptr(void *ptr); void postprocess(bool use_subdist, size_t vec_len) { int16_t *d= dims; - fix_tail(vec_len); + fix_tail(d,vec_len); if (use_subdist) { subabs2= scale * scale * dot_product(d, d, subdist_part) / 2; @@ -164,52 +194,83 @@ struct FVector abs2= subabs2 + scale * scale * dot_product(d, d, vec_len) / 2; } -#ifdef AVX2_IMPLEMENTATION + + + float distance_to(const FVector *other, size_t vec_len) const + { + return abs2 + other->abs2 - scale * other->scale * + dot_product(dims, other->dims, vec_len); + } + + float distance_greater_than(const FVector *other, size_t vec_len, float than, + Stats *stats) const + { + float k = scale * other->scale; + float dp= dot_product(dims, other->dims, subdist_part); + float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len; + if (subdist > than) + return subdist; + dp+= dot_product(dims+subdist_part, other->dims+subdist_part, + vec_len - subdist_part); + float dist= abs2 + other->abs2 - k * dp; + stats->subdist.add(subdist/dist); + return dist; + } +}; +#pragma pack(pop) + + +#ifdef MHNSW_AVX2 /************* AVX2 *****************************************************/ static constexpr size_t AVX2_bytes= 256/8; static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t); static_assert(subdist_part % AVX2_dims == 0); - AVX2_IMPLEMENTATION - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + extern "C" + MHNSW_AVX2 + float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len) { - typedef float v8f __attribute__((vector_size(AVX2_bytes))); - union { v8f v; __m256 i; } tmp; __m256i *p1= (__m256i*)v1; __m256i *p2= (__m256i*)v2; - v8f d= {0}; + __m256 d= _mm256_setzero_ps(); for (size_t i= 0; i < (len + AVX2_dims-1)/AVX2_dims; p1++, p2++, i++) - { - tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2)); - d+= tmp.v; - } - return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7]; - } - - AVX2_IMPLEMENTATION - static size_t alloc_size(size_t n) - { return alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; } - - AVX2_IMPLEMENTATION - static FVector *align_ptr(void *ptr) - { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX2_bytes) - - alloc_header); } - - AVX2_IMPLEMENTATION - void fix_tail(size_t vec_len) + d= _mm256_add_ps(d, _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2))); + __m128 hi= _mm256_extractf128_ps(d, 1); + __m128 lo= _mm256_castps256_ps128(d); + __m128 sum= _mm_add_ps(lo, hi); + sum= _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum= _mm_add_ss(sum, _mm_movehdup_ps(sum)); + return _mm_cvtss_f32(sum); + } + + extern "C" + MHNSW_AVX2 + size_t alloc_size_avx2(size_t n) + { return FVector::alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; } + + extern "C" + MHNSW_AVX2 + FVector *align_ptr_avx2(void *ptr) + { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes) + - FVector::alloc_header); } + + extern "C" + MHNSW_AVX2 + void fix_tail_avx2(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2); } #endif -#ifdef AVX512_IMPLEMENTATION +#ifdef MHNSW_AVX512 /************* AVX512 ****************************************************/ static constexpr size_t AVX512_bytes= 512/8; static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t); static_assert(subdist_part % AVX512_dims == 0); - AVX512_IMPLEMENTATION - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + extern "C" + MHNSW_AVX512 + float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len) { __m512i *p1= (__m512i*)v1; __m512i *p2= (__m512i*)v2; @@ -219,17 +280,20 @@ struct FVector return _mm512_reduce_add_ps(d); } - AVX512_IMPLEMENTATION - static size_t alloc_size(size_t n) - { return alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; } + extern "C" + MHNSW_AVX512 + size_t alloc_size_avx512(size_t n) + { return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; } - AVX512_IMPLEMENTATION - static FVector *align_ptr(void *ptr) - { return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX512_bytes) - - alloc_header); } + extern "C" + MHNSW_AVX512 + FVector *align_ptr_avx512(void *ptr) + { return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes) + - FVector::alloc_header); } - AVX512_IMPLEMENTATION - void fix_tail(size_t vec_len) + extern "C" + MHNSW_AVX512 + void fix_tail_avx512(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2); } @@ -245,12 +309,12 @@ struct FVector vmull+vmlal2_high implementations. */ -#ifdef NEON_IMPLEMENTATION +#ifdef MHNSW_HAVE_NEON static constexpr size_t NEON_bytes= 128 / 8; static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t); static_assert(subdist_part % NEON_dims == 0); - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + float dot_product_neon(const int16_t *v1, const int16_t *v2, size_t len) { int64_t d= 0; for (size_t i= 0; i < (len + NEON_dims - 1) / NEON_dims; i++) @@ -265,26 +329,26 @@ struct FVector return static_cast(d); } - static size_t alloc_size(size_t n) - { return alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; } + size_t alloc_size_neon(size_t n) + { return FVector::alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; } - static FVector *align_ptr(void *ptr) - { return (FVector*) (MY_ALIGN(((intptr) ptr) + alloc_header, NEON_bytes) - - alloc_header); } + FVector *align_ptr_neon(void *ptr) + { return (FVector*) (MY_ALIGN(((intptr) ptr) + FVector::alloc_header, NEON_bytes) + - FVector::alloc_header); } - void fix_tail(size_t vec_len) + void fix_tail_neon(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, NEON_dims) - vec_len) * 2); } #endif -#ifdef POWER_IMPLEMENTATION +#ifdef MHNSW_HAVE_POWER /************* POWERPC *****************************************************/ static constexpr size_t POWER_bytes= 128 / 8; // Assume 128-bit vector width static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t); static_assert(subdist_part % POWER_dims == 0); - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + float dot_product_power(const int16_t *v1, const int16_t *v2, size_t len) { // Using vector long long for int64_t accumulation vector long long ll_sum= {0, 0}; @@ -314,28 +378,25 @@ struct FVector static_cast(ll_sum[1])); } - static size_t alloc_size(size_t n) + size_t alloc_size_power(size_t n) { - return alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1; + return FVector::alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1; } - static FVector *align_ptr(void *ptr) + FVector *align_ptr_power(void *ptr) { - return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, POWER_bytes) - - alloc_header); + return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, POWER_bytes) + - FVector::alloc_header); } - void fix_tail(size_t vec_len) + void fix_tail_power(int16_t *dims, size_t vec_len) { bzero(dims + vec_len, (MY_ALIGN(vec_len, POWER_dims) - vec_len) * 2); } -#undef DEFAULT_IMPLEMENTATION #endif - /************* no-SIMD default ******************************************/ -#ifdef DEFAULT_IMPLEMENTATION - DEFAULT_IMPLEMENTATION - static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) + /************* no-SIMD ******************************************/ + static float dot_product_default(const int16_t *v1, const int16_t *v2, size_t len) { int64_t d= 0; for (size_t i= 0; i < len; i++) @@ -343,38 +404,51 @@ struct FVector return static_cast(d); } - DEFAULT_IMPLEMENTATION - static size_t alloc_size(size_t n) { return alloc_header + n*2; } + size_t alloc_size_default(size_t n) { return FVector::alloc_header + n*2; } + + FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; } + void fix_tail_default(int16_t *dims, size_t) { } - DEFAULT_IMPLEMENTATION - static FVector *align_ptr(void *ptr) { return (FVector*)ptr; } - DEFAULT_IMPLEMENTATION - void fix_tail(size_t) { } +/*******************************CPU Dispatching*******************************/ + + + +static Vector_ops choose_vector_ops_impl() +{ +#if defined __x86_64__ || defined _M_X64 + auto ops = vector_ops_x86_available(); + if (ops.dot_product) + return ops; +#elif defined __aarch64__ + return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon}; +#elif defined __powerpc64__ + return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power}; #endif + return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default}; +} - float distance_to(const FVector *other, size_t vec_len) const - { - return abs2 + other->abs2 - scale * other->scale * - dot_product(dims, other->dims, vec_len); - } +static const Vector_ops chosen_vector_ops= choose_vector_ops_impl(); - float distance_greater_than(const FVector *other, size_t vec_len, float than, - Stats *stats) const - { - float k = scale * other->scale; - float dp= dot_product(dims, other->dims, subdist_part); - float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len; - if (subdist > than) - return subdist; - dp+= dot_product(dims+subdist_part, other->dims+subdist_part, - vec_len - subdist_part); - float dist= abs2 + other->abs2 - k * dp; - stats->subdist.add(subdist/dist); - return dist; - } -}; -#pragma pack(pop) +float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len) +{ + return chosen_vector_ops.dot_product(v1, v2, len); +} +size_t FVector::alloc_size(size_t n) +{ + return chosen_vector_ops.alloc_size(n); +} +FVector *FVector::align_ptr(void *ptr) +{ + return chosen_vector_ops.align_ptr(ptr); +} + +void FVector::fix_tail(int16_t *dims, size_t vec_len) +{ + chosen_vector_ops.fix_tail(dims, vec_len); +} + +/**************************************************************/ /* An array of pointers to graph nodes diff --git a/sql/vector_mhnsw_x86.cc b/sql/vector_mhnsw_x86.cc new file mode 100644 index 0000000000000..97080cc1b6995 --- /dev/null +++ b/sql/vector_mhnsw_x86.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2024, MariaDB plc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include +#include "vector_mhnsw_x86.h" + +#ifdef _MSC_VER +# include +#else +# include +#endif + +constexpr uint32_t cpuid_ecx_AVX_AND_XSAVE= 1U << 28 | 1U << 27; +constexpr uint32_t cpuid_ebx_AVX2= 1U << 5; +constexpr uint32_t cpuid_ebx_AVX512= 1U << 16 | 1U << 30; + +static uint32_t cpuid_ecx() +{ +#ifdef __GNUC__ + uint32_t eax=0, ebx=0, ecx=0, edx=0; + __cpuid(1, eax, ebx, ecx, edx); + return ecx; +#elif defined _MSC_VER + int regs[4]; + __cpuid(regs, 1); + return regs[2]; +#else +# error "unknown compiler" +#endif +} + +static uint32_t cpuid_ebx_7() +{ +#ifdef __GNUC__ + uint32_t eax=0, ebx=0, ecx=0, edx=0; + __cpuid_count(7, 0, eax, ebx, ecx, edx); + return ebx; +#elif defined _MSC_VER + int regs[4]; + __cpuidex(regs, 7, 0); + return regs[1]; +#else +# error "unknown compiler" +#endif +} + + +static uint64_t xgetbv() { +#ifdef _MSC_VER + return _xgetbv(0); +#else +/* builtin xgetbv is only supported in clang 9+, so use inline assembly directly*/ + uint32_t eax, edx; + __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +bool os_have_avx2() +{ + return (xgetbv() & 0x06) == 0x06; +} +bool os_have_avx512() +{ + return (xgetbv() & 0xe6) == 0xe6; +} + +bool cpu_has_avx2() +{ + uint32_t ebx7 = cpuid_ebx_7(); + return (ebx7 & cpuid_ebx_AVX2) == cpuid_ebx_AVX2 && os_have_avx2(); +} +bool cpu_has_avx512() +{ + uint32_t ebx7 = cpuid_ebx_7(); + return (ebx7 & cpuid_ebx_AVX512) == cpuid_ebx_AVX512 && os_have_avx512(); +} +bool cpu_has_avx_and_xsave() { + uint32_t ecx = cpuid_ecx(); + return (ecx & cpuid_ecx_AVX_AND_XSAVE) == cpuid_ecx_AVX_AND_XSAVE; +} + + + +struct FVector; +extern "C" float dot_product_avx2(const int16_t*, const int16_t*, size_t); +extern "C" float dot_product_avx512(const int16_t*, const int16_t*, size_t); +extern "C" size_t alloc_size_avx2(size_t); +extern "C" FVector *align_ptr_avx2(void*); +extern "C" void fix_tail_avx2(int16_t*, size_t); +extern "C" size_t alloc_size_avx512(size_t); +extern "C" FVector *align_ptr_avx512(void*); +extern "C" void fix_tail_avx512(int16_t*, size_t); + +Vector_ops vector_ops_x86_available(void) +{ + const uint32_t ecx = cpuid_ecx(); + if (~ecx & cpuid_ecx_AVX_AND_XSAVE) + return {nullptr, nullptr, nullptr, nullptr}; + + if (cpu_has_avx512()) + return {dot_product_avx512, alloc_size_avx512, align_ptr_avx512, fix_tail_avx512}; + + if (cpu_has_avx2()) + return {dot_product_avx2, alloc_size_avx2, align_ptr_avx2, fix_tail_avx2}; + + return {nullptr, nullptr, nullptr, nullptr}; +} \ No newline at end of file diff --git a/sql/vector_mhnsw_x86.h b/sql/vector_mhnsw_x86.h new file mode 100644 index 0000000000000..841fa8e3c07aa --- /dev/null +++ b/sql/vector_mhnsw_x86.h @@ -0,0 +1,37 @@ +/* + Copyright (c) 2024, MariaDB plc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA +*/ + + +#pragma once +#include +#include + +/* + Function pointer table for SIMD-accelerated vector operations. + Shared between vector_mhnsw.cc and vector_mhnsw_x86.cc. +*/ +struct FVector; +struct Vector_ops +{ + float (*dot_product)(const int16_t *v1, const int16_t *v2, size_t len); + size_t (*alloc_size)(size_t n); + FVector * (*align_ptr)(void *ptr); + void (*fix_tail)(int16_t *dims, size_t vec_len); +}; +#if defined(__x86_64__) || defined(_M_X64) +extern "C" Vector_ops vector_ops_x86_available(void); +#endif \ No newline at end of file