Skip to content
/ server Public
Open
3 changes: 3 additions & 0 deletions libmysqld/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
${MYSYS_LIBWRAP_SOURCE}
)

# Build the x86 SIMD kernels on Intel/AMD targets only.
# "AMD64" is what CMAKE_SYSTEM_PROCESSOR reports on Windows x64 — keep this
# regex identical to the one in sql/CMakeLists.txt.
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86")
LIST(APPEND SQL_EMBEDDED_SOURCES ../sql/vector_mhnsw_x86.cc)
ENDIF()

ADD_CONVENIENCE_LIBRARY(sql_embedded ${SQL_EMBEDDED_SOURCES})
DTRACE_INSTRUMENT(sql_embedded)
Expand Down
4 changes: 4 additions & 0 deletions sql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ SET (SQL_SOURCE
${MYSYS_LIBWRAP_SOURCE}
)

# Build the x86 SIMD kernels on Intel/AMD targets only; other architectures
# use the NEON/VSX/scalar paths compiled directly in vector_mhnsw.cc.
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i[3-6]86")
LIST(APPEND SQL_SOURCE vector_mhnsw_x86.cc)
ENDIF()

MY_CHECK_CXX_COMPILER_FLAG(-Wno-unused-but-set-variable)
IF(have_CXX__Wno_unused_but_set_variable)
IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64"
Expand Down
244 changes: 159 additions & 85 deletions sql/vector_mhnsw.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,32 @@
#include <scope.h>
#include <my_atomic_wrapper.h>
#include "bloom_filters.h"
#include "vector_mhnsw_x86.h"

/*
Independent SIMD macros for mhnsw vector operations.
These are separate from bloom_filters.h macros, which
the bloom filter continues to use for its own purposes.
*/
/*
  gcc/clang on x86: kernels carry per-function target attributes, so the
  translation unit itself can be compiled without -mavx2/-mavx512*; the
  matching kernel is selected at runtime.
*/
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
# ifdef HAVE_IMMINTRIN_H
# include <immintrin.h>
# define MHNSW_AVX2 __attribute__((target("avx2,avx,fma")))
/* the avx512f/avx512bw target attributes need gcc >= 5 (or any clang) */
# if __GNUC__ >= 5 || defined(__clang__)
# define MHNSW_AVX512 __attribute__((target("avx512f,avx512bw")))
# endif
# endif
/* MSVC compiles AVX intrinsics without per-function flags: empty macros */
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
# include <immintrin.h>
# define MHNSW_AVX2
# define MHNSW_AVX512
/* 64-bit ARM: NEON is part of the baseline ISA, no attribute needed */
#elif defined(__aarch64__) || defined(_M_ARM64)
# include <arm_neon.h>
# define MHNSW_HAVE_NEON
/* ppc64: only when VSX was enabled at compile time */
#elif defined(__powerpc64__) && defined(__VSX__)
# include <altivec.h>
# define MHNSW_HAVE_POWER
#endif

// distance can be a little bit < 0 because of fast math
static constexpr float NEAREST = -1.0f;
Expand Down Expand Up @@ -148,11 +174,15 @@ struct FVector
{ return (data_size - data_header)*2; }

static const FVector *create(const MHNSW_Share *ctx, void *mem, const void *src);
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len);
static void fix_tail(int16_t *dims, size_t vec_len);
static size_t alloc_size(size_t n);
static FVector *align_ptr(void *ptr);

void postprocess(bool use_subdist, size_t vec_len)
{
int16_t *d= dims;
fix_tail(vec_len);
fix_tail(d,vec_len);
if (use_subdist)
{
subabs2= scale * scale * dot_product(d, d, subdist_part) / 2;
Expand All @@ -164,52 +194,83 @@ struct FVector
abs2= subabs2 + scale * scale * dot_product(d, d, vec_len) / 2;
}

#ifdef AVX2_IMPLEMENTATION


/*
  Exact distance to another vector over all vec_len dimensions:
  abs2 + other->abs2 - scale*other->scale * dot(dims, other->dims).
  Note there is no factor 2 on the dot product: postprocess() stores
  abs2 already halved, so the usual |a|^2+|b|^2-2ab folds into this form.
*/
float distance_to(const FVector *other, size_t vec_len) const
{
return abs2 + other->abs2 - scale * other->scale *
dot_product(dims, other->dims, vec_len);
}

/*
  Distance with an early exit for pruning.

  First estimates the distance from the leading subdist_part dimensions
  only, linearly extrapolated to the full vector length. If that estimate
  already exceeds 'than', it is returned as-is — the caller only needs to
  know the vector is too far away. Otherwise the dot product is completed
  over the remaining dimensions, the estimate/exact ratio is recorded in
  stats->subdist, and the exact distance is returned.
*/
float distance_greater_than(const FVector *other, size_t vec_len, float than,
Stats *stats) const
{
float k = scale * other->scale;
/* partial dot product: first subdist_part dims only */
float dp= dot_product(dims, other->dims, subdist_part);
/* scale the partial distance up to the full vector length */
float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len;
if (subdist > than)
return subdist;
dp+= dot_product(dims+subdist_part, other->dims+subdist_part,
vec_len - subdist_part);
float dist= abs2 + other->abs2 - k * dp;
stats->subdist.add(subdist/dist);
return dist;
}
};
#pragma pack(pop)


#ifdef MHNSW_AVX2
/************* AVX2 *****************************************************/
static constexpr size_t AVX2_bytes= 256/8;
static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t);
static_assert(subdist_part % AVX2_dims == 0);

AVX2_IMPLEMENTATION
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
/*
  AVX2 dot product of two int16 vectors.

  _mm256_madd_epi16 multiplies 16 int16 pairs and sums adjacent products
  into 8 int32 lanes; lanes are converted to float and accumulated, then
  summed horizontally once at the end. Both inputs must be 32-byte aligned
  and zero-padded to a multiple of AVX2_dims — see align_ptr_avx2() and
  fix_tail_avx2().

  NOTE: the diff scrape had the removed v8f/union implementation and the
  old static alloc_size/align_ptr/fix_tail definitions interleaved here,
  producing duplicate declarations; only the new implementation is kept.
*/
extern "C"
MHNSW_AVX2
float dot_product_avx2(const int16_t *v1, const int16_t *v2, size_t len)
{
  __m256i *p1= (__m256i*)v1;
  __m256i *p2= (__m256i*)v2;
  __m256 d= _mm256_setzero_ps();
  for (size_t i= 0; i < (len + AVX2_dims-1)/AVX2_dims; p1++, p2++, i++)
    d= _mm256_add_ps(d, _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2)));
  /* horizontal sum of the 8 float lanes */
  __m128 hi= _mm256_extractf128_ps(d, 1);
  __m128 lo= _mm256_castps256_ps128(d);
  __m128 sum= _mm_add_ps(lo, hi);
  sum= _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
  sum= _mm_add_ss(sum, _mm_movehdup_ps(sum));
  return _mm_cvtss_f32(sum);
}

extern "C"
MHNSW_AVX2
size_t alloc_size_avx2(size_t n)
{
  /*
    Bytes needed for an FVector with n int16 dims on the AVX2 path:
    the header, the payload rounded up to whole 32-byte vectors, and
    worst-case slack so align_ptr_avx2() can always place the payload
    on a 32-byte boundary.
  */
  const size_t payload_bytes= MY_ALIGN(n*2, AVX2_bytes);
  const size_t align_slack= AVX2_bytes - 1;
  return FVector::alloc_header + payload_bytes + align_slack;
}

/*
  Bias the raw allocation pointer so that the dims[] payload, which
  starts alloc_header bytes into the FVector, lands on a 32-byte
  boundary. alloc_size_avx2() reserves AVX2_bytes-1 extra bytes to
  make this shift always possible.
*/
extern "C"
MHNSW_AVX2
FVector *align_ptr_avx2(void *ptr)
{ return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX2_bytes)
- FVector::alloc_header); }

extern "C"
MHNSW_AVX2
void fix_tail_avx2(int16_t *dims, size_t vec_len)
{
  /*
    Zero the padding between the last real dimension and the next whole
    AVX2 vector, so the deliberate over-read in dot_product_avx2() only
    ever adds zeros to the sum.
  */
  const size_t padded_len= MY_ALIGN(vec_len, AVX2_dims);
  bzero(dims + vec_len, (padded_len - vec_len)*2);
}
#endif

#ifdef AVX512_IMPLEMENTATION
#ifdef MHNSW_AVX512
/************* AVX512 ****************************************************/
static constexpr size_t AVX512_bytes= 512/8;
static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t);
static_assert(subdist_part % AVX512_dims == 0);

AVX512_IMPLEMENTATION
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
extern "C"
MHNSW_AVX512
float dot_product_avx512(const int16_t *v1, const int16_t *v2, size_t len)
{
__m512i *p1= (__m512i*)v1;
__m512i *p2= (__m512i*)v2;
Expand All @@ -219,17 +280,20 @@ struct FVector
return _mm512_reduce_add_ps(d);
}

/*
  Bytes needed for an FVector with n int16 dims on the AVX512 path:
  header + payload rounded up to whole 64-byte vectors + worst-case
  slack for align_ptr_avx512().
  (Stale removed-diff lines — the old AVX512_IMPLEMENTATION static
  alloc_size — were interleaved here and are dropped.)
*/
extern "C"
MHNSW_AVX512
size_t alloc_size_avx512(size_t n)
{ return FVector::alloc_header + MY_ALIGN(n*2, AVX512_bytes) + AVX512_bytes - 1; }

/*
  Bias the raw allocation pointer so the dims[] payload (alloc_header
  bytes in) lands on a 64-byte boundary; alloc_size_avx512() reserves
  AVX512_bytes-1 extra bytes for this shift.
  (Stale removed-diff lines — the old static align_ptr — dropped.)
*/
extern "C"
MHNSW_AVX512
FVector *align_ptr_avx512(void *ptr)
{ return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, AVX512_bytes)
                    - FVector::alloc_header); }

AVX512_IMPLEMENTATION
void fix_tail(size_t vec_len)
extern "C"
MHNSW_AVX512
void fix_tail_avx512(int16_t *dims, size_t vec_len)
{
bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX512_dims) - vec_len)*2);
}
Expand All @@ -245,12 +309,12 @@ struct FVector
vmull+vmlal2_high implementations.
*/

#ifdef NEON_IMPLEMENTATION
#ifdef MHNSW_HAVE_NEON
static constexpr size_t NEON_bytes= 128 / 8;
static constexpr size_t NEON_dims= NEON_bytes / sizeof(int16_t);
static_assert(subdist_part % NEON_dims == 0);

static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
float dot_product_neon(const int16_t *v1, const int16_t *v2, size_t len)
{
int64_t d= 0;
for (size_t i= 0; i < (len + NEON_dims - 1) / NEON_dims; i++)
Expand All @@ -265,26 +329,26 @@ struct FVector
return static_cast<float>(d);
}

/*
  Bytes needed on the NEON path: header + payload rounded up to whole
  16-byte vectors + worst-case slack for align_ptr_neon().
  (The old static alloc_size lines left by the diff scrape are dropped.)
*/
size_t alloc_size_neon(size_t n)
{ return FVector::alloc_header + MY_ALIGN(n * 2, NEON_bytes) + NEON_bytes - 1; }

/*
  Bias the raw allocation pointer so the dims[] payload (alloc_header
  bytes in) lands on a 16-byte NEON boundary.
  (The old static align_ptr lines left by the diff scrape are dropped.)
*/
FVector *align_ptr_neon(void *ptr)
{ return (FVector*) (MY_ALIGN(((intptr) ptr) + FVector::alloc_header, NEON_bytes)
                     - FVector::alloc_header); }

void fix_tail(size_t vec_len)
void fix_tail_neon(int16_t *dims, size_t vec_len)
{
bzero(dims + vec_len, (MY_ALIGN(vec_len, NEON_dims) - vec_len) * 2);
}
#endif

#ifdef POWER_IMPLEMENTATION
#ifdef MHNSW_HAVE_POWER
/************* POWERPC *****************************************************/
static constexpr size_t POWER_bytes= 128 / 8; // Assume 128-bit vector width
static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t);
static_assert(subdist_part % POWER_dims == 0);

static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
float dot_product_power(const int16_t *v1, const int16_t *v2, size_t len)
{
// Using vector long long for int64_t accumulation
vector long long ll_sum= {0, 0};
Expand Down Expand Up @@ -314,67 +378,77 @@ struct FVector
static_cast<int64_t>(ll_sum[1]));
}

static size_t alloc_size(size_t n)
size_t alloc_size_power(size_t n)
{
return alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
return FVector::alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
}

/*
  Bias the raw allocation pointer so the dims[] payload (alloc_header
  bytes in) lands on a 16-byte vector boundary.
  (The diff scrape interleaved the old static align_ptr body here; only
  the new body is kept.)
*/
FVector *align_ptr_power(void *ptr)
{
  return (FVector*)(MY_ALIGN(((intptr)ptr) + FVector::alloc_header, POWER_bytes)
                    - FVector::alloc_header);
}

void fix_tail(size_t vec_len)
void fix_tail_power(int16_t *dims, size_t vec_len)
{
bzero(dims + vec_len, (MY_ALIGN(vec_len, POWER_dims) - vec_len) * 2);
}
#undef DEFAULT_IMPLEMENTATION
#endif

/************* no-SIMD ******************************************/
/*
  Scalar fallback dot product, always compiled.

  Products of two int16 values fit in int32; the accumulator is int64 so
  the sum cannot overflow for any realistic vector length. Converted to
  float only once at the end.
  (Stale removed-diff lines — '#ifdef DEFAULT_IMPLEMENTATION' and the old
  'static float dot_product' signature — are dropped; leaving them in
  would make the file uncompilable.)
*/
static float dot_product_default(const int16_t *v1, const int16_t *v2, size_t len)
{
  int64_t d= 0;
  for (size_t i= 0; i < len; i++)
    d+= int32_t(v1[i]) * int32_t(v2[i]);
  return static_cast<float>(d);
}

/* Scalar layout: no SIMD alignment, just the header plus the raw int16 payload */
size_t alloc_size_default(size_t n) { return FVector::alloc_header + n*2; }

/* No over-alignment is required for the scalar kernels */
FVector *align_ptr_default(void *ptr) { return (FVector*)ptr; }

/* The scalar dot product reads exactly len elements, so there is no tail to zero */
void fix_tail_default(int16_t *dims, size_t) { }
/*
  NOTE(review): the stale DEFAULT_IMPLEMENTATION-prefixed old definitions
  the diff scrape left interleaved in this range were removed — the macro
  no longer exists and the duplicates would not compile.
*/
/*******************************CPU Dispatching*******************************/



/*
  Select the vector-kernel table for the running platform.

  On x86 the choice is made at runtime: vector_ops_x86_available() probes
  CPU features and returns a table with a null dot_product when no SIMD
  level is usable, in which case we fall through to the scalar kernels.
  On other architectures the choice is fixed at compile time.

  The guards deliberately reuse the same MHNSW_HAVE_* macros that compile
  the kernels above, so a referenced implementation is always actually
  built: the previous plain `__powerpc64__` guard referenced
  dot_product_power on non-VSX ppc64 where it is not compiled, the plain
  `__aarch64__` guard missed MSVC ARM64 (_M_ARM64), and 32-bit x86
  (__i386__, for which CMake builds vector_mhnsw_x86.cc) never probed the
  runtime table at all.
*/
static Vector_ops choose_vector_ops_impl()
{
#if defined __x86_64__ || defined _M_X64 || defined __i386__
  auto ops= vector_ops_x86_available();
  if (ops.dot_product)
    return ops;
#elif defined MHNSW_HAVE_NEON
  return {dot_product_neon, alloc_size_neon, align_ptr_neon, fix_tail_neon};
#elif defined MHNSW_HAVE_POWER
  return {dot_product_power, alloc_size_power, align_ptr_power, fix_tail_power};
#endif
  return {dot_product_default, alloc_size_default, align_ptr_default, fix_tail_default};
}

float distance_to(const FVector *other, size_t vec_len) const
{
return abs2 + other->abs2 - scale * other->scale *
dot_product(dims, other->dims, vec_len);
}
/* Kernel table resolved once at load time; all FVector ops dispatch through it */
static const Vector_ops chosen_vector_ops= choose_vector_ops_impl();

float distance_greater_than(const FVector *other, size_t vec_len, float than,
Stats *stats) const
{
float k = scale * other->scale;
float dp= dot_product(dims, other->dims, subdist_part);
float subdist= (subabs2 + other->subabs2 - k * dp)/subdist_part*vec_len;
if (subdist > than)
return subdist;
dp+= dot_product(dims+subdist_part, other->dims+subdist_part,
vec_len - subdist_part);
float dist= abs2 + other->abs2 - k * dp;
stats->subdist.add(subdist/dist);
return dist;
}
};
#pragma pack(pop)
/* Dispatch to the dot-product kernel selected for this CPU */
float FVector::dot_product(const int16_t *v1, const int16_t *v2, size_t len)
{
return chosen_vector_ops.dot_product(v1, v2, len);
}
/* Dispatch: allocation size including the selected kernel's alignment slack */
size_t FVector::alloc_size(size_t n)
{
return chosen_vector_ops.alloc_size(n);
}
/* Dispatch: align a raw allocation per the selected kernel's requirements */
FVector *FVector::align_ptr(void *ptr)
{
return chosen_vector_ops.align_ptr(ptr);
}

/* Dispatch: zero the padding tail required by the selected kernel */
void FVector::fix_tail(int16_t *dims, size_t vec_len)
{
chosen_vector_ops.fix_tail(dims, vec_len);
}

/**************************************************************/

/*
An array of pointers to graph nodes
Expand Down
Loading