Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ad941b8
Add design doc for SQ8↔FP16 SIMD x86 kernels [MOD-14954]
dor-forer May 26, 2026
97467b2
Append -mf16c to AVX2_FMA/AVX2/SSE4 dispatcher sources [MOD-14954]
dor-forer May 26, 2026
bab7473
Add SQ8_FP16_SpacesOptimizationTest skeleton [MOD-14954]
dor-forer May 26, 2026
671a7cc
Add AVX-512 SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
c2f8340
Add AVX2+FMA SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
415c2ed
Add AVX2 (no FMA) SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
25c5a96
Add SSE4+F16C SQ8↔FP16 SIMD distance kernels [MOD-14954]
dor-forer May 26, 2026
4b7f3eb
Update SQ8_FP16 dispatcher assertions to walk SIMD tiers [MOD-14954]
dor-forer May 26, 2026
e21cb3b
Register per-ISA SQ8↔FP16 microbenchmarks [MOD-14954]
dor-forer May 26, 2026
4c8828e
Reformat SQ8↔FP16 SIMD kernels for consistent line breaks
dor-forer May 26, 2026
fdc5c1c
Address PR review findings for SQ8↔FP16 x86 kernels [MOD-14954]
dor-forer May 28, 2026
ce16f6b
Add multi-accumulator ILP to SQ8↔FP16 x86 kernels [MOD-14954]
dor-forer May 28, 2026
658c485
Drop misleading VNNI suffix from SQ8↔FP16 AVX-512 kernel [MOD-14954]
dor-forer May 28, 2026
fe69f85
Remove SQ8↔FP16 design doc from PR [MOD-14954]
dor-forer May 28, 2026
2a4ef92
Simplify SQ8↔FP16 tests to match sister conventions [MOD-14954]
dor-forer May 28, 2026
929f694
Split SQ8↔FP16 F16C kernels into sibling TUs [MOD-14954]
dor-forer May 28, 2026
b689840
Move SQ8↔FP16 AVX-512 dispatch to AVX512F tier + flatten F16C guards …
dor-forer May 28, 2026
839fe3c
Clean up whitespace and formatting inconsistencies
dor-forer May 28, 2026
3565985
Remove obsolete SQ8-to-FP16 dispatch comments
dor-forer May 28, 2026
771bb39
Hoist OPT_F16C guard around lower SIMD tiers in SQ8↔FP16 tests [MOD-1…
dor-forer May 31, 2026
8fe3d74
Drop non-idiomatic SQ8↔FP16 tier-coverage reporter test [MOD-14954]
dor-forer May 31, 2026
999580f
Simplify SQ8↔FP16 kernels and trim PR churn [MOD-14954]
dor-forer May 31, 2026
91c14e5
Document why OPT_F16C differs from the other OPT_* macros [MOD-14954]
dor-forer May 31, 2026
f5926c2
Cover AVX512 three-chunk tail and dim<16 dispatcher guard in SQ8_FP16…
dor-forer May 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions cmake/x86_64InstructionFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@ if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI)
endif()

# OPT_F16C is unusual compared to the other OPT_* macros above:
#
# 1. It is a *capability* gate, not a dispatch tier. Every other OPT_* maps 1:1 to a single
# ISA tier that owns its own translation unit (OPT_AVX2 -> AVX2.cpp, OPT_SSE4 -> SSE4.cpp).
# F16C owns no tier of its own; it only enables the vcvtph2ps (FP16<->FP32) conversion that
# several tiers need. So it is hoisted *around* multiple tiers (AVX2_FMA / AVX2 / SSE4 for
# the SQ8<->FP16 kernels) rather than selecting one.
#
# 2. It is a compound guard (CXX_F16C AND CXX_FMA AND CXX_AVX), not a single flag. F16C is
# VEX-encoded, so vcvtph2ps requires AVX state to execute -- emitting it without AVX is
# invalid. Defining OPT_F16C therefore implies AVX is present, and the F16C kernels must be
# compiled with -mf16c added *on top of* -mavx (see functions/*_F16C.cpp in
# src/VecSim/spaces/CMakeLists.txt). The base AVX2.cpp / SSE4.cpp objects stay F16C-free so
# they still run on CPUs without F16C.
#
# 3. The AVX-512 tier deliberately does NOT use this gate: _mm512_cvtph_ps is part of AVX512F
# itself, so the AVX-512 SQ8<->FP16 path needs only OPT_AVX512F and lives outside any
# OPT_F16C guard.
if(CXX_F16C AND CXX_FMA AND CXX_AVX)
add_compile_definitions(OPT_F16C)
endif()
Expand Down
30 changes: 30 additions & 0 deletions src/VecSim/spaces/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
endif()

# F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain
# can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake.
set(_has_full_f16c FALSE)
if(CXX_F16C AND CXX_FMA AND CXX_AVX)
set(_has_full_f16c TRUE)
endif()

# Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency.
# SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and
# functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true.
if(CXX_AVX2)
message("Building with AVX2")
set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
endif()

if(CXX_AVX2 AND _has_full_f16c)
message("Building functions/AVX2_F16C.cpp with AVX2 and F16C")
set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c")
list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA)
message("Building with AVX2 and FMA")
set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c)
message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C")
set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp)
endif()

if(CXX_F16C AND CXX_FMA AND CXX_AVX)
message("Building with CXX_F16C")
set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
Expand All @@ -86,6 +108,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
endif()

# SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c
# (mirrors the F16C.cpp recipe above).
if(CXX_SSE4 AND _has_full_f16c)
message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C")
set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c")
list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp)
endif()

if(CXX_SSE)
message("Building with SSE")
set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
Expand Down
102 changes: 102 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
* inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps.
*/

// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add.
static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1,
const float16 *&pVect2, __m256 &sum256) {
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
pVect2 += 8;

sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256);
}

// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Two accumulators break the FMA dependency chain across consecutive iterations.
__m256 sum_a = _mm256_setzero_ps();
__m256 sum_b = _mm256_setzero_ps();

if constexpr (residual % 8) {
constexpr int mask = (1 << (residual % 8)) - 1;

__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
pVec1 += residual % 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
pVec2 += residual % 8;

sum_a = _mm256_mul_ps(v1_f, v2_f);
}

if constexpr (residual >= 8) {
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
}

do {
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a);
SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
} while (pVec1 < pEnd1);

__m256 sum256 = _mm256_add_ps(sum_a, sum_b);
float quantized_dot = my_mm256_reduce_add_ps(sum256);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
}
101 changes: 101 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
* inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps (no FMA).
*/

// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum.
static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2,
__m256 &sum256) {
__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
pVect1 += 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
pVect2 += 8;

sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f));
}

// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Two accumulators break the mul->add dependency chain (no FMA on this tier).
__m256 sum_a = _mm256_setzero_ps();
__m256 sum_b = _mm256_setzero_ps();

if constexpr (residual % 8) {
constexpr int mask = (1 << (residual % 8)) - 1;

__m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
pVec1 += residual % 8;
__m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
__m256 v1_f = _mm256_cvtepi32_ps(v1_256);

__m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
__m256 v2_f = _mm256_cvtph_ps(v2_128);
v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
pVec2 += residual % 8;

sum_a = _mm256_mul_ps(v1_f, v2_f);
}

if constexpr (residual >= 8) {
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
}

do {
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a);
SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
} while (pVec1 < pEnd1);

__m256 sum256 = _mm256_add_ps(sum_a, sum_b);
float quantized_dot = my_mm256_reduce_add_ps(sum256);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX2<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX2<residual>(pVec1v, pVec2v, dimension);
}
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include "VecSim/utils/alignment.h"

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per 16-lane chunk via _mm512_cvtph_ps (AVX512F);
* inner-loop arithmetic runs in FP32 with _mm512_fmadd_ps.
*/

// 16-wide AVX512F step: 16 SQ8 lanes + 16 FP16 lanes -> 16 FP32 fused-multiply-add.
static inline void SQ8_FP16_InnerProductStep_AVX512(const uint8_t *&pVec1, const float16 *&pVec2,
__m512 &sum) {
__m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
__m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
__m512 v1_f = _mm512_cvtepi32_ps(v1_512);

__m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
__m512 v2_f = _mm512_cvtph_ps(v2_16);

sum = _mm512_fmadd_ps(v1_f, v2_f, sum);

pVec1 += 16;
pVec2 += 16;
}

// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) {
const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
const uint8_t *pEnd1 = pVec1 + dimension;

// Four accumulators break the FMA dependency chain to saturate both FMA ports.
__m512 sum0 = _mm512_setzero_ps();
__m512 sum1 = _mm512_setzero_ps();
__m512 sum2 = _mm512_setzero_ps();
__m512 sum3 = _mm512_setzero_ps();

if constexpr (residual > 0) {
__mmask16 mask = (1U << residual) - 1;

__m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
__m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
__m512 v1_f = _mm512_cvtepi32_ps(v1_512);

__m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
__m512 v2_f = _mm512_cvtph_ps(v2_16);

sum0 = _mm512_maskz_mul_ps(mask, v1_f, v2_f);

pVec1 += residual;
pVec2 += residual;
}

// Main loop: 4 chunks of 16 lanes per iteration, one chunk per accumulator.
while (static_cast<size_t>(pEnd1 - pVec1) >= 64) {
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum3);
}

// Tail: at most three remaining 16-lane chunks (post-residual remainder is a multiple of 16).
// Keep chunks on distinct accumulators to preserve ILP when the main loop did not run.
const size_t remaining = pEnd1 - pVec1;
if (remaining >= 16)
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
if (remaining >= 32)
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
if (remaining >= 48)
SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);

__m512 sum = _mm512_add_ps(_mm512_add_ps(sum0, sum1), _mm512_add_ps(sum2, sum3));
float quantized_dot = _mm512_reduce_add_ps(sum);

const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
const uint8_t *params_bytes = pVec1Base + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));

const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_AVX512F(const void *pVec1v, const void *pVec2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_FP16_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_AVX512F<residual>(pVec1v, pVec2v, dimension);
}
Loading
Loading