RedisAI · dor-forer · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake
@@ -73,6 +73,24 @@ if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
 	add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI)
 endif()
 
+# OPT_F16C is unusual compared to the other OPT_* macros above:
+#
+#  1. It is a *capability* gate, not a dispatch tier. Every other OPT_* maps 1:1 to a single
+#     ISA tier that owns its own translation unit (OPT_AVX2 -> AVX2.cpp, OPT_SSE4 -> SSE4.cpp).
+#     F16C owns no tier of its own; it only enables the vcvtph2ps (FP16<->FP32) conversion that
+#     several tiers need. So it is hoisted *around* multiple tiers (AVX2_FMA / AVX2 / SSE4 for
+#     the SQ8<->FP16 kernels) rather than selecting one.
+#
+#  2. It is a compound guard (CXX_F16C AND CXX_FMA AND CXX_AVX), not a single flag. F16C is
+#     VEX-encoded, so vcvtph2ps requires AVX state to execute -- emitting it without AVX is
+#     invalid. Defining OPT_F16C therefore implies AVX is present, and the F16C kernels must be
+#     compiled with -mf16c added *on top of* -mavx (see functions/*_F16C.cpp in
+#     src/VecSim/spaces/CMakeLists.txt). The base AVX2.cpp / SSE4.cpp objects stay F16C-free so
+#     they still run on CPUs without F16C.
+#
+#  3. The AVX-512 tier deliberately does NOT use this gate: _mm512_cvtph_ps is part of AVX512F
+#     itself, so the AVX-512 SQ8<->FP16 path needs only OPT_AVX512F and lives outside any
+#     OPT_F16C guard.
 if(CXX_F16C AND CXX_FMA AND CXX_AVX)
 	add_compile_definitions(OPT_F16C)
 endif()

diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
@@ -50,18 +50,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
 	endif()
 
+	# F16C is VEX-encoded and requires AVX state, so it is only meaningful when the toolchain
+	# can also emit AVX/FMA. Mirrors the OPT_F16C macro condition in x86_64InstructionFlags.cmake.
+	set(_has_full_f16c FALSE)
+	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
+		set(_has_full_f16c TRUE)
+	endif()
+
+	# Base AVX2 / AVX2+FMA dispatcher TUs hold only kernels with no F16C dependency.
+	# SQ8↔FP16 kernels (which require F16C) live in sibling TUs functions/AVX2_F16C.cpp and
+	# functions/AVX2_FMA_F16C.cpp, compiled only when _has_full_f16c is true.
 	if(CXX_AVX2)
 		message("Building with AVX2")
 		set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
 		list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
 	endif()
 
+	if(CXX_AVX2 AND _has_full_f16c)
+		message("Building functions/AVX2_F16C.cpp with AVX2 and F16C")
+		set_source_files_properties(functions/AVX2_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mf16c")
+		list(APPEND OPTIMIZATIONS functions/AVX2_F16C.cpp)
+	endif()
+
 	if(CXX_AVX2 AND CXX_FMA)
 		message("Building with AVX2 and FMA")
 		set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
 		list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
 	endif()
 
+	if(CXX_AVX2 AND CXX_FMA AND _has_full_f16c)
+		message("Building functions/AVX2_FMA_F16C.cpp with AVX2, FMA, and F16C")
+		set_source_files_properties(functions/AVX2_FMA_F16C.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
+		list(APPEND OPTIMIZATIONS functions/AVX2_FMA_F16C.cpp)
+	endif()
+
 	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
 		message("Building with CXX_F16C")
 		set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -86,6 +108,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
 	endif()
 
+	# SSE4 SQ8↔FP16 kernels need F16C, which is VEX-encoded → require -mavx alongside -mf16c
+	# (mirrors the F16C.cpp recipe above).
+	if(CXX_SSE4 AND _has_full_f16c)
+		message("Building functions/SSE4_F16C.cpp with SSE4.1, AVX, and F16C")
+		set_source_files_properties(functions/SSE4_F16C.cpp PROPERTIES COMPILE_FLAGS "-msse4.1 -mavx -mf16c")
+		list(APPEND OPTIMIZATIONS functions/SSE4_F16C.cpp)
+	endif()
+
 	if(CXX_SSE)
 		message("Building with SSE")
 		set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)

diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8_FP16.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
+ *
+ * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with _mm256_fmadd_ps.
+ */
+
+// 8-wide AVX2+FMA step: 8 SQ8 lanes + 8 FP16 lanes -> 8 FP32 fused-multiply-add.
+static inline void SQ8_FP16_InnerProductStep_AVX2_FMA(const uint8_t *&pVect1,
+                                                      const float16 *&pVect2, __m256 &sum256) {
+    __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
+    pVect1 += 8;
+    __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+    __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+    __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
+    __m256 v2_f = _mm256_cvtph_ps(v2_128);
+    pVect2 += 8;
+
+    sum256 = _mm256_fmadd_ps(v1_f, v2_f, sum256);
+}
+
+// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two accumulators break the FMA dependency chain across consecutive iterations.
+    __m256 sum_a = _mm256_setzero_ps();
+    __m256 sum_b = _mm256_setzero_ps();
+
+    if constexpr (residual % 8) {
+        constexpr int mask = (1 << (residual % 8)) - 1;
+
+        __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
+        pVec1 += residual % 8;
+        __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+        __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
+        __m256 v2_f = _mm256_cvtph_ps(v2_128);
+        v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
+        pVec2 += residual % 8;
+
+        sum_a = _mm256_mul_ps(v1_f, v2_f);
+    }
+
+    if constexpr (residual >= 8) {
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
+    }
+
+    do {
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_AVX2_FMA(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m256 sum256 = _mm256_add_ps(sum_a, sum_b);
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v,
+                                           size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX2_FMA(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX2_FMA<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8_FP16.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
+ *
+ * FP16 query lanes are widened to FP32 per 8-lane chunk via _mm256_cvtph_ps (F16C);
+ * inner-loop arithmetic runs in FP32 with separate _mm256_mul_ps + _mm256_add_ps (no FMA).
+ */
+
+// 8-wide AVX2 step (no FMA): 8 SQ8 lanes + 8 FP16 lanes -> mul + add into sum.
+static inline void SQ8_FP16_InnerProductStep_AVX2(const uint8_t *&pVect1, const float16 *&pVect2,
+                                                  __m256 &sum256) {
+    __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect1));
+    pVect1 += 8;
+    __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+    __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+    __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVect2));
+    __m256 v2_f = _mm256_cvtph_ps(v2_128);
+    pVect2 += 8;
+
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1_f, v2_f));
+}
+
+// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Two accumulators break the mul->add dependency chain (no FMA on this tier).
+    __m256 sum_a = _mm256_setzero_ps();
+    __m256 sum_b = _mm256_setzero_ps();
+
+    if constexpr (residual % 8) {
+        constexpr int mask = (1 << (residual % 8)) - 1;
+
+        __m128i v1_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVec1));
+        pVec1 += residual % 8;
+        __m256i v1_256 = _mm256_cvtepu8_epi32(v1_128);
+        __m256 v1_f = _mm256_cvtepi32_ps(v1_256);
+
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
+        __m256 v2_f = _mm256_cvtph_ps(v2_128);
+        v2_f = _mm256_blend_ps(_mm256_setzero_ps(), v2_f, mask);
+        pVec2 += residual % 8;
+
+        sum_a = _mm256_mul_ps(v1_f, v2_f);
+    }
+
+    if constexpr (residual >= 8) {
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
+    }
+
+    do {
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_a);
+        SQ8_FP16_InnerProductStep_AVX2(pVec1, pVec2, sum_b);
+    } while (pVec1 < pEnd1);
+
+    __m256 sum256 = _mm256_add_ps(sum_a, sum_b);
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX2<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX2(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX2<residual>(pVec1v, pVec2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_FP16.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include "VecSim/utils/alignment.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) = min * y_sum + delta * Σ(q_i * y_i)
+ *
+ * FP16 query lanes are widened to FP32 per 16-lane chunk via _mm512_cvtph_ps (AVX512F);
+ * inner-loop arithmetic runs in FP32 with _mm512_fmadd_ps.
+ */
+
+// 16-wide AVX512F step: 16 SQ8 lanes + 16 FP16 lanes -> 16 FP32 fused-multiply-add.
+static inline void SQ8_FP16_InnerProductStep_AVX512(const uint8_t *&pVec1, const float16 *&pVec2,
+                                                    __m512 &sum) {
+    __m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
+    __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
+    __m512 v1_f = _mm512_cvtepi32_ps(v1_512);
+
+    __m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
+    __m512 v2_f = _mm512_cvtph_ps(v2_16);
+
+    sum = _mm512_fmadd_ps(v1_f, v2_f, sum);
+
+    pVec1 += 16;
+    pVec2 += 16;
+}
+
+// pVec1v = SQ8 storage, pVec2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
+    const float16 *pVec2 = static_cast<const float16 *>(pVec2v);
+    const uint8_t *pEnd1 = pVec1 + dimension;
+
+    // Four accumulators break the FMA dependency chain to saturate both FMA ports.
+    __m512 sum0 = _mm512_setzero_ps();
+    __m512 sum1 = _mm512_setzero_ps();
+    __m512 sum2 = _mm512_setzero_ps();
+    __m512 sum3 = _mm512_setzero_ps();
+
+    if constexpr (residual > 0) {
+        __mmask16 mask = (1U << residual) - 1;
+
+        __m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
+        __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
+        __m512 v1_f = _mm512_cvtepi32_ps(v1_512);
+
+        __m256i v2_16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pVec2));
+        __m512 v2_f = _mm512_cvtph_ps(v2_16);
+
+        sum0 = _mm512_maskz_mul_ps(mask, v1_f, v2_f);
+
+        pVec1 += residual;
+        pVec2 += residual;
+    }
+
+    // Main loop: 4 chunks of 16 lanes per iteration, one chunk per accumulator.
+    while (static_cast<size_t>(pEnd1 - pVec1) >= 64) {
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum3);
+    }
+
+    // Tail: at most three remaining 16-lane chunks (post-residual remainder is a multiple of 16).
+    // Keep chunks on distinct accumulators to preserve ILP when the main loop did not run.
+    const size_t remaining = pEnd1 - pVec1;
+    if (remaining >= 16)
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum0);
+    if (remaining >= 32)
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum1);
+    if (remaining >= 48)
+        SQ8_FP16_InnerProductStep_AVX512(pVec1, pVec2, sum2);
+
+    __m512 sum = _mm512_add_ps(_mm512_add_ps(sum0, sum1), _mm512_add_ps(sum2, sum3));
+    float quantized_dot = _mm512_reduce_add_ps(sum);
+
+    const uint8_t *pVec1Base = static_cast<const uint8_t *>(pVec1v);
+    const uint8_t *params_bytes = pVec1Base + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+
+    const float16 *pVec2Base = static_cast<const float16 *>(pVec2v);
+    const auto *query_meta_bytes = reinterpret_cast<const uint8_t *>(pVec2Base + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_AVX512F(const void *pVec1v, const void *pVec2v,
+                                          size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_AVX512F<residual>(pVec1v, pVec2v, dimension);
+}