diff --git a/.github/actions/build-faiss-native/action.yml b/.github/actions/build-faiss-native/action.yml new file mode 100644 index 000000000000..974d9390458c --- /dev/null +++ b/.github/actions/build-faiss-native/action.yml @@ -0,0 +1,112 @@ +name: 'Build FAISS Native Library' +description: 'Build FAISS native library for specified platform' +inputs: + platform: + description: 'Target platform (linux-amd64, linux-aarch64, darwin-aarch64)' + required: true + faiss-version: + description: 'FAISS version to build (e.g., 1.7.4)' + required: false + default: '1.7.4' + use-homebrew: + description: 'Whether to use Homebrew for dependencies (macOS)' + required: false + default: 'false' + +runs: + using: 'composite' + steps: + - name: Install native dependencies (Linux) + if: inputs.use-homebrew == 'false' + shell: bash + run: | + sudo apt-get update + sudo apt-get install -y \ + build-essential \ + libopenblas-dev \ + liblapack-dev \ + patchelf \ + libgomp1 \ + wget + + - name: Install GCC 9 (Linux) + if: inputs.use-homebrew == 'false' + shell: bash + run: | + sudo apt-get install -y gcc-9 g++-9 || sudo apt-get install -y gcc g++ + if command -v gcc-9 &>/dev/null; then + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 90 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 90 + fi + gcc --version + + - name: Install CMake 3.30.1 (Linux) + if: inputs.use-homebrew == 'false' + shell: bash + run: | + CMAKE_VERSION="3.30.1" + if [[ "${{ inputs.platform }}" == *"amd64"* ]]; then + CMAKE_ARCH="x86_64" + else + CMAKE_ARCH="aarch64" + fi + wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz + tar -xzf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz + sudo mv cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH} /opt/cmake + sudo ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake + cmake --version + + - name: Install FAISS (Linux) + if: inputs.use-homebrew == 'false' + shell: bash + run: | + git clone --depth 1 --branch v1.7.4 https://github.com/facebookresearch/faiss.git /tmp/faiss + cd /tmp/faiss + cmake -B build \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DBUILD_TESTING=OFF \ + -DCMAKE_BUILD_TYPE=Release + cmake --build build -j $(nproc) + sudo cmake --install build + + - name: Install dependencies (macOS) + if: inputs.use-homebrew == 'true' + shell: bash + run: | + brew install cmake libomp openblas faiss + + - name: Build native library + shell: bash + run: | + ./paimon-faiss/paimon-faiss-jni/scripts/build-native.sh --clean --fat-lib --faiss-version ${{ inputs.faiss-version }} + + - name: List built libraries (Linux) + if: inputs.use-homebrew == 'false' && inputs.platform == 'linux-amd64' + shell: bash + run: | + echo "=== Built libraries ===" + ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/linux/amd64/ + echo "" + echo "=== Library dependencies ===" + ldd paimon-faiss/paimon-faiss-jni/src/main/resources/linux/amd64/libpaimon_faiss_jni.so || true + + - name: List built libraries (Linux AARCH64) + if: inputs.use-homebrew == 'false' && inputs.platform == 'linux-aarch64' + shell: bash + run: | + echo "=== Built libraries ===" + ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/linux/aarch64/ + echo "" + echo "=== Library dependencies ===" + ldd paimon-faiss/paimon-faiss-jni/src/main/resources/linux/aarch64/libpaimon_faiss_jni.so || true + + - name: List built libraries (macOS) + if: inputs.use-homebrew == 'true' + shell: bash + run: | + echo "=== Built libraries ===" + ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/darwin/aarch64/ + echo "" + echo "=== Library dependencies ===" + otool -L paimon-faiss/paimon-faiss-jni/src/main/resources/darwin/aarch64/libpaimon_faiss_jni.dylib || true diff --git a/.github/workflows/faiss-vector-index-tests.yml b/.github/workflows/faiss-vector-index-tests.yml new file mode 100644 index 000000000000..16a1d386fb41 --- /dev/null +++ b/.github/workflows/faiss-vector-index-tests.yml @@ -0,0 +1,66 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +name: Faiss Vector Index Tests + +on: + push: + paths: + - 'paimon-faiss/**' + - '.github/workflows/faiss-vector-index-tests.yml' + pull_request: + paths: + - 'paimon-faiss/**' + - '.github/workflows/faiss-vector-index-tests.yml' + +env: + JDK_VERSION: 8 + MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }} + cancel-in-progress: true + +jobs: + build_test: + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up JDK ${{ env.JDK_VERSION }} + uses: actions/setup-java@v4 + with: + java-version: ${{ env.JDK_VERSION }} + distribution: 'temurin' + + - name: Build FAISS native library + uses: ./.github/actions/build-faiss-native + with: + platform: linux-amd64 + + - name: Build paimon-faiss-jni + shell: bash + run: mvn -B clean install -pl paimon-faiss/paimon-faiss-jni -am -DskipTests -Ppaimon-faiss + + - name: Test paimon-faiss-jni + timeout-minutes: 10 + run: mvn -T 1C -B test -pl paimon-faiss/paimon-faiss-jni -DskipFaissTests=false -Ppaimon-faiss + env: + MAVEN_OPTS: -Xmx2048m diff --git a/.github/workflows/utitcase.yml b/.github/workflows/utitcase.yml index b24ab35386d3..62939d90cff7 100644 --- a/.github/workflows/utitcase.yml +++ b/.github/workflows/utitcase.yml @@ -27,6 +27,8 @@ on: - 'paimon-python/**' - '.github/workflows/paimon-python-checks.yml' - 'paimon-lucene/**' + - 'paimon-faiss/**' + - '.github/workflows/faiss-vector-index-tests.yml' env: JDK_VERSION: 8 @@ -62,7 +64,7 @@ jobs: jvm_timezone=$(random_timezone) echo "JVM timezone is set to $jvm_timezone" - TEST_MODULES="!paimon-e2e-tests," + TEST_MODULES="!paimon-e2e-tests,!paimon-faiss/paimon-faiss-jni," for suffix in ut 3.5 3.4 3.3 3.2; do TEST_MODULES+="!org.apache.paimon:paimon-spark-${suffix}_2.12," done diff --git a/.gitignore b/.gitignore index d961d026a773..3f42fdc44a97 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,9 @@ paimon-lucene/.idea/ ### Mac OS ### .DS_Store + +### Native build artifacts ### +paimon-faiss/paimon-faiss-jni/build/ +paimon-faiss/paimon-faiss-jni/src/main/resources/darwin* +paimon-faiss/paimon-faiss-jni/src/main/resources/linux* +paimon-faiss/paimon-faiss-jni/src/main/native/cmake-build-debug/ diff --git a/paimon-faiss/paimon-faiss-jni/NOTICE b/paimon-faiss/paimon-faiss-jni/NOTICE new file mode 100644 index 000000000000..349f16941e1c --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/NOTICE @@ -0,0 +1,9 @@ +Apache Paimon Faiss JNI +Copyright 2024-2026 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (https://www.apache.org/). + +This product includes Faiss (https://github.com/facebookresearch/faiss) +developed by Facebook AI Research under the MIT License. + diff --git a/paimon-faiss/paimon-faiss-jni/pom.xml b/paimon-faiss/paimon-faiss-jni/pom.xml new file mode 100644 index 000000000000..e03525d8b1ce --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/pom.xml @@ -0,0 +1,105 @@ + + + + 4.0.0 + + + paimon-faiss + org.apache.paimon + 1.4-SNAPSHOT + + + paimon-faiss-jni + Paimon : Faiss JNI + + + 1.7.4 + + + + + org.slf4j + slf4j-api + + + + + org.junit.jupiter + junit-jupiter + test + + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + so + dylib + + + + + + + + + + release + + + + + org.apache.maven.plugins + maven-resources-plugin + + + copy-native-libs + prepare-package + + copy-resources + + + ${project.build.outputDirectory} + + + ${project.basedir}/src/main/resources + + **/*.so + **/*.dylib + + + + + + + + + + + + + diff --git a/paimon-faiss/paimon-faiss-jni/scripts/build-native.sh b/paimon-faiss/paimon-faiss-jni/scripts/build-native.sh new file mode 100755 index 000000000000..672cea9c09a8 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/scripts/build-native.sh @@ -0,0 +1,343 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +NATIVE_DIR="$PROJECT_DIR/src/main/native" +BUILD_DIR="$PROJECT_DIR/build/native" + +# Parse arguments +OPT_LEVEL="generic" +CLEAN=false +FAT_LIB=true # Default to fat lib +FAISS_VERSION="1.7.4" # Default FAISS version + +while [[ $# -gt 0 ]]; do + case $1 in + --opt-level) + OPT_LEVEL="$2" + shift 2 + ;; + --faiss-version) + FAISS_VERSION="$2" + shift 2 + ;; + --clean) + CLEAN=true + shift + ;; + --fat-lib) + FAT_LIB=true + shift + ;; + --no-fat-lib) + FAT_LIB=false + shift + ;; + --help) + echo "Usage: $0 [options]" + echo "" + echo "Options:" + echo " --opt-level LEVEL Optimization level: generic, avx2, avx512 (default: generic)" + echo " --faiss-version VER FAISS version to use (default: 1.7.4)" + echo " --fat-lib Build fat library with all dependencies (default: enabled)" + echo " --no-fat-lib Build without bundling dependencies" + echo " --clean Clean build directory before building" + echo " --help Show this help message" + echo "" + echo "Environment variables:" + echo " FAISS_ROOT Path to Faiss installation" + echo " JAVA_HOME Path to Java installation" + echo " OPENBLAS_ROOT Path to OpenBLAS installation" + echo "" + echo "Example:" + echo " FAISS_ROOT=/opt/faiss $0 --clean --fat-lib --faiss-version 1.8.0" + exit 0 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +echo "================================================" +echo "Building Paimon Faiss JNI - Native Library" +echo "================================================" +echo "FAISS version: $FAISS_VERSION" +echo "Optimization level: $OPT_LEVEL" +echo "Fat library: $FAT_LIB" +echo "" + +# Clean if requested +if [ "$CLEAN" = true ]; then + echo "Cleaning build directory..." + rm -rf "$BUILD_DIR" +fi + +# Create build directory +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +# Check for CMake cache from different source directory (cross-machine builds) +if [ -f "CMakeCache.txt" ]; then + CACHED_SOURCE=$(grep "CMAKE_HOME_DIRECTORY:INTERNAL=" CMakeCache.txt 2>/dev/null | cut -d'=' -f2) + if [ -n "$CACHED_SOURCE" ] && [ "$CACHED_SOURCE" != "$NATIVE_DIR" ]; then + echo "Detected CMake cache from different source directory." + echo " Cached: $CACHED_SOURCE" + echo " Current: $NATIVE_DIR" + echo "Cleaning build directory to avoid conflicts..." + rm -rf "$BUILD_DIR"/* + fi +fi + +# Detect platform +OS=$(uname -s) +ARCH=$(uname -m) + +echo "Detected platform: $OS $ARCH" + +# macOS specific: check for libomp +if [ "$OS" = "Darwin" ]; then + if ! brew list libomp &>/dev/null; then + echo "" + echo "WARNING: libomp not found. Installing via Homebrew..." + echo "Run: brew install libomp" + echo "" + echo "If you don't have Homebrew, install it from https://brew.sh" + echo "Or install libomp manually and set OPENMP_ROOT environment variable." + echo "" + + # Try to install automatically + if command -v brew &>/dev/null; then + brew install libomp + else + echo "ERROR: Homebrew not found. Please install libomp manually." + exit 1 + fi + else + echo "Found libomp via Homebrew" + fi +fi + +# Run CMake +echo "" +echo "Configuring with CMake..." + +CMAKE_ARGS=( + -DCMAKE_BUILD_TYPE=Release + -DFAISS_VERSION="$FAISS_VERSION" + -DFAISS_OPT_LEVEL="$OPT_LEVEL" + -DBUILD_FAT_LIB="$FAT_LIB" +) + +# Add platform-specific options +if [ "$OS" = "Darwin" ]; then + # On macOS, we might need to specify the SDK + if [ -n "$SDKROOT" ]; then + CMAKE_ARGS+=(-DCMAKE_OSX_SYSROOT="$SDKROOT") + fi + + # For Apple Silicon, we might want universal binary + if [ "$ARCH" = "arm64" ]; then + CMAKE_ARGS+=(-DCMAKE_OSX_ARCHITECTURES="arm64") + fi +fi + +# If FAISS_ROOT is set, pass it to CMake +if [ -n "$FAISS_ROOT" ]; then + CMAKE_ARGS+=(-DFAISS_ROOT="$FAISS_ROOT") + echo "Using FAISS_ROOT: $FAISS_ROOT" +fi + +# If OPENBLAS_ROOT is set, pass it to CMake +if [ -n "$OPENBLAS_ROOT" ]; then + CMAKE_ARGS+=(-DOPENBLAS_ROOT="$OPENBLAS_ROOT") + echo "Using OPENBLAS_ROOT: $OPENBLAS_ROOT" +fi + +# If JAVA_HOME is set, use it +if [ -n "$JAVA_HOME" ]; then + CMAKE_ARGS+=(-DJAVA_HOME="$JAVA_HOME") + echo "Using JAVA_HOME: $JAVA_HOME" +fi + +cmake "${CMAKE_ARGS[@]}" "$NATIVE_DIR" + +# Build +echo "" +echo "Building..." +cmake --build . --config Release -j "$(nproc 2>/dev/null || sysctl -n hw.ncpu)" + +echo "" +echo "============================================" +echo "Build completed successfully!" +echo "============================================" + +# Determine output directory based on platform +if [ "$OS" = "Linux" ]; then + PLATFORM_OS="linux" + if [ "$ARCH" = "x86_64" ] || [ "$ARCH" = "amd64" ]; then + PLATFORM_ARCH="amd64" + else + PLATFORM_ARCH="aarch64" + fi +elif [ "$OS" = "Darwin" ]; then + PLATFORM_OS="darwin" + if [ "$ARCH" = "arm64" ]; then + PLATFORM_ARCH="aarch64" + else + PLATFORM_ARCH="amd64" + fi +fi +OUTPUT_DIR="$PROJECT_DIR/src/main/resources/$PLATFORM_OS/$PLATFORM_ARCH" + +# Bundle dependency libraries if building fat lib and they're dynamically linked +if [ "$FAT_LIB" = true ] && [ "$OS" = "Linux" ]; then + echo "" + echo "Checking for dynamic dependencies to bundle..." + + JNI_LIB="$OUTPUT_DIR/libpaimon_faiss_jni.so" + if [ -f "$JNI_LIB" ]; then + # Function to bundle a library and its dependencies + bundle_lib() { + local lib_path="$1" + local target_name="$2" + + if [ -z "$lib_path" ] || [ ! -f "$lib_path" ]; then + return 1 + fi + + local real_path=$(readlink -f "$lib_path") + if [ -f "$OUTPUT_DIR/$target_name" ]; then + echo " Already bundled: $target_name" + return 0 + fi + + cp "$real_path" "$OUTPUT_DIR/$target_name" + chmod +x "$OUTPUT_DIR/$target_name" + echo " Bundled: $real_path -> $target_name" + return 0 + } + + # Libraries to bundle (pattern -> target name) + # We check the JNI lib and all bundled libs recursively + LIBS_TO_CHECK="$JNI_LIB" + LIBS_CHECKED="" + + while [ -n "$LIBS_TO_CHECK" ]; do + CURRENT_LIB=$(echo "$LIBS_TO_CHECK" | awk '{print $1}') + LIBS_TO_CHECK=$(echo "$LIBS_TO_CHECK" | cut -d' ' -f2-) + [ "$LIBS_TO_CHECK" = "$CURRENT_LIB" ] && LIBS_TO_CHECK="" + + # Skip if already checked + echo "$LIBS_CHECKED" | grep -q "$CURRENT_LIB" && continue + LIBS_CHECKED="$LIBS_CHECKED $CURRENT_LIB" + + echo "Checking dependencies of: $(basename "$CURRENT_LIB")" + + # Get all dependencies + DEPS=$(ldd "$CURRENT_LIB" 2>/dev/null | grep "=>" | awk '{print $1 " " $3}') + + while IFS= read -r dep_line; do + [ -z "$dep_line" ] && continue + DEP_NAME=$(echo "$dep_line" | awk '{print $1}') + DEP_PATH=$(echo "$dep_line" | awk '{print $2}') + + # Skip system libraries that are universally available + case "$DEP_NAME" in + linux-vdso.so*|libc.so*|libm.so*|libpthread.so*|libdl.so*|librt.so*|ld-linux*) + continue + ;; + esac + + # Bundle specific libraries we know are problematic + case "$DEP_NAME" in + libopenblas*) + if bundle_lib "$DEP_PATH" "libopenblas.so.0"; then + LIBS_TO_CHECK="$LIBS_TO_CHECK $OUTPUT_DIR/libopenblas.so.0" + fi + ;; + libgfortran*) + # Keep the original versioned name + bundle_lib "$DEP_PATH" "$DEP_NAME" + ;; + libgomp*) + bundle_lib "$DEP_PATH" "libgomp.so.1" + ;; + libquadmath*) + bundle_lib "$DEP_PATH" "$DEP_NAME" + ;; + libgcc_s*) + bundle_lib "$DEP_PATH" "$DEP_NAME" + ;; + libblas*|liblapack*) + bundle_lib "$DEP_PATH" "$DEP_NAME" + ;; + esac + done <<< "$DEPS" + done + + # Set rpath to $ORIGIN for all bundled libraries + if command -v patchelf &>/dev/null; then + echo "" + echo "Setting rpath to \$ORIGIN for all libraries..." + for lib in "$OUTPUT_DIR"/*.so*; do + if [ -f "$lib" ]; then + patchelf --set-rpath '$ORIGIN' "$lib" 2>/dev/null || true + fi + done + echo "Done setting rpath" + else + echo "" + echo "WARNING: patchelf not found, cannot set rpath" + echo "Install with: sudo apt-get install patchelf" + fi + fi +fi + +echo "" +echo "Native library location:" +BUILT_LIBS=$(find "$PROJECT_DIR/src/main/resources" -type f \( -name "*.so" -o -name "*.so.*" -o -name "*.dylib" \) 2>/dev/null) + +if [ -n "$BUILT_LIBS" ]; then + for lib in $BUILT_LIBS; do + echo "" + echo "Library: $lib" + ls -la "$lib" + + # Show library dependencies + echo "" + echo "Dependencies:" + if [ "$OS" = "Darwin" ]; then + otool -L "$lib" 2>/dev/null | head -20 || true + elif [ "$OS" = "Linux" ]; then + ldd "$lib" 2>/dev/null | head -20 || readelf -d "$lib" 2>/dev/null | grep NEEDED | head -20 || true + fi + done +else + echo " (no libraries found)" + ls -la "$PROJECT_DIR/src/main/resources/"*/*/ 2>/dev/null || true +fi + +echo "" +echo "To package the JAR with native libraries, run:" +echo " mvn package" + diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Faiss.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Faiss.java new file mode 100644 index 000000000000..1d08e1b9c7f9 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Faiss.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** Global Faiss configuration and utilities. */ +public final class Faiss { + + static { + try { + NativeLibraryLoader.load(); + } catch (FaissException e) { + throw new ExceptionInInitializerError(e); + } + } + + public static String getVersion() { + return FaissNative.getVersion(); + } + + /** + * Set the number of threads for parallel operations. + * + *

This affects operations like index training, adding vectors, and searching. Set to 1 to + * disable parallelism. + * + * @param numThreads the number of threads (must be positive) + */ + public static void setNumThreads(int numThreads) { + if (numThreads <= 0) { + throw new IllegalArgumentException("Number of threads must be positive: " + numThreads); + } + FaissNative.setNumThreads(numThreads); + } + + /** + * Get the number of threads for parallel operations. + * + * @return the current number of threads + */ + public static int getNumThreads() { + return FaissNative.getNumThreads(); + } + + /** + * Ensure the native library is loaded. + * + *

This method is called automatically when any Faiss class is used. It can be called + * explicitly to load the library early and catch any loading errors. + * + * @throws FaissException if the native library cannot be loaded + */ + public static void loadLibrary() throws FaissException { + NativeLibraryLoader.load(); + } + + /** + * Check if the native library has been loaded. + * + * @return true if the library is loaded + */ + public static boolean isLibraryLoaded() { + return NativeLibraryLoader.isLoaded(); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java new file mode 100644 index 000000000000..c670b619e899 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Exception thrown when a Faiss operation fails. + * + *

This exception wraps errors from the native Faiss library as well as errors that occur during + * JNI operations. + */ +public class FaissException extends Exception { + private static final long serialVersionUID = 1L; + + /** + * Creates a new FaissException with the specified message. + * + * @param message the error message + */ + public FaissException(String message) { + super(message); + } + + /** + * Creates a new FaissException with the specified message and cause. + * + * @param message the error message + * @param cause the underlying cause + */ + public FaissException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Creates a new FaissException with the specified cause. + * + * @param cause the underlying cause + */ + public FaissException(Throwable cause) { + super(cause); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java new file mode 100644 index 000000000000..f98a15b47f30 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.nio.ByteBuffer; + +/** + * Native method declarations for Faiss JNI with zero-copy support. + * + *

This class contains all the native method declarations that are implemented in the JNI C++ + * layer. These methods directly map to Faiss C++ API calls. + * + *

Users should not call these methods directly. Instead, use the high-level Java API classes + * like {@link Index} and {@link IndexFactory}. + * + *

All vector operations use {@link ByteBuffer#allocateDirect(int)} buffers to achieve zero-copy + * data transfer between Java and native code. This eliminates memory duplication and improves + * performance for large-scale vector operations. + */ +final class FaissNative { + + static { + try { + NativeLibraryLoader.load(); + } catch (FaissException e) { + throw new ExceptionInInitializerError(e); + } + } + + /** + * Create an index using an index factory string. + * + * @param dimension the dimension of the vectors + * @param description the index description string (e.g., "Flat", "IVF100,Flat", "HNSW32") + * @param metricType the metric type (0 = L2, 1 = Inner Product) + * @return the native handle of the created index + */ + static native long indexFactoryCreate(int dimension, String description, int metricType); + + /** + * Destroy an index and free its resources. + * + * @param handle the native handle of the index + */ + static native void indexDestroy(long handle); + + /** + * Get the dimension of an index. + * + * @param handle the native handle of the index + * @return the dimension + */ + static native int indexGetDimension(long handle); + + /** + * Get the number of vectors in an index. + * + * @param handle the native handle of the index + * @return the number of vectors + */ + static native long indexGetCount(long handle); + + /** + * Check if an index is trained. + * + * @param handle the native handle of the index + * @return true if trained + */ + static native boolean indexIsTrained(long handle); + + /** + * Get the metric type of an index. + * + * @param handle the native handle of the index + * @return the metric type (0 = L2, 1 = Inner Product) + */ + static native int indexGetMetricType(long handle); + + /** + * Reset an index (remove all vectors). + * + * @param handle the native handle of the index + */ + static native void indexReset(long handle); + + /** + * Write an index to a file. + * + * @param handle the native handle of the index + * @param path the file path to write to + */ + static native void indexWriteToFile(long handle, String path); + + /** + * Read an index from a file. + * + * @param path the file path to read from + * @return the native handle of the loaded index + */ + static native long indexReadFromFile(String path); + + /** + * Add vectors to an index using a direct ByteBuffer (zero-copy). + * + *

The buffer must be a direct buffer created via {@link ByteBuffer#allocateDirect(int)} and + * must contain n * dimension floats in native byte order. + * + * @param handle the native handle of the index + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension * 4 bytes) + */ + static native void indexAdd(long handle, long n, ByteBuffer vectorBuffer); + + /** + * Add vectors with IDs to an index using direct ByteBuffers (zero-copy). + * + * @param handle the native handle of the index + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension * 4 bytes) + * @param idBuffer direct ByteBuffer containing IDs (n * 8 bytes) + */ + static native void indexAddWithIds( + long handle, long n, ByteBuffer vectorBuffer, ByteBuffer idBuffer); + + /** + * Search for the k nearest neighbors. + * + * @param handle the native handle of the index + * @param n the number of query vectors + * @param queryVectors the query vectors (n * dimension floats) + * @param k the number of nearest neighbors to find + * @param distances output array for distances (n * k floats) + * @param labels output array for labels (n * k longs) + */ + static native void indexSearch( + long handle, long n, float[] queryVectors, int k, float[] distances, long[] labels); + + /** + * Train an index using a direct ByteBuffer (zero-copy). + * + * @param handle the native handle of the index + * @param n the number of training vectors + * @param vectorBuffer direct ByteBuffer containing training vectors (n * dimension * 4 bytes) + */ + static native void indexTrain(long handle, long n, ByteBuffer vectorBuffer); + + /** + * Search for neighbors within a given radius using direct ByteBuffer (zero-copy). + * + * @param handle the native handle of the index + * @param n the number of query vectors + * @param queryBuffer direct ByteBuffer containing query vectors (n * dimension * 4 bytes) + * @param radius the search radius + * @return a range search result handle + */ + static native long indexRangeSearch(long handle, long n, ByteBuffer queryBuffer, float radius); + + /** + * Serialize an index to a direct ByteBuffer (zero-copy). + * + *

Returns the number of bytes written. The caller must provide a buffer large enough to hold + * the serialized index. Use {@link #indexSerializeSize(long)} to get the required size. + * + * @param handle the native handle of the index + * @param buffer direct ByteBuffer to write the serialized index to + * @return the number of bytes written + */ + static native long indexSerialize(long handle, ByteBuffer buffer); + + /** + * Get the size in bytes needed to serialize an index. + * + * @param handle the native handle of the index + * @return the size in bytes + */ + static native long indexSerializeSize(long handle); + + /** + * Deserialize an index from a byte array. + * + * @param data the serialized index data + * @param length the number of bytes to read + * @return the native handle of the loaded index + */ + static native long indexDeserialize(byte[] data, long length); + + /** + * Destroy a range search result. + * + * @param handle the native handle of the range search result + */ + static native void rangeSearchResultDestroy(long handle); + + /** + * Get the number of results for each query in a range search. + * + * @param handle the native handle of the range search result + * @param limitsBuffer direct ByteBuffer for limits output ((nq + 1) * 8 bytes) + */ + static native void rangeSearchResultGetLimits(long handle, ByteBuffer limitsBuffer); + + /** + * Get the total number of results in a range search. + * + * @param handle the native handle of the range search result + * @return the total number of results + */ + static native long rangeSearchResultGetTotalSize(long handle); + + /** + * Get all labels from a range search result. + * + * @param handle the native handle of the range search result + * @param labelsBuffer direct ByteBuffer for labels output + */ + static native void rangeSearchResultGetLabels(long handle, ByteBuffer labelsBuffer); + + /** + * Get all distances from a range search result. + * + * @param handle the native handle of the range search result + * @param distancesBuffer direct ByteBuffer for distances output + */ + static native void rangeSearchResultGetDistances(long handle, ByteBuffer distancesBuffer); + + /** + * Get the number of queries in a range search result. + * + * @param handle the native handle of the range search result + * @return the number of queries + */ + static native int rangeSearchResultGetNumQueries(long handle); + + /** + * Get the number of probe lists for an IVF index. + * + * @param handle the native handle of the index + * @return the number of probe lists (nprobe) + */ + static native int ivfGetNprobe(long handle); + + /** + * Set the number of probe lists for an IVF index. + * + * @param handle the native handle of the index + * @param nprobe the number of probe lists + */ + static native void ivfSetNprobe(long handle, int nprobe); + + /** + * Get the number of lists (clusters) in an IVF index. + * + * @param handle the native handle of the index + * @return the number of lists + */ + static native int ivfGetNlist(long handle); + + /** + * Get the efSearch parameter of an HNSW index. + * + * @param handle the native handle of the index + * @return the efSearch value + */ + static native int hnswGetEfSearch(long handle); + + /** + * Set the efSearch parameter of an HNSW index. + * + * @param handle the native handle of the index + * @param efSearch the efSearch value + */ + static native void hnswSetEfSearch(long handle, int efSearch); + + /** + * Get the efConstruction parameter of an HNSW index. + * + * @param handle the native handle of the index + * @return the efConstruction value + */ + static native int hnswGetEfConstruction(long handle); + + /** + * Set the efConstruction parameter of an HNSW index. + * + *

This must be set before adding any vectors to the index. + * + * @param handle the native handle of the index + * @param efConstruction the efConstruction value + */ + static native void hnswSetEfConstruction(long handle, int efConstruction); + + /** + * Get the Faiss library version. + * + * @return the version string + */ + static native String getVersion(); + + /** + * Set the number of threads for parallel operations. + * + * @param numThreads the number of threads + */ + static native void setNumThreads(int numThreads); + + /** + * Get the number of threads for parallel operations. + * + * @return the number of threads + */ + static native int getNumThreads(); +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java new file mode 100644 index 000000000000..0ccf0a9f0d1c --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java @@ -0,0 +1,354 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * A Faiss index for similarity search with zero-copy support. + * + *

This class wraps a native Faiss index and provides methods for adding vectors, searching for + * nearest neighbors, and managing the index. All vector operations use direct ByteBuffers for + * zero-copy data transfer between Java and native code. + * + *

Thread Safety: Index instances are NOT thread-safe. External synchronization is required if an + * index is accessed from multiple threads. + * + * @see IndexFactory + */ +public class Index implements AutoCloseable { + + /** Native handle to the Faiss index. */ + private long nativeHandle; + + /** The dimension of vectors in this index. */ + private final int dimension; + + /** Whether this index has been closed. */ + private volatile boolean closed = false; + + /** + * Create an Index wrapper around a native handle. + * + * @param nativeHandle the native handle + * @param dimension the vector dimension + */ + Index(long nativeHandle, int dimension) { + this.nativeHandle = nativeHandle; + this.dimension = dimension; + } + + /** + * Get the dimension of vectors in this index. + * + * @return the vector dimension + */ + public int getDimension() { + return dimension; + } + + /** + * Get the number of vectors in this index. + * + * @return the number of vectors + */ + public long getCount() { + checkNotClosed(); + return FaissNative.indexGetCount(nativeHandle); + } + + /** + * Check if this index is trained. + * + *

Some index types (like IVF) require training before vectors can be added. Flat indexes are + * always considered trained. + * + * @return true if the index is trained + */ + public boolean isTrained() { + checkNotClosed(); + return FaissNative.indexIsTrained(nativeHandle); + } + + /** + * Get the metric type used by this index. + * + * @return the metric type + */ + public MetricType getMetricType() { + checkNotClosed(); + return MetricType.fromValue(FaissNative.indexGetMetricType(nativeHandle)); + } + + /** + * Train the index on a set of training vectors (zero-copy). + * + *

This is required for some index types (like IVF) before adding vectors. For flat indexes, + * this is a no-op. + * + * @param n the number of training vectors + * @param vectorBuffer direct ByteBuffer containing training vectors (n * dimension floats) + */ + public void train(long n, ByteBuffer vectorBuffer) { + checkNotClosed(); + validateDirectBuffer(vectorBuffer, n * dimension * Float.BYTES, "vector"); + FaissNative.indexTrain(nativeHandle, n, vectorBuffer); + } + + /** + * Add vectors to the index (zero-copy). + * + *

The vectors are assigned sequential IDs starting from the current count. + * + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension floats) + */ + public void add(long n, ByteBuffer vectorBuffer) { + checkNotClosed(); + validateDirectBuffer(vectorBuffer, n * dimension * Float.BYTES, "vector"); + FaissNative.indexAdd(nativeHandle, n, vectorBuffer); + } + + /** + * Add vectors with explicit IDs to the index (zero-copy). + * + *

Note: Not all index types support this operation. Flat indexes and IndexIDMap wrapped + * indexes support it. + * + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension floats) + * @param idBuffer direct ByteBuffer containing IDs (n longs) + */ + public void addWithIds(long n, ByteBuffer vectorBuffer, ByteBuffer idBuffer) { + checkNotClosed(); + validateDirectBuffer(vectorBuffer, n * dimension * Float.BYTES, "vector"); + validateDirectBuffer(idBuffer, n * Long.BYTES, "id"); + FaissNative.indexAddWithIds(nativeHandle, n, vectorBuffer, idBuffer); + } + + /** + * Search for the k nearest neighbors of query vectors. + * + * @param n the number of query vectors + * @param queryVectors array containing query vectors (n * dimension floats) + * @param k the number of nearest neighbors to find + * @param distances output array for distances (n * k floats) + * @param labels output array for labels (n * k longs) + */ + public void search(long n, float[] queryVectors, int k, float[] distances, long[] labels) { + checkNotClosed(); + if (queryVectors.length < n * dimension) { + throw new IllegalArgumentException( + "Query vectors array too small: required " + + (n * dimension) + + ", got " + + queryVectors.length); + } + if (distances.length < n * k) { + throw new IllegalArgumentException( + "Distances array too small: required " + (n * k) + ", got " + distances.length); + } + if (labels.length < n * k) { + throw new IllegalArgumentException( + "Labels array too small: required " + (n * k) + ", got " + labels.length); + } + FaissNative.indexSearch(nativeHandle, n, queryVectors, k, distances, labels); + } + + /** + * Search for all neighbors within a given radius (zero-copy). + * + * @param n the number of query vectors + * @param queryBuffer direct ByteBuffer containing query vectors (n * dimension floats) + * @param radius the search radius + * @return the range search result + */ + public RangeSearchResult rangeSearch(long n, ByteBuffer queryBuffer, float radius) { + checkNotClosed(); + validateDirectBuffer(queryBuffer, n * dimension * Float.BYTES, "query"); + long resultHandle = FaissNative.indexRangeSearch(nativeHandle, n, queryBuffer, radius); + return new RangeSearchResult(resultHandle, (int) n); + } + + /** Reset the index (remove all vectors). */ + public void reset() { + checkNotClosed(); + FaissNative.indexReset(nativeHandle); + } + + /** + * Write the index to a file. + * + * @param path the file path + */ + public void writeToFile(String path) { + checkNotClosed(); + FaissNative.indexWriteToFile(nativeHandle, path); + } + + /** + * Write the index to a file. + * + * @param file the file + */ + public void writeToFile(File file) { + writeToFile(file.getAbsolutePath()); + } + + /** + * Read an index from a file. + * + * @param path the file path + * @return the loaded index + */ + public static Index readFromFile(String path) { + long handle = FaissNative.indexReadFromFile(path); + int dimension = FaissNative.indexGetDimension(handle); + return new Index(handle, dimension); + } + + /** + * Read an index from a file. + * + * @param file the file + * @return the loaded index + */ + public static Index readFromFile(File file) { + return readFromFile(file.getAbsolutePath()); + } + + /** + * Get the size in bytes needed to serialize this index. + * + * @return the serialization size + */ + public long serializeSize() { + checkNotClosed(); + return FaissNative.indexSerializeSize(nativeHandle); + } + + /** + * Serialize the index to a direct ByteBuffer (zero-copy). + * + * @param buffer direct ByteBuffer to write to (must have sufficient capacity) + * @return the number of bytes written + */ + public long serialize(ByteBuffer buffer) { + checkNotClosed(); + if (!buffer.isDirect()) { + throw new IllegalArgumentException("Buffer must be a direct buffer"); + } + return FaissNative.indexSerialize(nativeHandle, buffer); + } + + /** + * Deserialize an index from a byte array. + * + * @param data the serialized index data + * @return the deserialized index + */ + public static Index deserialize(byte[] data) { + long handle = FaissNative.indexDeserialize(data, data.length); + int dimension = FaissNative.indexGetDimension(handle); + return new Index(handle, dimension); + } + + /** + * Allocate a direct ByteBuffer suitable for vector data. + * + * @param numVectors number of vectors + * @param dimension vector dimension + * @return a direct ByteBuffer in native byte order + */ + public static ByteBuffer allocateVectorBuffer(int numVectors, int dimension) { + return ByteBuffer.allocateDirect(numVectors * dimension * Float.BYTES) + .order(ByteOrder.nativeOrder()); + } + + /** + * Allocate a direct ByteBuffer suitable for ID data. + * + * @param numIds number of IDs + * @return a direct ByteBuffer in native byte order + */ + public static ByteBuffer allocateIdBuffer(int numIds) { + return ByteBuffer.allocateDirect(numIds * Long.BYTES).order(ByteOrder.nativeOrder()); + } + + private void validateDirectBuffer(ByteBuffer buffer, long requiredBytes, String name) { + if (!buffer.isDirect()) { + throw new IllegalArgumentException(name + " buffer must be a direct buffer"); + } + if (buffer.capacity() < requiredBytes) { + throw new IllegalArgumentException( + name + + " buffer too small: required " + + requiredBytes + + " bytes, got " + + buffer.capacity()); + } + } + + /** + * Get the native handle. + * + *

This is for internal use only. + * + * @return the native handle + */ + long nativeHandle() { + return nativeHandle; + } + + private void checkNotClosed() { + if (closed) { + throw new IllegalStateException("Index has been closed"); + } + } + + @Override + public void close() { + if (!closed) { + closed = true; + if (nativeHandle != 0) { + FaissNative.indexDestroy(nativeHandle); + nativeHandle = 0; + } + } + } + + @Override + public String toString() { + if (closed) { + return "Index[closed]"; + } + return "Index{" + + "dimension=" + + dimension + + ", count=" + + getCount() + + ", trained=" + + isTrained() + + ", metricType=" + + getMetricType() + + '}'; + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java new file mode 100644 index 000000000000..09f081235a52 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** Factory for creating Faiss indexes. */ +public final class IndexFactory { + + /** + * Create a Faiss index using the index factory. + * + * @param dimension the dimension of the vectors + * @param description the index description string + * @param metricType the metric type for similarity computation + * @return the created index + */ + public static Index create(int dimension, String description, MetricType metricType) { + if (dimension <= 0) { + throw new IllegalArgumentException("Dimension must be positive: " + dimension); + } + if (description == null || description.isEmpty()) { + throw new IllegalArgumentException("Index description cannot be null or empty"); + } + if (metricType == null) { + throw new IllegalArgumentException("Metric type cannot be null"); + } + + long handle = FaissNative.indexFactoryCreate(dimension, description, metricType.getValue()); + return new Index(handle, dimension); + } + + /** + * Create a Faiss index with L2 (Euclidean) metric. + * + * @param dimension the dimension of the vectors + * @param description the index description string + * @return the created index + */ + public static Index create(int dimension, String description) { + return create(dimension, description, MetricType.L2); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java new file mode 100644 index 000000000000..1e4744f8e748 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Utility class for HNSW (Hierarchical Navigable Small World) index operations. + * + *

HNSW indexes build a graph structure for fast approximate nearest neighbor search. The key + * parameters are: + * + *

+ */ +public final class IndexHNSW { + + /** + * Get the efSearch parameter. + * + *

This controls the size of the dynamic candidate list during search. Higher values give + * more accurate results but slower search. + * + * @param index the HNSW index + * @return the current efSearch value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static int getEfSearch(Index index) { + return FaissNative.hnswGetEfSearch(index.nativeHandle()); + } + + /** + * Set the efSearch parameter. + * + *

This should be at least k (the number of neighbors requested in search). Typical values + * range from 16 to 256. Higher values give more accurate results but slower search. + * + * @param index the HNSW index + * @param efSearch the efSearch value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static void setEfSearch(Index index, int efSearch) { + if (efSearch <= 0) { + throw new IllegalArgumentException("efSearch must be positive: " + efSearch); + } + FaissNative.hnswSetEfSearch(index.nativeHandle(), efSearch); + } + + /** + * Get the efConstruction parameter. + * + *

This was the size of the dynamic candidate list during index construction. It cannot be + * changed after the index is built. + * + * @param index the HNSW index + * @return the efConstruction value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static int getEfConstruction(Index index) { + return FaissNative.hnswGetEfConstruction(index.nativeHandle()); + } + + /** + * Set the efConstruction parameter. + * + *

This controls the size of the dynamic candidate list during construction. It must be set + * before adding any vectors to the index. Higher values give more accurate results but slower + * construction. Typical values range from 40 to 400. + * + * @param index the HNSW index + * @param efConstruction the efConstruction value + * @throws IllegalArgumentException if the index is not an HNSW index or efConstruction is not + * positive + */ + public static void setEfConstruction(Index index, int efConstruction) { + if (efConstruction <= 0) { + throw new IllegalArgumentException( + "efConstruction must be positive: " + efConstruction); + } + FaissNative.hnswSetEfConstruction(index.nativeHandle(), efConstruction); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java new file mode 100644 index 000000000000..7cf98dfbd0f1 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** Utility class for IVF (Inverted File) index operations. */ +public final class IndexIVF { + + /** + * Get the number of clusters to probe during search (nprobe). + * + * @param index the IVF index + * @return the current nprobe value + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static int getNprobe(Index index) { + return FaissNative.ivfGetNprobe(index.nativeHandle()); + } + + /** + * Set the number of clusters to probe during search (nprobe). + * + *

Higher values increase accuracy but decrease search speed. A good starting point is 1-10% + * of the total number of clusters. + * + * @param index the IVF index + * @param nprobe the number of clusters to probe + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static void setNprobe(Index index, int nprobe) { + if (nprobe <= 0) { + throw new IllegalArgumentException("nprobe must be positive: " + nprobe); + } + FaissNative.ivfSetNprobe(index.nativeHandle(), nprobe); + } + + /** + * Get the total number of clusters (nlist) in the index. + * + * @param index the IVF index + * @return the number of clusters + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static int getNlist(Index index) { + return FaissNative.ivfGetNlist(index.nativeHandle()); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java new file mode 100644 index 000000000000..cd20bef07cbe --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Metric type for similarity search. + * + *

Faiss supports two main metric types for measuring similarity between vectors: + * + *

+ */ +public enum MetricType { + L2(0), + INNER_PRODUCT(1); + + private final int value; + + MetricType(int value) { + this.value = value; + } + + /** + * Get the numeric value of this metric type. + * + * @return the numeric value + */ + public int getValue() { + return value; + } + + /** + * Get a MetricType from its numeric value. + * + * @param value the numeric value + * @return the corresponding MetricType + * @throws IllegalArgumentException if the value is not valid + */ + public static MetricType fromValue(int value) { + for (MetricType type : values()) { + if (type.value == value) { + return type; + } + } + throw new IllegalArgumentException("Unknown metric type value: " + value); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/NativeLibraryLoader.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/NativeLibraryLoader.java new file mode 100644 index 000000000000..58c8806378c2 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/NativeLibraryLoader.java @@ -0,0 +1,306 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Native library loader for Faiss JNI. + * + *

This class is responsible for loading the native Faiss library from the JAR file or system + * path. It follows a similar pattern to RocksDB's native library loading mechanism. + * + *

The loader attempts to load the library in the following order: + * + *

    + *
  1. From the path specified by the {@code paimon.faiss.lib.path} system property + *
  2. From the system library path using {@code System.loadLibrary} + *
  3. From the JAR file bundled with the distribution + *
+ */ +public class NativeLibraryLoader { + private static final Logger LOG = LoggerFactory.getLogger(NativeLibraryLoader.class); + + /** The name of the native library. */ + private static final String JNI_LIBRARY_NAME = "paimon_faiss_jni"; + + /** System property to specify a custom path to the native library. */ + private static final String LIBRARY_PATH_PROPERTY = "paimon.faiss.lib.path"; + + /** + * Dependency libraries that need to be loaded before the main JNI library. These are bundled in + * the JAR when the main library cannot be statically linked. + * + *

Order matters! Libraries must be loaded before the libraries that depend on them. + */ + private static final String[] DEPENDENCY_LIBRARIES = { + // GCC runtime libraries (must be loaded first as others depend on them) + "libgcc_s.so.1", + // Quadmath library (needed by gfortran) + "libquadmath.so.0", + // Fortran runtime (needed by OpenBLAS) - try multiple versions + "libgfortran.so.5", + "libgfortran.so.4", + "libgfortran.so.3", + // OpenMP runtime + "libgomp.so.1", + // BLAS/LAPACK + "libblas.so.3", + "liblapack.so.3", + // OpenBLAS for FAISS (load last as it depends on above) + "libopenblas.so.0", + }; + + /** Whether the native library has been loaded. */ + private static volatile boolean libraryLoaded = false; + + /** Lock for thread-safe library loading. */ + private static final Object LOAD_LOCK = new Object(); + + /** Temporary directory for extracting native libraries. */ + private static Path tempDir; + + private NativeLibraryLoader() { + // Utility class, no instantiation + } + + /** + * Load the native library. + * + * @throws FaissException if the library cannot be loaded + */ + public static void load() throws FaissException { + if (libraryLoaded) { + return; + } + + synchronized (LOAD_LOCK) { + if (libraryLoaded) { + return; + } + + try { + loadNativeLibrary(); + libraryLoaded = true; + LOG.info("Faiss native library loaded successfully"); + } catch (Exception e) { + throw new FaissException("Failed to load Faiss native library", e); + } + } + } + + /** + * Check if the native library has been loaded. + * + * @return true if the library is loaded + */ + public static boolean isLoaded() { + return libraryLoaded; + } + + private static void loadNativeLibrary() throws IOException { + // First, try loading from custom path + String customPath = System.getProperty(LIBRARY_PATH_PROPERTY); + if (customPath != null && !customPath.isEmpty()) { + File customLibrary = new File(customPath); + if (customLibrary.exists()) { + System.load(customLibrary.getAbsolutePath()); + LOG.info("Loaded Faiss native library from custom path: {}", customPath); + return; + } else { + LOG.warn("Custom library path specified but file not found: {}", customPath); + } + } + + // Second, try loading from system library path + try { + System.loadLibrary(JNI_LIBRARY_NAME); + LOG.info("Loaded Faiss native library from system path"); + return; + } catch (UnsatisfiedLinkError e) { + LOG.debug( + "Could not load from system path, trying bundled library: {}", e.getMessage()); + } + + // Third, try loading from JAR + loadFromJar(); + } + + private static void loadFromJar() throws IOException { + String libraryPath = getLibraryResourcePath(); + LOG.debug("Attempting to load native library from JAR: {}", libraryPath); + + try (InputStream is = NativeLibraryLoader.class.getResourceAsStream(libraryPath)) { + if (is == null) { + throw new IOException( + "Native library not found in JAR: " + + libraryPath + + ". " + + "Make sure you are using the correct JAR for your platform (" + + getPlatformIdentifier() + + ")"); + } + + // Create temp directory if needed + if (tempDir == null) { + tempDir = Files.createTempDirectory("paimon-faiss-native"); + tempDir.toFile().deleteOnExit(); + } + + // First, extract and load dependency libraries (if bundled) + loadDependencyLibraries(); + + // Extract native library to temp file + String fileName = System.mapLibraryName(JNI_LIBRARY_NAME); + File tempFile = new File(tempDir.toFile(), fileName); + tempFile.deleteOnExit(); + + try (OutputStream os = new FileOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = is.read(buffer)) != -1) { + os.write(buffer, 0, bytesRead); + } + } + + // Make the file executable (for Unix-like systems) + if (!tempFile.setExecutable(true)) { + LOG.warn("Could not set executable permission on native library"); + } + + // Load the library + System.load(tempFile.getAbsolutePath()); + LOG.info("Loaded Faiss native library from JAR: {}", libraryPath); + } + } + + /** + * Extract and load dependency libraries that are bundled in the JAR. These must be loaded + * before the main JNI library to satisfy its dynamic linking requirements. + */ + private static void loadDependencyLibraries() { + String os = getOsName(); + String arch = getArchName(); + + for (String depLib : DEPENDENCY_LIBRARIES) { + String resourcePath = "/" + os + "/" + arch + "/" + depLib; + try (InputStream is = NativeLibraryLoader.class.getResourceAsStream(resourcePath)) { + if (is == null) { + LOG.debug("Dependency library not bundled: {}", depLib); + continue; + } + + File tempFile = new File(tempDir.toFile(), depLib); + tempFile.deleteOnExit(); + + try (OutputStream fos = new FileOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = is.read(buffer)) != -1) { + fos.write(buffer, 0, bytesRead); + } + } + + if (!tempFile.setExecutable(true)) { + LOG.warn("Could not set executable permission on: {}", depLib); + } + + // Load the dependency library + System.load(tempFile.getAbsolutePath()); + LOG.info("Loaded bundled dependency library: {}", depLib); + } catch (UnsatisfiedLinkError e) { + // Library might already be loaded or not needed + LOG.debug("Could not load dependency {}: {}", depLib, e.getMessage()); + } catch (IOException e) { + LOG.debug("Could not extract dependency {}: {}", depLib, e.getMessage()); + } + } + } + + private static String getLibraryResourcePath() { + String os = getOsName(); + String arch = getArchName(); + String libraryFileName = System.mapLibraryName(JNI_LIBRARY_NAME); + return "/" + os + "/" + arch + "/" + libraryFileName; + } + + /** + * Get the platform identifier for the current system. + * + * @return platform identifier string (e.g., "linux/amd64", "darwin/aarch64") + */ + static String getPlatformIdentifier() { + return getOsName() + "/" + getArchName(); + } + + /** + * Get the normalized OS name for the current system. + * + * @return OS name string (e.g., "linux", "darwin") + */ + private static String getOsName() { + String osName = System.getProperty("os.name").toLowerCase(); + + if (osName.contains("linux")) { + return "linux"; + } else if (osName.contains("mac") || osName.contains("darwin")) { + return "darwin"; + } else { + throw new UnsupportedOperationException( + "Unsupported operating system: " + + osName + + ". Only Linux and macOS are supported."); + } + } + + /** + * Get the normalized architecture name for the current system. + * + * @return architecture name string (e.g., "amd64", "aarch64") + */ + private static String getArchName() { + String osArch = System.getProperty("os.arch").toLowerCase(); + + if (osArch.equals("amd64") || osArch.equals("x86_64")) { + return "amd64"; + } else if (osArch.equals("aarch64") || osArch.equals("arm64")) { + return "aarch64"; + } else { + throw new UnsupportedOperationException("Unsupported architecture: " + osArch); + } + } + + /** + * Get the name of the JNI library. + * + * @return the library name + */ + public static String getLibraryName() { + return JNI_LIBRARY_NAME; + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java new file mode 100644 index 000000000000..8cbecf568fa2 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.nio.LongBuffer; + +/** + * Result of a range search operation with zero-copy support. + * + *

Unlike k-NN search which returns a fixed number of neighbors per query, range search returns + * all neighbors within a given radius, which can vary per query. + * + *

This class uses direct ByteBuffers to avoid memory copies when retrieving results from native + * code. + */ +public class RangeSearchResult implements AutoCloseable { + + private long nativeHandle; + private final int numQueries; + private ByteBuffer limitsBuffer; + private ByteBuffer labelsBuffer; + private ByteBuffer distancesBuffer; + private long totalSize = -1; + + /** + * Create a new RangeSearchResult from a native handle. + * + * @param nativeHandle the native handle + * @param numQueries the number of query vectors + */ + RangeSearchResult(long nativeHandle, int numQueries) { + this.nativeHandle = nativeHandle; + this.numQueries = numQueries; + } + + /** + * Get the number of query vectors. + * + * @return the number of queries + */ + public int getNumQueries() { + return numQueries; + } + + /** + * Get the number of results for a specific query. + * + * @param queryIndex the query index + * @return the number of results + */ + public long getResultCount(int queryIndex) { + ensureLimitsLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + LongBuffer limits = limitsBuffer.asLongBuffer(); + return limits.get(queryIndex + 1) - limits.get(queryIndex); + } + + /** + * Get the total number of results across all queries. + * + * @return the total number of results + */ + public long getTotalResultCount() { + if (totalSize < 0 && nativeHandle != 0) { + totalSize = FaissNative.rangeSearchResultGetTotalSize(nativeHandle); + } + return totalSize; + } + + /** + * Get the limits buffer containing the start/end indices for each query. + * + *

The limits buffer has (numQueries + 1) longs. For query i, results are in the range + * [limits[i], limits[i+1]). + * + * @return the limits buffer as a LongBuffer view + */ + public LongBuffer getLimitsBuffer() { + ensureLimitsLoaded(); + limitsBuffer.rewind(); + return limitsBuffer.asLongBuffer(); + } + + /** + * Get the labels buffer containing all result labels. + * + * @return the labels buffer as a LongBuffer view + */ + public LongBuffer getLabelsBuffer() { + ensureFullyLoaded(); + labelsBuffer.rewind(); + return labelsBuffer.asLongBuffer(); + } + + /** + * Get the distances buffer containing all result distances. + * + * @return the distances buffer as a FloatBuffer view + */ + public FloatBuffer getDistancesBuffer() { + ensureFullyLoaded(); + distancesBuffer.rewind(); + return distancesBuffer.asFloatBuffer(); + } + + /** + * Get the labels for a specific query. + * + * @param queryIndex the query index + * @return the labels for this query + */ + public long[] getLabelsForQuery(int queryIndex) { + ensureFullyLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + LongBuffer limits = limitsBuffer.asLongBuffer(); + int start = (int) limits.get(queryIndex); + int end = (int) limits.get(queryIndex + 1); + int count = end - start; + + long[] result = new long[count]; + LongBuffer labels = labelsBuffer.asLongBuffer(); + labels.position(start); + labels.get(result); + return result; + } + + /** + * Get the distances for a specific query. + * + * @param queryIndex the query index + * @return the distances for this query + */ + public float[] getDistancesForQuery(int queryIndex) { + ensureFullyLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + LongBuffer limits = limitsBuffer.asLongBuffer(); + int start = (int) limits.get(queryIndex); + int end = (int) limits.get(queryIndex + 1); + int count = end - start; + + float[] result = new float[count]; + FloatBuffer distances = distancesBuffer.asFloatBuffer(); + distances.position(start); + distances.get(result); + return result; + } + + private void ensureLimitsLoaded() { + if (limitsBuffer == null && nativeHandle != 0) { + limitsBuffer = + ByteBuffer.allocateDirect((numQueries + 1) * Long.BYTES) + .order(ByteOrder.nativeOrder()); + FaissNative.rangeSearchResultGetLimits(nativeHandle, limitsBuffer); + } + } + + private void ensureFullyLoaded() { + ensureLimitsLoaded(); + if (labelsBuffer == null && nativeHandle != 0) { + long total = getTotalResultCount(); + labelsBuffer = + ByteBuffer.allocateDirect((int) total * Long.BYTES) + .order(ByteOrder.nativeOrder()); + distancesBuffer = + ByteBuffer.allocateDirect((int) total * Float.BYTES) + .order(ByteOrder.nativeOrder()); + FaissNative.rangeSearchResultGetLabels(nativeHandle, labelsBuffer); + FaissNative.rangeSearchResultGetDistances(nativeHandle, distancesBuffer); + } + } + + @Override + public void close() { + if (nativeHandle != 0) { + FaissNative.rangeSearchResultDestroy(nativeHandle); + nativeHandle = 0; + } + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/native/CMakeLists.txt b/paimon-faiss/paimon-faiss-jni/src/main/native/CMakeLists.txt new file mode 100644 index 000000000000..b7f3c8713975 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/native/CMakeLists.txt @@ -0,0 +1,457 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.30.1) +project(paimon_faiss_jni VERSION 0.1.0 LANGUAGES CXX) + +# FAISS version +set(FAISS_VERSION "1.7.4" CACHE STRING "FAISS library version") +add_definitions(-DFAISS_VERSION="${FAISS_VERSION}") + +# Check GCC version (must be >= 9.3.0) +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.3.0") + message(FATAL_ERROR "GCC version must be >= 9.3.0. Found: ${CMAKE_CXX_COMPILER_VERSION}") + endif() + message(STATUS "Using GCC ${CMAKE_CXX_COMPILER_VERSION}") +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Options +option(FAISS_ENABLE_GPU "Build with GPU support" OFF) +option(FAISS_OPT_LEVEL "Optimization level (generic, avx2, avx512)" "generic") +option(BUILD_FAT_LIB "Build fat library with all dependencies statically linked" ON) + +# Find JNI +find_package(JNI REQUIRED) +include_directories(${JNI_INCLUDE_DIRS}) + +# Find OpenMP (with special handling for macOS) +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + # macOS requires special handling for OpenMP + # First try to find libomp from Homebrew + execute_process( + COMMAND brew --prefix libomp + OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + + if(HOMEBREW_LIBOMP_PREFIX) + message(STATUS "Found Homebrew libomp: ${HOMEBREW_LIBOMP_PREFIX}") + set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include") + set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include") + set(OpenMP_C_LIB_NAMES "omp") + set(OpenMP_CXX_LIB_NAMES "omp") + set(OpenMP_omp_LIBRARY "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib") + + # Create imported target manually + if(NOT TARGET OpenMP::OpenMP_CXX) + add_library(OpenMP::OpenMP_CXX SHARED IMPORTED) + set_target_properties(OpenMP::OpenMP_CXX PROPERTIES + IMPORTED_LOCATION "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib" + INTERFACE_INCLUDE_DIRECTORIES "${HOMEBREW_LIBOMP_PREFIX}/include" + INTERFACE_COMPILE_OPTIONS "-Xpreprocessor;-fopenmp" + ) + endif() + set(OpenMP_FOUND TRUE) + else() + message(WARNING "libomp not found via Homebrew. Trying standard OpenMP detection...") + find_package(OpenMP) + endif() +else() + find_package(OpenMP REQUIRED) +endif() + +if(NOT OpenMP_FOUND AND NOT TARGET OpenMP::OpenMP_CXX) + message(WARNING "OpenMP not found. Building without OpenMP support.") + message(WARNING "On macOS, install libomp: brew install libomp") +endif() + +# Find Faiss +# For fat lib, prefer static libraries +if(BUILD_FAT_LIB) + message(STATUS "Building fat library - preferring static libraries") + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a" ".so" ".dylib") + set(FAISS_STATIC_PREFERRED TRUE) +else() + set(FAISS_STATIC_PREFERRED FALSE) +endif() + +# First try to find Faiss via CMake config +find_package(faiss CONFIG QUIET) + +if(NOT faiss_FOUND) + # Try pkg-config + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + pkg_check_modules(FAISS QUIET faiss) + endif() + + if(NOT FAISS_FOUND) + # Manual search - look in common locations + find_path(FAISS_INCLUDE_DIR + NAMES faiss/Index.h + PATHS + /usr/local/include + /usr/include + ${FAISS_ROOT}/include + $ENV{FAISS_ROOT}/include + ) + + # For fat lib, try to find static library first + if(BUILD_FAT_LIB) + find_library(FAISS_LIBRARY_STATIC + NAMES libfaiss.a faiss_static + PATHS + /usr/local/lib + /usr/lib + /usr/local/lib64 + /usr/lib64 + ${FAISS_ROOT}/lib + ${FAISS_ROOT}/lib64 + $ENV{FAISS_ROOT}/lib + $ENV{FAISS_ROOT}/lib64 + ) + if(FAISS_LIBRARY_STATIC) + set(FAISS_LIBRARY ${FAISS_LIBRARY_STATIC}) + message(STATUS "Found Faiss static library: ${FAISS_LIBRARY}") + endif() + endif() + + # If static not found or not building fat lib, find any library + if(NOT FAISS_LIBRARY) + find_library(FAISS_LIBRARY + NAMES faiss + PATHS + /usr/local/lib + /usr/lib + /usr/local/lib64 + /usr/lib64 + ${FAISS_ROOT}/lib + ${FAISS_ROOT}/lib64 + $ENV{FAISS_ROOT}/lib + $ENV{FAISS_ROOT}/lib64 + ) + endif() + + if(FAISS_INCLUDE_DIR AND FAISS_LIBRARY) + set(FAISS_FOUND TRUE) + set(FAISS_INCLUDE_DIRS ${FAISS_INCLUDE_DIR}) + set(FAISS_LIBRARIES ${FAISS_LIBRARY}) + message(STATUS "Found Faiss: ${FAISS_LIBRARY}") + else() + message(FATAL_ERROR "Faiss not found. Please install Faiss or set FAISS_ROOT environment variable.") + endif() + endif() +endif() + +# Find BLAS/LAPACK for static linking (Faiss depends on them) +if(BUILD_FAT_LIB) + # Save original suffixes + set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + + # Force static library search only + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") + + # Try to find OpenBLAS static library + find_library(OPENBLAS_STATIC_LIBRARY + NAMES openblas openblas_static + PATHS + /usr/local/lib + /usr/lib + /usr/local/lib64 + /usr/lib64 + /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu + ${OPENBLAS_ROOT}/lib + $ENV{OPENBLAS_ROOT}/lib + NO_DEFAULT_PATH + ) + + # Also try default paths + if(NOT OPENBLAS_STATIC_LIBRARY) + find_library(OPENBLAS_STATIC_LIBRARY + NAMES openblas openblas_static + ) + endif() + + if(OPENBLAS_STATIC_LIBRARY AND OPENBLAS_STATIC_LIBRARY MATCHES "\\.a$") + message(STATUS "Found OpenBLAS static library: ${OPENBLAS_STATIC_LIBRARY}") + set(OPENBLAS_USE_STATIC TRUE) + list(APPEND FAISS_STATIC_LIBS ${OPENBLAS_STATIC_LIBRARY}) + else() + message(STATUS "OpenBLAS static library not found, trying shared library") + set(OPENBLAS_USE_STATIC FALSE) + + # Restore suffixes and find shared library + set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) + find_library(OPENBLAS_SHARED_LIBRARY + NAMES openblas + PATHS + /usr/local/lib + /usr/lib + /usr/local/lib64 + /usr/lib64 + /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu + ${OPENBLAS_ROOT}/lib + $ENV{OPENBLAS_ROOT}/lib + ) + if(OPENBLAS_SHARED_LIBRARY) + message(STATUS "Found OpenBLAS shared library: ${OPENBLAS_SHARED_LIBRARY}") + list(APPEND FAISS_EXTRA_LIBS ${OPENBLAS_SHARED_LIBRARY}) + # Mark that we need to bundle this library + set(BUNDLE_OPENBLAS TRUE) + set(BUNDLE_OPENBLAS_PATH ${OPENBLAS_SHARED_LIBRARY}) + else() + # Try to find any BLAS + find_package(BLAS QUIET) + if(BLAS_FOUND) + list(APPEND FAISS_EXTRA_LIBS ${BLAS_LIBRARIES}) + message(STATUS "Found BLAS: ${BLAS_LIBRARIES}") + endif() + endif() + endif() + + # Restore suffixes for static search + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") + + # Find LAPACK static library + find_library(LAPACK_STATIC_LIBRARY + NAMES lapack + PATHS + /usr/local/lib + /usr/lib + /usr/local/lib64 + /usr/lib64 + /usr/lib/x86_64-linux-gnu + /usr/lib/aarch64-linux-gnu + ) + if(LAPACK_STATIC_LIBRARY AND LAPACK_STATIC_LIBRARY MATCHES "\\.a$") + message(STATUS "Found LAPACK static library: ${LAPACK_STATIC_LIBRARY}") + list(APPEND FAISS_STATIC_LIBS ${LAPACK_STATIC_LIBRARY}) + endif() + + # Find gfortran static library (needed by OpenBLAS) + find_library(GFORTRAN_STATIC_LIBRARY + NAMES gfortran + ) + if(GFORTRAN_STATIC_LIBRARY AND GFORTRAN_STATIC_LIBRARY MATCHES "\\.a$") + message(STATUS "Found gfortran static library: ${GFORTRAN_STATIC_LIBRARY}") + list(APPEND FAISS_STATIC_LIBS ${GFORTRAN_STATIC_LIBRARY}) + endif() + + # Restore original suffixes + set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) + + # On Linux, we may need pthread, dl, and m (these are typically dynamically linked) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + list(APPEND FAISS_EXTRA_LIBS pthread dl m) + + # Try to find gfortran shared if static not found + if(NOT GFORTRAN_STATIC_LIBRARY) + find_library(GFORTRAN_LIBRARY gfortran) + if(GFORTRAN_LIBRARY) + list(APPEND FAISS_EXTRA_LIBS ${GFORTRAN_LIBRARY}) + endif() + endif() + endif() +endif() + +# Platform detection - using {os}/{arch} directory structure +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(PLATFORM_OS "linux") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + set(PLATFORM_ARCH "amd64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + set(PLATFORM_ARCH "aarch64") + else() + message(FATAL_ERROR "Unsupported Linux architecture: ${CMAKE_SYSTEM_PROCESSOR}") + endif() +elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(PLATFORM_OS "darwin") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + set(PLATFORM_ARCH "amd64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + set(PLATFORM_ARCH "aarch64") + else() + message(FATAL_ERROR "Unsupported macOS architecture: ${CMAKE_SYSTEM_PROCESSOR}") + endif() +else() + message(FATAL_ERROR "Unsupported operating system: ${CMAKE_SYSTEM_NAME}. Only Linux and macOS are supported.") +endif() + +set(PLATFORM_DIR "${PLATFORM_OS}/${PLATFORM_ARCH}") +message(STATUS "Building for platform: ${PLATFORM_DIR}") + +# Build the JNI library +add_library(paimon_faiss_jni SHARED + paimon_faiss_jni.cpp +) + +# Include directories +if(TARGET faiss) + target_link_libraries(paimon_faiss_jni PRIVATE faiss) +else() + target_include_directories(paimon_faiss_jni PRIVATE ${FAISS_INCLUDE_DIRS}) + target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_LIBRARIES}) +endif() + +# Link extra libraries for fat lib (BLAS, LAPACK, etc.) +if(BUILD_FAT_LIB) + # Link static libraries with --whole-archive to embed all symbols + if(FAISS_STATIC_LIBS AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + message(STATUS "Linking static libraries with --whole-archive: ${FAISS_STATIC_LIBS}") + target_link_options(paimon_faiss_jni PRIVATE + "-Wl,--whole-archive" + ) + target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_STATIC_LIBS}) + target_link_options(paimon_faiss_jni PRIVATE + "-Wl,--no-whole-archive" + ) + elseif(FAISS_STATIC_LIBS) + # macOS doesn't use --whole-archive, use -force_load instead + foreach(static_lib ${FAISS_STATIC_LIBS}) + target_link_options(paimon_faiss_jni PRIVATE "-Wl,-force_load,${static_lib}") + endforeach() + message(STATUS "Linking static libraries with -force_load: ${FAISS_STATIC_LIBS}") + endif() + + # Link remaining shared libraries + if(FAISS_EXTRA_LIBS) + target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_EXTRA_LIBS}) + message(STATUS "Linking extra libraries: ${FAISS_EXTRA_LIBS}") + endif() +endif() + +# Link OpenMP - always use dynamic linking for OpenMP (static libgomp.a often lacks -fPIC) +if(TARGET OpenMP::OpenMP_CXX) + target_link_libraries(paimon_faiss_jni PRIVATE OpenMP::OpenMP_CXX) + message(STATUS "Linking OpenMP via imported target") +elseif(OpenMP_FOUND) + target_compile_options(paimon_faiss_jni PRIVATE ${OpenMP_CXX_FLAGS}) + # Link against the shared gomp library + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_library(GOMP_SHARED_LIBRARY NAMES gomp PATHS /usr/lib /usr/lib64 /usr/lib/x86_64-linux-gnu) + if(GOMP_SHARED_LIBRARY) + target_link_libraries(paimon_faiss_jni PRIVATE ${GOMP_SHARED_LIBRARY}) + message(STATUS "Linking OpenMP shared library: ${GOMP_SHARED_LIBRARY}") + else() + target_link_libraries(paimon_faiss_jni PRIVATE gomp) + message(STATUS "Linking OpenMP: gomp") + endif() + else() + target_link_libraries(paimon_faiss_jni PRIVATE ${OpenMP_CXX_FLAGS}) + endif() +endif() + +# Platform-specific settings +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + # macOS specific settings + set_target_properties(paimon_faiss_jni PROPERTIES + SUFFIX ".dylib" + INSTALL_NAME_DIR "@rpath" + BUILD_WITH_INSTALL_RPATH TRUE + ) + + # Link against libc++ + target_link_libraries(paimon_faiss_jni PRIVATE c++) + + # For fat lib on macOS, embed OpenMP library path + if(BUILD_FAT_LIB AND HOMEBREW_LIBOMP_PREFIX) + target_link_options(paimon_faiss_jni PRIVATE + "-Wl,-rpath,@loader_path" + "-Wl,-rpath,${HOMEBREW_LIBOMP_PREFIX}/lib" + ) + endif() + +elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # Linux specific settings + set_target_properties(paimon_faiss_jni PROPERTIES + SUFFIX ".so" + ) + + if(BUILD_FAT_LIB) + # For fat lib, use static libstdc++ and libgcc + target_link_options(paimon_faiss_jni PRIVATE + "-static-libstdc++" + "-static-libgcc" + "-Wl,--exclude-libs,ALL" + ) + message(STATUS "Using static libstdc++ and libgcc for fat lib") + else() + target_link_libraries(paimon_faiss_jni PRIVATE stdc++) + endif() + +endif() + +# Set output directory - output to src/main/resources/{os}/{arch}/ +set(OUTPUT_DIR "${CMAKE_SOURCE_DIR}/../resources/${PLATFORM_DIR}") +set_target_properties(paimon_faiss_jni PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_DIR} + RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_DIR} +) + +# Optimization level +if(FAISS_OPT_LEVEL STREQUAL "avx2") + target_compile_options(paimon_faiss_jni PRIVATE -mavx2 -mfma) + message(STATUS "Building with AVX2 optimizations") +elseif(FAISS_OPT_LEVEL STREQUAL "avx512") + target_compile_options(paimon_faiss_jni PRIVATE -mavx512f -mavx512dq -mavx512bw -mavx512vl) + message(STATUS "Building with AVX-512 optimizations") +else() + message(STATUS "Building with generic optimizations") +endif() + +# Copy bundled shared libraries to output directory and set rpath +if(BUILD_FAT_LIB AND BUNDLE_OPENBLAS AND BUNDLE_OPENBLAS_PATH) + message(STATUS "Will bundle OpenBLAS shared library: ${BUNDLE_OPENBLAS_PATH}") + + # Get the actual library file (resolve symlinks) + get_filename_component(OPENBLAS_REALPATH ${BUNDLE_OPENBLAS_PATH} REALPATH) + get_filename_component(OPENBLAS_FILENAME ${OPENBLAS_REALPATH} NAME) + + # Copy OpenBLAS to output directory after build + add_custom_command(TARGET paimon_faiss_jni POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${OPENBLAS_REALPATH} + ${OUTPUT_DIR}/libopenblas.so.0 + COMMENT "Bundling OpenBLAS shared library" + ) + + # Set rpath to look in the same directory as the library + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_options(paimon_faiss_jni PRIVATE + "-Wl,-rpath,$ORIGIN" + ) + # Also patch the library after build to use the bundled libopenblas + add_custom_command(TARGET paimon_faiss_jni POST_BUILD + COMMAND patchelf --set-rpath "$$ORIGIN" ${OUTPUT_DIR}/libpaimon_faiss_jni.so || true + COMMENT "Setting rpath to $ORIGIN" + ) + endif() +endif() + +# Install target +install(TARGETS paimon_faiss_jni + LIBRARY DESTINATION ${PLATFORM_DIR} + RUNTIME DESTINATION ${PLATFORM_DIR} +) + diff --git a/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp b/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp new file mode 100644 index 000000000000..72e8cc66808e --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon_faiss_jni.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include +#include +#include + +// Helper macro for exception handling +#define FAISS_TRY try { +#define FAISS_CATCH(env) \ + } catch (const std::exception& e) { \ + jclass exceptionClass = env->FindClass("org/apache/paimon/faiss/FaissException"); \ + if (exceptionClass != nullptr) { \ + env->ThrowNew(exceptionClass, e.what()); \ + } \ + } catch (...) { \ + jclass exceptionClass = env->FindClass("org/apache/paimon/faiss/FaissException"); \ + if (exceptionClass != nullptr) { \ + env->ThrowNew(exceptionClass, "Unknown native exception"); \ + } \ + } + +// Helper function to convert jstring to std::string +static std::string jstringToString(JNIEnv* env, jstring jstr) { + if (jstr == nullptr) { + return ""; + } + const char* chars = env->GetStringUTFChars(jstr, nullptr); + std::string result(chars); + env->ReleaseStringUTFChars(jstr, chars); + return result; +} + +// Helper function to get index pointer from handle +static faiss::Index* getIndex(jlong handle) { + return reinterpret_cast(handle); +} + +// Helper to get IVF index +static faiss::IndexIVF* getIndexIVF(jlong handle) { + faiss::Index* index = getIndex(handle); + + // Try direct cast + faiss::IndexIVF* ivf = dynamic_cast(index); + if (ivf != nullptr) { + return ivf; + } + + // Try through IDMap wrapper + faiss::IndexIDMap* idmap = dynamic_cast(index); + if (idmap != nullptr) { + ivf = dynamic_cast(idmap->index); + if (ivf != nullptr) { + return ivf; + } + } + + throw std::runtime_error("Index is not an IVF index"); +} + +// Helper to get HNSW index +static faiss::IndexHNSW* getIndexHNSW(jlong handle) { + faiss::Index* index = getIndex(handle); + + // Try direct cast + faiss::IndexHNSW* hnsw = dynamic_cast(index); + if (hnsw != nullptr) { + return hnsw; + } + + // Try through IDMap wrapper + faiss::IndexIDMap* idmap = dynamic_cast(index); + if (idmap != nullptr) { + hnsw = dynamic_cast(idmap->index); + if (hnsw != nullptr) { + return hnsw; + } + } + + throw std::runtime_error("Index is not an HNSW index"); +} + +// Helper to get direct buffer address with validation +static void* getDirectBufferAddress(JNIEnv* env, jobject buffer, const char* name) { + void* addr = env->GetDirectBufferAddress(buffer); + if (addr == nullptr) { + std::string msg = std::string(name) + " buffer is not a direct buffer or address is unavailable"; + throw std::runtime_error(msg); + } + return addr; +} + +// Range search result wrapper +struct RangeSearchResultWrapper { + faiss::RangeSearchResult result; + int nq; + + RangeSearchResultWrapper(int nq_) : result(nq_), nq(nq_) {} +}; + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexFactoryCreate + (JNIEnv* env, jclass, jint dimension, jstring description, jint metricType) { + FAISS_TRY + std::string desc = jstringToString(env, description); + faiss::MetricType metric = (metricType == 0) ? faiss::METRIC_L2 : faiss::METRIC_INNER_PRODUCT; + faiss::Index* index = faiss::index_factory(dimension, desc.c_str(), metric); + return reinterpret_cast(index); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexDestroy + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + delete getIndex(handle); + FAISS_CATCH(env) +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_indexGetDimension + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return static_cast(getIndex(handle)->d); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexGetCount + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return static_cast(getIndex(handle)->ntotal); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jboolean JNICALL Java_org_apache_paimon_faiss_FaissNative_indexIsTrained + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return getIndex(handle)->is_trained ? JNI_TRUE : JNI_FALSE; + FAISS_CATCH(env) + return JNI_FALSE; +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_indexGetMetricType + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + faiss::MetricType metric = getIndex(handle)->metric_type; + return (metric == faiss::METRIC_L2) ? 0 : 1; + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexReset + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + getIndex(handle)->reset(); + FAISS_CATCH(env) +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexTrain + (JNIEnv* env, jclass, jlong handle, jlong n, jobject vectorBuffer) { + FAISS_TRY + float* vectorData = static_cast(getDirectBufferAddress(env, vectorBuffer, "vector")); + getIndex(handle)->train(n, vectorData); + FAISS_CATCH(env) +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexAdd + (JNIEnv* env, jclass, jlong handle, jlong n, jobject vectorBuffer) { + FAISS_TRY + float* vectorData = static_cast(getDirectBufferAddress(env, vectorBuffer, "vector")); + getIndex(handle)->add(n, vectorData); + FAISS_CATCH(env) +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexAddWithIds + (JNIEnv* env, jclass, jlong handle, jlong n, jobject vectorBuffer, jobject idBuffer) { + FAISS_TRY + float* vectorData = static_cast(getDirectBufferAddress(env, vectorBuffer, "vector")); + int64_t* idData = static_cast(getDirectBufferAddress(env, idBuffer, "id")); + + // Convert to faiss::idx_t if size differs + if (sizeof(faiss::idx_t) == sizeof(int64_t)) { + getIndex(handle)->add_with_ids(n, vectorData, reinterpret_cast(idData)); + } else { + std::vector faissIds(n); + for (jlong i = 0; i < n; i++) { + faissIds[i] = static_cast(idData[i]); + } + getIndex(handle)->add_with_ids(n, vectorData, faissIds.data()); + } + FAISS_CATCH(env) +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexSearch + (JNIEnv* env, jclass, jlong handle, jlong n, jfloatArray queryVectors, jint k, + jfloatArray distances, jlongArray labels) { + FAISS_TRY + // Get query vectors + jfloat* queryData = env->GetFloatArrayElements(queryVectors, nullptr); + if (queryData == nullptr) { + throw std::runtime_error("Failed to get query vectors"); + } + + // Allocate temporary arrays for results + std::vector distTemp(n * k); + std::vector labelTemp(n * k); + + // Perform search + getIndex(handle)->search(n, queryData, k, distTemp.data(), labelTemp.data()); + + // Release query array + env->ReleaseFloatArrayElements(queryVectors, queryData, JNI_ABORT); + + // Copy results to output arrays + env->SetFloatArrayRegion(distances, 0, n * k, distTemp.data()); + + // Convert labels to jlong + std::vector jlongLabels(n * k); + for (jlong i = 0; i < n * k; i++) { + jlongLabels[i] = static_cast(labelTemp[i]); + } + env->SetLongArrayRegion(labels, 0, n * k, jlongLabels.data()); + FAISS_CATCH(env) +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexRangeSearch + (JNIEnv* env, jclass, jlong handle, jlong n, jobject queryBuffer, jfloat radius) { + FAISS_TRY + float* queryData = static_cast(getDirectBufferAddress(env, queryBuffer, "query")); + + RangeSearchResultWrapper* wrapper = new RangeSearchResultWrapper(static_cast(n)); + getIndex(handle)->range_search(n, queryData, radius, &wrapper->result); + + return reinterpret_cast(wrapper); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexWriteToFile + (JNIEnv* env, jclass, jlong handle, jstring path) { + FAISS_TRY + std::string filePath = jstringToString(env, path); + faiss::write_index(getIndex(handle), filePath.c_str()); + FAISS_CATCH(env) +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexReadFromFile + (JNIEnv* env, jclass, jstring path) { + FAISS_TRY + std::string filePath = jstringToString(env, path); + faiss::Index* index = faiss::read_index(filePath.c_str(), faiss::IO_FLAG_MMAP); + return reinterpret_cast(index); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexSerializeSize + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + faiss::VectorIOWriter writer; + faiss::write_index(getIndex(handle), &writer); + return static_cast(writer.data.size()); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexSerialize + (JNIEnv* env, jclass, jlong handle, jobject buffer) { + FAISS_TRY + void* bufferData = getDirectBufferAddress(env, buffer, "output"); + + faiss::VectorIOWriter writer; + faiss::write_index(getIndex(handle), &writer); + + jlong capacity = env->GetDirectBufferCapacity(buffer); + if (static_cast(capacity) < writer.data.size()) { + throw std::runtime_error("Buffer too small for serialized index"); + } + + memcpy(bufferData, writer.data.data(), writer.data.size()); + return static_cast(writer.data.size()); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexDeserialize + (JNIEnv* env, jclass, jbyteArray data, jlong length) { + FAISS_TRY + jbyte* dataPtr = env->GetByteArrayElements(data, nullptr); + if (dataPtr == nullptr) { + throw std::runtime_error("Failed to get byte array elements"); + } + + faiss::VectorIOReader reader; + reader.data.resize(length); + memcpy(reader.data.data(), dataPtr, length); + + env->ReleaseByteArrayElements(data, dataPtr, JNI_ABORT); + + faiss::Index* index = faiss::read_index(&reader); + return reinterpret_cast(index); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultDestroy + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + delete reinterpret_cast(handle); + FAISS_CATCH(env) +} + +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetTotalSize + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + RangeSearchResultWrapper* wrapper = reinterpret_cast(handle); + return static_cast(wrapper->result.lims[wrapper->nq]); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetNumQueries + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + RangeSearchResultWrapper* wrapper = reinterpret_cast(handle); + return static_cast(wrapper->nq); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetLimits + (JNIEnv* env, jclass, jlong handle, jobject limitsBuffer) { + FAISS_TRY + RangeSearchResultWrapper* wrapper = reinterpret_cast(handle); + int64_t* limitsData = static_cast(getDirectBufferAddress(env, limitsBuffer, "limits")); + + for (int i = 0; i <= wrapper->nq; i++) { + limitsData[i] = static_cast(wrapper->result.lims[i]); + } + FAISS_CATCH(env) +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetLabels + (JNIEnv* env, jclass, jlong handle, jobject labelsBuffer) { + FAISS_TRY + RangeSearchResultWrapper* wrapper = reinterpret_cast(handle); + int64_t* labelsData = static_cast(getDirectBufferAddress(env, labelsBuffer, "labels")); + + size_t total = wrapper->result.lims[wrapper->nq]; + for (size_t i = 0; i < total; i++) { + labelsData[i] = static_cast(wrapper->result.labels[i]); + } + FAISS_CATCH(env) +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetDistances + (JNIEnv* env, jclass, jlong handle, jobject distancesBuffer) { + FAISS_TRY + RangeSearchResultWrapper* wrapper = reinterpret_cast(handle); + float* distData = static_cast(getDirectBufferAddress(env, distancesBuffer, "distances")); + + size_t total = wrapper->result.lims[wrapper->nq]; + memcpy(distData, wrapper->result.distances, total * sizeof(float)); + FAISS_CATCH(env) +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_ivfGetNprobe + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return static_cast(getIndexIVF(handle)->nprobe); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_ivfSetNprobe + (JNIEnv* env, jclass, jlong handle, jint nprobe) { + FAISS_TRY + getIndexIVF(handle)->nprobe = static_cast(nprobe); + FAISS_CATCH(env) +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_ivfGetNlist + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return static_cast(getIndexIVF(handle)->nlist); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswGetEfSearch + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return static_cast(getIndexHNSW(handle)->hnsw.efSearch); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswSetEfSearch + (JNIEnv* env, jclass, jlong handle, jint efSearch) { + FAISS_TRY + getIndexHNSW(handle)->hnsw.efSearch = static_cast(efSearch); + FAISS_CATCH(env) +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswGetEfConstruction + (JNIEnv* env, jclass, jlong handle) { + FAISS_TRY + return static_cast(getIndexHNSW(handle)->hnsw.efConstruction); + FAISS_CATCH(env) + return 0; +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswSetEfConstruction + (JNIEnv* env, jclass, jlong handle, jint efConstruction) { + FAISS_TRY + getIndexHNSW(handle)->hnsw.efConstruction = static_cast(efConstruction); + FAISS_CATCH(env) +} + +JNIEXPORT jstring JNICALL Java_org_apache_paimon_faiss_FaissNative_getVersion + (JNIEnv* env, jclass) { + // Return FAISS version defined in CMakeLists.txt + return env->NewStringUTF(FAISS_VERSION); +} + +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_setNumThreads + (JNIEnv* env, jclass, jint numThreads) { +#ifdef _OPENMP + omp_set_num_threads(numThreads); +#else + // OpenMP not available, ignore + (void)numThreads; +#endif +} + +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_getNumThreads + (JNIEnv* env, jclass) { +#ifdef _OPENMP + return omp_get_max_threads(); +#else + return 1; +#endif +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.h b/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.h new file mode 100644 index 000000000000..c5739dbe8e82 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.h @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PAIMON_FAISS_JNI_H +#define PAIMON_FAISS_JNI_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexFactoryCreate + * Signature: (ILjava/lang/String;I)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexFactoryCreate + (JNIEnv *, jclass, jint, jstring, jint); + + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexDestroy + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexDestroy + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexGetDimension + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_indexGetDimension + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexGetCount + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexGetCount + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexIsTrained + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_apache_paimon_faiss_FaissNative_indexIsTrained + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexGetMetricType + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_indexGetMetricType + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexReset + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexReset + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexTrain + * Signature: (JJLjava/nio/ByteBuffer;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexTrain + (JNIEnv *, jclass, jlong, jlong, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexAdd + * Signature: (JJLjava/nio/ByteBuffer;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexAdd + (JNIEnv *, jclass, jlong, jlong, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexAddWithIds + * Signature: (JJLjava/nio/ByteBuffer;Ljava/nio/ByteBuffer;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexAddWithIds + (JNIEnv *, jclass, jlong, jlong, jobject, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexSearch + * Signature: (JJ[FI[F[J)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexSearch + (JNIEnv *, jclass, jlong, jlong, jfloatArray, jint, jfloatArray, jlongArray); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexRangeSearch + * Signature: (JJLjava/nio/ByteBuffer;F)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexRangeSearch + (JNIEnv *, jclass, jlong, jlong, jobject, jfloat); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexWriteToFile + * Signature: (JLjava/lang/String;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_indexWriteToFile + (JNIEnv *, jclass, jlong, jstring); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexReadFromFile + * Signature: (Ljava/lang/String;)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexReadFromFile + (JNIEnv *, jclass, jstring); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexSerializeSize + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexSerializeSize + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexSerialize + * Signature: (JLjava/nio/ByteBuffer;)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexSerialize + (JNIEnv *, jclass, jlong, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: indexDeserialize + * Signature: ([BJ)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_indexDeserialize + (JNIEnv *, jclass, jbyteArray, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: rangeSearchResultDestroy + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultDestroy + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: rangeSearchResultGetTotalSize + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetTotalSize + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: rangeSearchResultGetNumQueries + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetNumQueries + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: rangeSearchResultGetLimits + * Signature: (JLjava/nio/ByteBuffer;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetLimits + (JNIEnv *, jclass, jlong, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: rangeSearchResultGetLabels + * Signature: (JLjava/nio/ByteBuffer;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetLabels + (JNIEnv *, jclass, jlong, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: rangeSearchResultGetDistances + * Signature: (JLjava/nio/ByteBuffer;)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_rangeSearchResultGetDistances + (JNIEnv *, jclass, jlong, jobject); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: ivfGetNprobe + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_ivfGetNprobe + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: ivfSetNprobe + * Signature: (JI)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_ivfSetNprobe + (JNIEnv *, jclass, jlong, jint); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: ivfGetNlist + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_ivfGetNlist + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: hnswGetEfSearch + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswGetEfSearch + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: hnswSetEfSearch + * Signature: (JI)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswSetEfSearch + (JNIEnv *, jclass, jlong, jint); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: hnswGetEfConstruction + * Signature: (J)I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswGetEfConstruction + (JNIEnv *, jclass, jlong); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: hnswSetEfConstruction + * Signature: (JI)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_hnswSetEfConstruction + (JNIEnv *, jclass, jlong, jint); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: getVersion + * Signature: ()Ljava/lang/String; + */ +JNIEXPORT jstring JNICALL Java_org_apache_paimon_faiss_FaissNative_getVersion + (JNIEnv *, jclass); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: setNumThreads + * Signature: (I)V + */ +JNIEXPORT void JNICALL Java_org_apache_paimon_faiss_FaissNative_setNumThreads + (JNIEnv *, jclass, jint); + +/* + * Class: org_apache_paimon_faiss_FaissNative + * Method: getNumThreads + * Signature: ()I + */ +JNIEXPORT jint JNICALL Java_org_apache_paimon_faiss_FaissNative_getNumThreads + (JNIEnv *, jclass); + +#ifdef __cplusplus +} +#endif + +#endif /* PAIMON_FAISS_JNI_H */ diff --git a/paimon-faiss/paimon-faiss-jni/src/test/java/org/apache/paimon/faiss/IndexTest.java b/paimon-faiss/paimon-faiss-jni/src/test/java/org/apache/paimon/faiss/IndexTest.java new file mode 100644 index 000000000000..811bad0d730e --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/test/java/org/apache/paimon/faiss/IndexTest.java @@ -0,0 +1,456 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.nio.file.Path; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests for the Faiss Index class. + * + *

Note: These tests require the native library to be built and available. They will be skipped + * if the native library is not found. + */ +class IndexTest { + + private static final int DIMENSION = 128; + private static final int NUM_VECTORS = 1000; + private static final int K = 10; + + @Test + void testFlatIndexBasicOperations() { + try (Index index = createFlatIndexWithMetric(MetricType.L2)) { + assertEquals(DIMENSION, index.getDimension()); + assertEquals(0, index.getCount()); + assertTrue(index.isTrained()); + assertEquals(MetricType.L2, index.getMetricType()); + + // Add vectors using zero-copy API + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + index.add(NUM_VECTORS, vectorBuffer); + assertEquals(NUM_VECTORS, index.getCount()); + + // Search using array API + float[] queryVectors = createQueryVectors(1, DIMENSION); + float[] distances = new float[K]; + long[] labels = new long[K]; + + index.search(1, queryVectors, K, distances, labels); + + // Verify labels are in valid range + for (int i = 0; i < K; i++) { + assertTrue( + labels[i] >= 0 && labels[i] < NUM_VECTORS, + "Label " + labels[i] + " out of range"); + } + + // Verify distances are non-negative for L2 + for (int i = 0; i < K; i++) { + assertTrue(distances[i] >= 0, "Distance should be non-negative for L2"); + } + } + } + + @Test + void testFlatIndexWithIds() { + try (Index index = IndexFactory.create(DIMENSION, "IDMap,Flat", MetricType.L2)) { + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + ByteBuffer idBuffer = Index.allocateIdBuffer(NUM_VECTORS); + idBuffer.asLongBuffer(); + for (int i = 0; i < NUM_VECTORS; i++) { + idBuffer.putLong(i * Long.BYTES, i * 100L); // Use custom IDs + } + + index.addWithIds(NUM_VECTORS, vectorBuffer, idBuffer); + assertEquals(NUM_VECTORS, index.getCount()); + + // Search should return our custom IDs + float[] queryVectors = createQueryVectors(1, DIMENSION); + float[] distances = new float[K]; + long[] labels = new long[K]; + + index.search(1, queryVectors, K, distances, labels); + + for (int i = 0; i < K; i++) { + assertTrue(labels[i] % 100 == 0, "Label should be a multiple of 100"); + } + } + } + + @Test + void testBatchSearch() { + try (Index index = createFlatIndexWithMetric(MetricType.L2)) { + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + index.add(NUM_VECTORS, vectorBuffer); + + int numQueries = 5; + float[] queryVectors = createQueryVectors(numQueries, DIMENSION); + float[] distances = new float[numQueries * K]; + long[] labels = new long[numQueries * K]; + + index.search(numQueries, queryVectors, K, distances, labels); + + // Read results for each query + for (int q = 0; q < numQueries; q++) { + for (int n = 0; n < K; n++) { + int idx = q * K + n; + assertTrue(labels[idx] >= 0 && labels[idx] < NUM_VECTORS); + assertTrue(distances[idx] >= 0); + } + } + } + } + + @Test + void testInnerProductMetric() { + try (Index index = createFlatIndexWithMetric(MetricType.INNER_PRODUCT)) { + assertEquals(MetricType.INNER_PRODUCT, index.getMetricType()); + + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + index.add(NUM_VECTORS, vectorBuffer); + + float[] queryVectors = createQueryVectors(1, DIMENSION); + float[] distances = new float[K]; + long[] labels = new long[K]; + + index.search(1, queryVectors, K, distances, labels); + + // For inner product, higher is better, so first result should have highest score + for (int i = 1; i < K; i++) { + assertTrue( + distances[i - 1] >= distances[i], + "Distances should be sorted in descending order for inner product"); + } + } + } + + @Test + void testIndexReset() { + try (Index index = createFlatIndex()) { + ByteBuffer vectorBuffer = createVectorBuffer(100, DIMENSION); + index.add(100, vectorBuffer); + assertEquals(100, index.getCount()); + + index.reset(); + assertEquals(0, index.getCount()); + + // Can add again after reset + index.add(100, vectorBuffer); + assertEquals(100, index.getCount()); + } + } + + @Test + void testIndexSerialization(@TempDir Path tempDir) { + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + float[] queryVectors = createQueryVectors(1, DIMENSION); + long[] originalLabels = new long[K]; + float[] originalDistances = new float[K]; + + // Create and populate index + try (Index index = createFlatIndex()) { + index.add(NUM_VECTORS, vectorBuffer); + + // Perform search and save original results + index.search(1, queryVectors, K, originalDistances, originalLabels); + + // Test file I/O + File indexFile = tempDir.resolve("test.index").toFile(); + index.writeToFile(indexFile); + + try (Index loadedIndex = Index.readFromFile(indexFile)) { + assertEquals(DIMENSION, loadedIndex.getDimension()); + assertEquals(NUM_VECTORS, loadedIndex.getCount()); + + float[] loadedDistances = new float[K]; + long[] loadedLabels = new long[K]; + loadedIndex.search(1, queryVectors, K, loadedDistances, loadedLabels); + + assertArrayEquals(originalLabels, loadedLabels); + } + } + + // Test ByteBuffer serialization (zero-copy for write) and byte[] deserialization + try (Index index = createFlatIndex()) { + index.add(NUM_VECTORS, vectorBuffer); + + long serializeSize = index.serializeSize(); + assertTrue(serializeSize > 0); + + ByteBuffer serialized = + ByteBuffer.allocateDirect((int) serializeSize).order(ByteOrder.nativeOrder()); + long bytesWritten = index.serialize(serialized); + assertEquals(serializeSize, bytesWritten); + + // Convert to byte array for deserialization + serialized.rewind(); + byte[] serializedBytes = new byte[(int) bytesWritten]; + serialized.get(serializedBytes); + + try (Index deserializedIndex = Index.deserialize(serializedBytes)) { + assertEquals(DIMENSION, deserializedIndex.getDimension()); + assertEquals(NUM_VECTORS, deserializedIndex.getCount()); + + float[] deserializedDistances = new float[K]; + long[] deserializedLabels = new long[K]; + deserializedIndex.search( + 1, queryVectors, K, deserializedDistances, deserializedLabels); + + assertArrayEquals(originalLabels, deserializedLabels); + } + } + } + + @Test + void testIndexFactoryDescriptions() { + // Test various index factory strings + String[] descriptions = {"Flat", "IDMap,Flat", "HNSW32", "HNSW32,Flat"}; + + for (String desc : descriptions) { + try (Index index = IndexFactory.create(DIMENSION, desc, MetricType.L2)) { + assertEquals(DIMENSION, index.getDimension()); + assertNotNull(index.toString()); + } + } + } + + @Test + void testHNSWIndex() { + try (Index index = IndexFactory.create(DIMENSION, "HNSW" + 32, MetricType.L2)) { + assertTrue(index.isTrained()); // HNSW doesn't need training + + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + index.add(NUM_VECTORS, vectorBuffer); + + // Get and set efSearch + int efSearch = IndexHNSW.getEfSearch(index); + assertTrue(efSearch > 0); + + IndexHNSW.setEfSearch(index, 64); + assertEquals(64, IndexHNSW.getEfSearch(index)); + + // Search + float[] queryVectors = createQueryVectors(1, DIMENSION); + float[] distances = new float[K]; + long[] labels = new long[K]; + + index.search(1, queryVectors, K, distances, labels); + + for (int i = 0; i < K; i++) { + assertTrue(labels[i] >= 0); + } + } + } + + @Test + void testIVFSQ8Index() { + // IVF16384,SQ8 is a quantized index that needs training + try (Index index = IndexFactory.create(DIMENSION, "IVF16384,SQ8", MetricType.L2)) { + assertEquals(DIMENSION, index.getDimension()); + assertEquals(MetricType.L2, index.getMetricType()); + + // IVF index needs training + assertTrue(!index.isTrained(), "IVF index should not be trained initially"); + + // Train the index with training vectors + int numTrainingVectors = 20000; // Should be >= nlist (16384) for good training + ByteBuffer trainingBuffer = createVectorBuffer(numTrainingVectors, DIMENSION); + index.train(numTrainingVectors, trainingBuffer); + + assertTrue(index.isTrained(), "Index should be trained after training"); + + // Add vectors after training + ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION); + index.add(NUM_VECTORS, vectorBuffer); + assertEquals(NUM_VECTORS, index.getCount()); + + // Set nprobe for search (number of clusters to visit) + IndexIVF.setNprobe(index, 64); + assertEquals(64, IndexIVF.getNprobe(index)); + + // Search + float[] queryVectors = createQueryVectors(1, DIMENSION); + float[] distances = new float[K]; + long[] labels = new long[K]; + + index.search(1, queryVectors, K, distances, labels); + + // Verify search results + for (int i = 0; i < K; i++) { + assertTrue( + labels[i] >= 0 && labels[i] < NUM_VECTORS, + "Label " + labels[i] + " out of range"); + assertTrue(distances[i] >= 0, "Distance should be non-negative for L2"); + } + + // Test batch search + int numQueries = 3; + float[] batchQueryVectors = createQueryVectors(numQueries, DIMENSION); + float[] batchDistances = new float[numQueries * K]; + long[] batchLabels = new long[numQueries * K]; + + index.search(numQueries, batchQueryVectors, K, batchDistances, batchLabels); + + for (int q = 0; q < numQueries; q++) { + for (int n = 0; n < K; n++) { + int idx = q * K + n; + assertTrue(batchLabels[idx] >= 0 && batchLabels[idx] < NUM_VECTORS); + assertTrue(batchDistances[idx] >= 0); + } + } + } + } + + @Test + void testErrorHandling() { + // Test invalid dimension + assertThrows( + IllegalArgumentException.class, + () -> { + IndexFactory.create(0, "Flat", MetricType.L2); + }); + + assertThrows( + IllegalArgumentException.class, + () -> { + IndexFactory.create(-1, "Flat", MetricType.L2); + }); + + // Test null description + assertThrows( + IllegalArgumentException.class, + () -> { + IndexFactory.create(DIMENSION, null, MetricType.L2); + }); + + // Test buffer validation - wrong size buffer + try (Index index = createFlatIndex()) { + ByteBuffer wrongSizeBuffer = + ByteBuffer.allocateDirect(10).order(ByteOrder.nativeOrder()); + assertThrows( + IllegalArgumentException.class, + () -> { + index.add(1, wrongSizeBuffer); // Buffer too small for 1 vector + }); + } + + // Test non-direct buffer + try (Index index = createFlatIndex()) { + ByteBuffer heapBuffer = ByteBuffer.allocate(DIMENSION * Float.BYTES); + assertThrows( + IllegalArgumentException.class, + () -> { + index.add(1, heapBuffer); // Not a direct buffer + }); + } + + // Test closed index + Index closedIndex = createFlatIndex(); + closedIndex.close(); + assertThrows( + IllegalStateException.class, + () -> { + closedIndex.getCount(); + }); + } + + @Test + void testSearchResultArrays() { + try (Index index = createFlatIndex()) { + ByteBuffer vectorBuffer = createVectorBuffer(100, DIMENSION); + index.add(100, vectorBuffer); + + int numQueries = 3; + int k = 5; + float[] queryVectors = createQueryVectors(numQueries, DIMENSION); + float[] distances = new float[numQueries * k]; + long[] labels = new long[numQueries * k]; + + index.search(numQueries, queryVectors, k, distances, labels); + + // Test reading individual results + for (int q = 0; q < numQueries; q++) { + for (int n = 0; n < k; n++) { + int idx = q * k + n; + assertTrue(labels[idx] >= 0 && labels[idx] < 100); + assertTrue(distances[idx] >= 0); + } + } + } + } + + @Test + void testBufferAllocationHelpers() { + // Test vector buffer allocation + ByteBuffer vectorBuffer = Index.allocateVectorBuffer(10, DIMENSION); + assertTrue(vectorBuffer.isDirect()); + assertEquals(ByteOrder.nativeOrder(), vectorBuffer.order()); + assertEquals(10 * DIMENSION * Float.BYTES, vectorBuffer.capacity()); + + // Test ID buffer allocation + ByteBuffer idBuffer = Index.allocateIdBuffer(10); + assertTrue(idBuffer.isDirect()); + assertEquals(ByteOrder.nativeOrder(), idBuffer.order()); + assertEquals(10 * Long.BYTES, idBuffer.capacity()); + } + + private Index createFlatIndexWithMetric(MetricType metricType) { + return IndexFactory.create(DIMENSION, "Flat", metricType); + } + + private Index createFlatIndex() { + return IndexFactory.create(DIMENSION, "Flat", MetricType.L2); + } + + /** Create a direct ByteBuffer with random vectors. */ + private ByteBuffer createVectorBuffer(int n, int d) { + ByteBuffer buffer = Index.allocateVectorBuffer(n, d); + FloatBuffer floatView = buffer.asFloatBuffer(); + + Random random = new Random(42); + for (int i = 0; i < n * d; i++) { + floatView.put(i, random.nextFloat()); + } + + return buffer; + } + + /** Create a float array with random query vectors. */ + private float[] createQueryVectors(int n, int d) { + float[] vectors = new float[n * d]; + Random random = new Random(42); + for (int i = 0; i < n * d; i++) { + vectors[i] = random.nextFloat(); + } + return vectors; + } +} diff --git a/paimon-faiss/pom.xml b/paimon-faiss/pom.xml new file mode 100644 index 000000000000..43e22504a6e2 --- /dev/null +++ b/paimon-faiss/pom.xml @@ -0,0 +1,38 @@ + + + + 4.0.0 + + + paimon-parent + org.apache.paimon + 1.4-SNAPSHOT + + + paimon-faiss + Paimon : Faiss + pom + + + paimon-faiss-jni + + diff --git a/pom.xml b/pom.xml index 6176baaab666..d020c79f0bea 100644 --- a/pom.xml +++ b/pom.xml @@ -522,6 +522,16 @@ under the License. 11 + + + paimon-faiss + + paimon-faiss + + + true + + @@ -626,6 +636,7 @@ under the License. paimon-common/src/main/antlr4/** paimon-core/src/test/resources/compatibility/** + paimon-faiss/paimon-faiss-jni/build/**