diff --git a/.github/actions/build-faiss-native/action.yml b/.github/actions/build-faiss-native/action.yml
new file mode 100644
index 000000000000..974d9390458c
--- /dev/null
+++ b/.github/actions/build-faiss-native/action.yml
@@ -0,0 +1,112 @@
+name: 'Build FAISS Native Library'
+description: 'Build FAISS native library for specified platform'
+inputs:
+ platform:
+ description: 'Target platform (linux-amd64, linux-aarch64, darwin-aarch64)'
+ required: true
+ faiss-version:
+ description: 'FAISS version to build (e.g., 1.7.4)'
+ required: false
+ default: '1.7.4'
+ use-homebrew:
+ description: 'Whether to use Homebrew for dependencies (macOS)'
+ required: false
+ default: 'false'
+
+runs:
+ using: 'composite'
+ steps:
+ - name: Install native dependencies (Linux)
+ if: inputs.use-homebrew == 'false'
+ shell: bash
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y \
+ build-essential \
+ libopenblas-dev \
+ liblapack-dev \
+ patchelf \
+ libgomp1 \
+ wget
+
+ - name: Install GCC 9 (Linux)
+ if: inputs.use-homebrew == 'false'
+ shell: bash
+ run: |
+ sudo apt-get install -y gcc-9 g++-9 || sudo apt-get install -y gcc g++
+ if command -v gcc-9 &>/dev/null; then
+ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 90
+ sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 90
+ fi
+ gcc --version
+
+ - name: Install CMake 3.30.1 (Linux)
+ if: inputs.use-homebrew == 'false'
+ shell: bash
+ run: |
+ CMAKE_VERSION="3.30.1"
+ if [[ "${{ inputs.platform }}" == *"amd64"* ]]; then
+ CMAKE_ARCH="x86_64"
+ else
+ CMAKE_ARCH="aarch64"
+ fi
+ wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz
+ tar -xzf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz
+ sudo mv cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH} /opt/cmake
+ sudo ln -sf /opt/cmake/bin/cmake /usr/local/bin/cmake
+ cmake --version
+
+ - name: Install FAISS (Linux)
+ if: inputs.use-homebrew == 'false'
+ shell: bash
+ run: |
+ git clone --depth 1 --branch v1.7.4 https://github.com/facebookresearch/faiss.git /tmp/faiss
+ cd /tmp/faiss
+ cmake -B build \
+ -DFAISS_ENABLE_GPU=OFF \
+ -DFAISS_ENABLE_PYTHON=OFF \
+ -DBUILD_TESTING=OFF \
+ -DCMAKE_BUILD_TYPE=Release
+ cmake --build build -j $(nproc)
+ sudo cmake --install build
+
+ - name: Install dependencies (macOS)
+ if: inputs.use-homebrew == 'true'
+ shell: bash
+ run: |
+ brew install cmake libomp openblas faiss
+
+ - name: Build native library
+ shell: bash
+ run: |
+ ./paimon-faiss/paimon-faiss-jni/scripts/build-native.sh --clean --fat-lib --faiss-version ${{ inputs.faiss-version }}
+
+ - name: List built libraries (Linux)
+ if: inputs.use-homebrew == 'false' && inputs.platform == 'linux-amd64'
+ shell: bash
+ run: |
+ echo "=== Built libraries ==="
+ ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/linux/amd64/
+ echo ""
+ echo "=== Library dependencies ==="
+ ldd paimon-faiss/paimon-faiss-jni/src/main/resources/linux/amd64/libpaimon_faiss_jni.so || true
+
+ - name: List built libraries (Linux AARCH64)
+ if: inputs.use-homebrew == 'false' && inputs.platform == 'linux-aarch64'
+ shell: bash
+ run: |
+ echo "=== Built libraries ==="
+ ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/linux/aarch64/
+ echo ""
+ echo "=== Library dependencies ==="
+ ldd paimon-faiss/paimon-faiss-jni/src/main/resources/linux/aarch64/libpaimon_faiss_jni.so || true
+
+ - name: List built libraries (macOS)
+ if: inputs.use-homebrew == 'true'
+ shell: bash
+ run: |
+ echo "=== Built libraries ==="
+ ls -la paimon-faiss/paimon-faiss-jni/src/main/resources/darwin/aarch64/
+ echo ""
+ echo "=== Library dependencies ==="
+ otool -L paimon-faiss/paimon-faiss-jni/src/main/resources/darwin/aarch64/libpaimon_faiss_jni.dylib || true
diff --git a/.github/workflows/faiss-vector-index-tests.yml b/.github/workflows/faiss-vector-index-tests.yml
new file mode 100644
index 000000000000..16a1d386fb41
--- /dev/null
+++ b/.github/workflows/faiss-vector-index-tests.yml
@@ -0,0 +1,66 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+name: Faiss Vector Index Tests
+
+on:
+ push:
+ paths:
+ - 'paimon-faiss/**'
+ - '.github/workflows/faiss-vector-index-tests.yml'
+ pull_request:
+ paths:
+ - 'paimon-faiss/**'
+ - '.github/workflows/faiss-vector-index-tests.yml'
+
+env:
+ JDK_VERSION: 8
+ MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ build_test:
+ runs-on: ubuntu-latest
+ timeout-minutes: 90
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up JDK ${{ env.JDK_VERSION }}
+ uses: actions/setup-java@v4
+ with:
+ java-version: ${{ env.JDK_VERSION }}
+ distribution: 'temurin'
+
+ - name: Build FAISS native library
+ uses: ./.github/actions/build-faiss-native
+ with:
+ platform: linux-amd64
+
+ - name: Build paimon-faiss-jni
+ shell: bash
+ run: mvn -B clean install -pl paimon-faiss/paimon-faiss-jni -am -DskipTests -Ppaimon-faiss
+
+ - name: Test paimon-faiss-jni
+ timeout-minutes: 10
+ run: mvn -T 1C -B test -pl paimon-faiss/paimon-faiss-jni -DskipFaissTests=false -Ppaimon-faiss
+ env:
+ MAVEN_OPTS: -Xmx2048m
diff --git a/.github/workflows/utitcase.yml b/.github/workflows/utitcase.yml
index b24ab35386d3..62939d90cff7 100644
--- a/.github/workflows/utitcase.yml
+++ b/.github/workflows/utitcase.yml
@@ -27,6 +27,8 @@ on:
- 'paimon-python/**'
- '.github/workflows/paimon-python-checks.yml'
- 'paimon-lucene/**'
+ - 'paimon-faiss/**'
+ - '.github/workflows/faiss-vector-index-tests.yml'
env:
JDK_VERSION: 8
@@ -62,7 +64,7 @@ jobs:
jvm_timezone=$(random_timezone)
echo "JVM timezone is set to $jvm_timezone"
- TEST_MODULES="!paimon-e2e-tests,"
+ TEST_MODULES="!paimon-e2e-tests,!paimon-faiss/paimon-faiss-jni,"
for suffix in ut 3.5 3.4 3.3 3.2; do
TEST_MODULES+="!org.apache.paimon:paimon-spark-${suffix}_2.12,"
done
diff --git a/.gitignore b/.gitignore
index d961d026a773..3f42fdc44a97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,9 @@ paimon-lucene/.idea/
### Mac OS ###
.DS_Store
+
+### Native build artifacts ###
+paimon-faiss/paimon-faiss-jni/build/
+paimon-faiss/paimon-faiss-jni/src/main/resources/darwin*
+paimon-faiss/paimon-faiss-jni/src/main/resources/linux*
+paimon-faiss/paimon-faiss-jni/src/main/native/cmake-build-debug/
diff --git a/paimon-faiss/paimon-faiss-jni/NOTICE b/paimon-faiss/paimon-faiss-jni/NOTICE
new file mode 100644
index 000000000000..349f16941e1c
--- /dev/null
+++ b/paimon-faiss/paimon-faiss-jni/NOTICE
@@ -0,0 +1,9 @@
+Apache Paimon Faiss JNI
+Copyright 2024-2026 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (https://www.apache.org/).
+
+This product includes Faiss (https://github.com/facebookresearch/faiss)
+developed by Facebook AI Research under the MIT License.
+
diff --git a/paimon-faiss/paimon-faiss-jni/pom.xml b/paimon-faiss/paimon-faiss-jni/pom.xml
new file mode 100644
index 000000000000..e03525d8b1ce
--- /dev/null
+++ b/paimon-faiss/paimon-faiss-jni/pom.xml
@@ -0,0 +1,105 @@
+
+
+
This affects operations like index training, adding vectors, and searching. Set to 1 to + * disable parallelism. + * + * @param numThreads the number of threads (must be positive) + */ + public static void setNumThreads(int numThreads) { + if (numThreads <= 0) { + throw new IllegalArgumentException("Number of threads must be positive: " + numThreads); + } + FaissNative.setNumThreads(numThreads); + } + + /** + * Get the number of threads for parallel operations. + * + * @return the current number of threads + */ + public static int getNumThreads() { + return FaissNative.getNumThreads(); + } + + /** + * Ensure the native library is loaded. + * + *
This method is called automatically when any Faiss class is used. It can be called + * explicitly to load the library early and catch any loading errors. + * + * @throws FaissException if the native library cannot be loaded + */ + public static void loadLibrary() throws FaissException { + NativeLibraryLoader.load(); + } + + /** + * Check if the native library has been loaded. + * + * @return true if the library is loaded + */ + public static boolean isLibraryLoaded() { + return NativeLibraryLoader.isLoaded(); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java new file mode 100644 index 000000000000..c670b619e899 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissException.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Exception thrown when a Faiss operation fails. + * + *
This exception wraps errors from the native Faiss library as well as errors that occur during + * JNI operations. + */ +public class FaissException extends Exception { + private static final long serialVersionUID = 1L; + + /** + * Creates a new FaissException with the specified message. + * + * @param message the error message + */ + public FaissException(String message) { + super(message); + } + + /** + * Creates a new FaissException with the specified message and cause. + * + * @param message the error message + * @param cause the underlying cause + */ + public FaissException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Creates a new FaissException with the specified cause. + * + * @param cause the underlying cause + */ + public FaissException(Throwable cause) { + super(cause); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java new file mode 100644 index 000000000000..f98a15b47f30 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/FaissNative.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.nio.ByteBuffer; + +/** + * Native method declarations for Faiss JNI with zero-copy support. + * + *
This class contains all the native method declarations that are implemented in the JNI C++ + * layer. These methods directly map to Faiss C++ API calls. + * + *
Users should not call these methods directly. Instead, use the high-level Java API classes + * like {@link Index} and {@link IndexFactory}. + * + *
All vector operations use {@link ByteBuffer#allocateDirect(int)} buffers to achieve zero-copy + * data transfer between Java and native code. This eliminates memory duplication and improves + * performance for large-scale vector operations. + */ +final class FaissNative { + + static { + try { + NativeLibraryLoader.load(); + } catch (FaissException e) { + throw new ExceptionInInitializerError(e); + } + } + + /** + * Create an index using an index factory string. + * + * @param dimension the dimension of the vectors + * @param description the index description string (e.g., "Flat", "IVF100,Flat", "HNSW32") + * @param metricType the metric type (0 = L2, 1 = Inner Product) + * @return the native handle of the created index + */ + static native long indexFactoryCreate(int dimension, String description, int metricType); + + /** + * Destroy an index and free its resources. + * + * @param handle the native handle of the index + */ + static native void indexDestroy(long handle); + + /** + * Get the dimension of an index. + * + * @param handle the native handle of the index + * @return the dimension + */ + static native int indexGetDimension(long handle); + + /** + * Get the number of vectors in an index. + * + * @param handle the native handle of the index + * @return the number of vectors + */ + static native long indexGetCount(long handle); + + /** + * Check if an index is trained. + * + * @param handle the native handle of the index + * @return true if trained + */ + static native boolean indexIsTrained(long handle); + + /** + * Get the metric type of an index. + * + * @param handle the native handle of the index + * @return the metric type (0 = L2, 1 = Inner Product) + */ + static native int indexGetMetricType(long handle); + + /** + * Reset an index (remove all vectors). + * + * @param handle the native handle of the index + */ + static native void indexReset(long handle); + + /** + * Write an index to a file. + * + * @param handle the native handle of the index + * @param path the file path to write to + */ + static native void indexWriteToFile(long handle, String path); + + /** + * Read an index from a file. + * + * @param path the file path to read from + * @return the native handle of the loaded index + */ + static native long indexReadFromFile(String path); + + /** + * Add vectors to an index using a direct ByteBuffer (zero-copy). + * + *
The buffer must be a direct buffer created via {@link ByteBuffer#allocateDirect(int)} and + * must contain n * dimension floats in native byte order. + * + * @param handle the native handle of the index + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension * 4 bytes) + */ + static native void indexAdd(long handle, long n, ByteBuffer vectorBuffer); + + /** + * Add vectors with IDs to an index using direct ByteBuffers (zero-copy). + * + * @param handle the native handle of the index + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension * 4 bytes) + * @param idBuffer direct ByteBuffer containing IDs (n * 8 bytes) + */ + static native void indexAddWithIds( + long handle, long n, ByteBuffer vectorBuffer, ByteBuffer idBuffer); + + /** + * Search for the k nearest neighbors. + * + * @param handle the native handle of the index + * @param n the number of query vectors + * @param queryVectors the query vectors (n * dimension floats) + * @param k the number of nearest neighbors to find + * @param distances output array for distances (n * k floats) + * @param labels output array for labels (n * k longs) + */ + static native void indexSearch( + long handle, long n, float[] queryVectors, int k, float[] distances, long[] labels); + + /** + * Train an index using a direct ByteBuffer (zero-copy). + * + * @param handle the native handle of the index + * @param n the number of training vectors + * @param vectorBuffer direct ByteBuffer containing training vectors (n * dimension * 4 bytes) + */ + static native void indexTrain(long handle, long n, ByteBuffer vectorBuffer); + + /** + * Search for neighbors within a given radius using direct ByteBuffer (zero-copy). + * + * @param handle the native handle of the index + * @param n the number of query vectors + * @param queryBuffer direct ByteBuffer containing query vectors (n * dimension * 4 bytes) + * @param radius the search radius + * @return a range search result handle + */ + static native long indexRangeSearch(long handle, long n, ByteBuffer queryBuffer, float radius); + + /** + * Serialize an index to a direct ByteBuffer (zero-copy). + * + *
Returns the number of bytes written. The caller must provide a buffer large enough to hold + * the serialized index. Use {@link #indexSerializeSize(long)} to get the required size. + * + * @param handle the native handle of the index + * @param buffer direct ByteBuffer to write the serialized index to + * @return the number of bytes written + */ + static native long indexSerialize(long handle, ByteBuffer buffer); + + /** + * Get the size in bytes needed to serialize an index. + * + * @param handle the native handle of the index + * @return the size in bytes + */ + static native long indexSerializeSize(long handle); + + /** + * Deserialize an index from a byte array. + * + * @param data the serialized index data + * @param length the number of bytes to read + * @return the native handle of the loaded index + */ + static native long indexDeserialize(byte[] data, long length); + + /** + * Destroy a range search result. + * + * @param handle the native handle of the range search result + */ + static native void rangeSearchResultDestroy(long handle); + + /** + * Get the number of results for each query in a range search. + * + * @param handle the native handle of the range search result + * @param limitsBuffer direct ByteBuffer for limits output ((nq + 1) * 8 bytes) + */ + static native void rangeSearchResultGetLimits(long handle, ByteBuffer limitsBuffer); + + /** + * Get the total number of results in a range search. + * + * @param handle the native handle of the range search result + * @return the total number of results + */ + static native long rangeSearchResultGetTotalSize(long handle); + + /** + * Get all labels from a range search result. + * + * @param handle the native handle of the range search result + * @param labelsBuffer direct ByteBuffer for labels output + */ + static native void rangeSearchResultGetLabels(long handle, ByteBuffer labelsBuffer); + + /** + * Get all distances from a range search result. + * + * @param handle the native handle of the range search result + * @param distancesBuffer direct ByteBuffer for distances output + */ + static native void rangeSearchResultGetDistances(long handle, ByteBuffer distancesBuffer); + + /** + * Get the number of queries in a range search result. + * + * @param handle the native handle of the range search result + * @return the number of queries + */ + static native int rangeSearchResultGetNumQueries(long handle); + + /** + * Get the number of probe lists for an IVF index. + * + * @param handle the native handle of the index + * @return the number of probe lists (nprobe) + */ + static native int ivfGetNprobe(long handle); + + /** + * Set the number of probe lists for an IVF index. + * + * @param handle the native handle of the index + * @param nprobe the number of probe lists + */ + static native void ivfSetNprobe(long handle, int nprobe); + + /** + * Get the number of lists (clusters) in an IVF index. + * + * @param handle the native handle of the index + * @return the number of lists + */ + static native int ivfGetNlist(long handle); + + /** + * Get the efSearch parameter of an HNSW index. + * + * @param handle the native handle of the index + * @return the efSearch value + */ + static native int hnswGetEfSearch(long handle); + + /** + * Set the efSearch parameter of an HNSW index. + * + * @param handle the native handle of the index + * @param efSearch the efSearch value + */ + static native void hnswSetEfSearch(long handle, int efSearch); + + /** + * Get the efConstruction parameter of an HNSW index. + * + * @param handle the native handle of the index + * @return the efConstruction value + */ + static native int hnswGetEfConstruction(long handle); + + /** + * Set the efConstruction parameter of an HNSW index. + * + *
This must be set before adding any vectors to the index. + * + * @param handle the native handle of the index + * @param efConstruction the efConstruction value + */ + static native void hnswSetEfConstruction(long handle, int efConstruction); + + /** + * Get the Faiss library version. + * + * @return the version string + */ + static native String getVersion(); + + /** + * Set the number of threads for parallel operations. + * + * @param numThreads the number of threads + */ + static native void setNumThreads(int numThreads); + + /** + * Get the number of threads for parallel operations. + * + * @return the number of threads + */ + static native int getNumThreads(); +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java new file mode 100644 index 000000000000..0ccf0a9f0d1c --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/Index.java @@ -0,0 +1,354 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * A Faiss index for similarity search with zero-copy support. + * + *
This class wraps a native Faiss index and provides methods for adding vectors, searching for + * nearest neighbors, and managing the index. All vector operations use direct ByteBuffers for + * zero-copy data transfer between Java and native code. + * + *
Thread Safety: Index instances are NOT thread-safe. External synchronization is required if an + * index is accessed from multiple threads. + * + * @see IndexFactory + */ +public class Index implements AutoCloseable { + + /** Native handle to the Faiss index. */ + private long nativeHandle; + + /** The dimension of vectors in this index. */ + private final int dimension; + + /** Whether this index has been closed. */ + private volatile boolean closed = false; + + /** + * Create an Index wrapper around a native handle. + * + * @param nativeHandle the native handle + * @param dimension the vector dimension + */ + Index(long nativeHandle, int dimension) { + this.nativeHandle = nativeHandle; + this.dimension = dimension; + } + + /** + * Get the dimension of vectors in this index. + * + * @return the vector dimension + */ + public int getDimension() { + return dimension; + } + + /** + * Get the number of vectors in this index. + * + * @return the number of vectors + */ + public long getCount() { + checkNotClosed(); + return FaissNative.indexGetCount(nativeHandle); + } + + /** + * Check if this index is trained. + * + *
Some index types (like IVF) require training before vectors can be added. Flat indexes are + * always considered trained. + * + * @return true if the index is trained + */ + public boolean isTrained() { + checkNotClosed(); + return FaissNative.indexIsTrained(nativeHandle); + } + + /** + * Get the metric type used by this index. + * + * @return the metric type + */ + public MetricType getMetricType() { + checkNotClosed(); + return MetricType.fromValue(FaissNative.indexGetMetricType(nativeHandle)); + } + + /** + * Train the index on a set of training vectors (zero-copy). + * + *
This is required for some index types (like IVF) before adding vectors. For flat indexes, + * this is a no-op. + * + * @param n the number of training vectors + * @param vectorBuffer direct ByteBuffer containing training vectors (n * dimension floats) + */ + public void train(long n, ByteBuffer vectorBuffer) { + checkNotClosed(); + validateDirectBuffer(vectorBuffer, n * dimension * Float.BYTES, "vector"); + FaissNative.indexTrain(nativeHandle, n, vectorBuffer); + } + + /** + * Add vectors to the index (zero-copy). + * + *
The vectors are assigned sequential IDs starting from the current count. + * + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension floats) + */ + public void add(long n, ByteBuffer vectorBuffer) { + checkNotClosed(); + validateDirectBuffer(vectorBuffer, n * dimension * Float.BYTES, "vector"); + FaissNative.indexAdd(nativeHandle, n, vectorBuffer); + } + + /** + * Add vectors with explicit IDs to the index (zero-copy). + * + *
Note: Not all index types support this operation. Flat indexes and IndexIDMap wrapped + * indexes support it. + * + * @param n the number of vectors to add + * @param vectorBuffer direct ByteBuffer containing vectors (n * dimension floats) + * @param idBuffer direct ByteBuffer containing IDs (n longs) + */ + public void addWithIds(long n, ByteBuffer vectorBuffer, ByteBuffer idBuffer) { + checkNotClosed(); + validateDirectBuffer(vectorBuffer, n * dimension * Float.BYTES, "vector"); + validateDirectBuffer(idBuffer, n * Long.BYTES, "id"); + FaissNative.indexAddWithIds(nativeHandle, n, vectorBuffer, idBuffer); + } + + /** + * Search for the k nearest neighbors of query vectors. + * + * @param n the number of query vectors + * @param queryVectors array containing query vectors (n * dimension floats) + * @param k the number of nearest neighbors to find + * @param distances output array for distances (n * k floats) + * @param labels output array for labels (n * k longs) + */ + public void search(long n, float[] queryVectors, int k, float[] distances, long[] labels) { + checkNotClosed(); + if (queryVectors.length < n * dimension) { + throw new IllegalArgumentException( + "Query vectors array too small: required " + + (n * dimension) + + ", got " + + queryVectors.length); + } + if (distances.length < n * k) { + throw new IllegalArgumentException( + "Distances array too small: required " + (n * k) + ", got " + distances.length); + } + if (labels.length < n * k) { + throw new IllegalArgumentException( + "Labels array too small: required " + (n * k) + ", got " + labels.length); + } + FaissNative.indexSearch(nativeHandle, n, queryVectors, k, distances, labels); + } + + /** + * Search for all neighbors within a given radius (zero-copy). + * + * @param n the number of query vectors + * @param queryBuffer direct ByteBuffer containing query vectors (n * dimension floats) + * @param radius the search radius + * @return the range search result + */ + public RangeSearchResult rangeSearch(long n, ByteBuffer queryBuffer, float radius) { + checkNotClosed(); + validateDirectBuffer(queryBuffer, n * dimension * Float.BYTES, "query"); + long resultHandle = FaissNative.indexRangeSearch(nativeHandle, n, queryBuffer, radius); + return new RangeSearchResult(resultHandle, (int) n); + } + + /** Reset the index (remove all vectors). */ + public void reset() { + checkNotClosed(); + FaissNative.indexReset(nativeHandle); + } + + /** + * Write the index to a file. + * + * @param path the file path + */ + public void writeToFile(String path) { + checkNotClosed(); + FaissNative.indexWriteToFile(nativeHandle, path); + } + + /** + * Write the index to a file. + * + * @param file the file + */ + public void writeToFile(File file) { + writeToFile(file.getAbsolutePath()); + } + + /** + * Read an index from a file. + * + * @param path the file path + * @return the loaded index + */ + public static Index readFromFile(String path) { + long handle = FaissNative.indexReadFromFile(path); + int dimension = FaissNative.indexGetDimension(handle); + return new Index(handle, dimension); + } + + /** + * Read an index from a file. + * + * @param file the file + * @return the loaded index + */ + public static Index readFromFile(File file) { + return readFromFile(file.getAbsolutePath()); + } + + /** + * Get the size in bytes needed to serialize this index. + * + * @return the serialization size + */ + public long serializeSize() { + checkNotClosed(); + return FaissNative.indexSerializeSize(nativeHandle); + } + + /** + * Serialize the index to a direct ByteBuffer (zero-copy). + * + * @param buffer direct ByteBuffer to write to (must have sufficient capacity) + * @return the number of bytes written + */ + public long serialize(ByteBuffer buffer) { + checkNotClosed(); + if (!buffer.isDirect()) { + throw new IllegalArgumentException("Buffer must be a direct buffer"); + } + return FaissNative.indexSerialize(nativeHandle, buffer); + } + + /** + * Deserialize an index from a byte array. + * + * @param data the serialized index data + * @return the deserialized index + */ + public static Index deserialize(byte[] data) { + long handle = FaissNative.indexDeserialize(data, data.length); + int dimension = FaissNative.indexGetDimension(handle); + return new Index(handle, dimension); + } + + /** + * Allocate a direct ByteBuffer suitable for vector data. + * + * @param numVectors number of vectors + * @param dimension vector dimension + * @return a direct ByteBuffer in native byte order + */ + public static ByteBuffer allocateVectorBuffer(int numVectors, int dimension) { + return ByteBuffer.allocateDirect(numVectors * dimension * Float.BYTES) + .order(ByteOrder.nativeOrder()); + } + + /** + * Allocate a direct ByteBuffer suitable for ID data. + * + * @param numIds number of IDs + * @return a direct ByteBuffer in native byte order + */ + public static ByteBuffer allocateIdBuffer(int numIds) { + return ByteBuffer.allocateDirect(numIds * Long.BYTES).order(ByteOrder.nativeOrder()); + } + + private void validateDirectBuffer(ByteBuffer buffer, long requiredBytes, String name) { + if (!buffer.isDirect()) { + throw new IllegalArgumentException(name + " buffer must be a direct buffer"); + } + if (buffer.capacity() < requiredBytes) { + throw new IllegalArgumentException( + name + + " buffer too small: required " + + requiredBytes + + " bytes, got " + + buffer.capacity()); + } + } + + /** + * Get the native handle. + * + *
This is for internal use only. + * + * @return the native handle + */ + long nativeHandle() { + return nativeHandle; + } + + private void checkNotClosed() { + if (closed) { + throw new IllegalStateException("Index has been closed"); + } + } + + @Override + public void close() { + if (!closed) { + closed = true; + if (nativeHandle != 0) { + FaissNative.indexDestroy(nativeHandle); + nativeHandle = 0; + } + } + } + + @Override + public String toString() { + if (closed) { + return "Index[closed]"; + } + return "Index{" + + "dimension=" + + dimension + + ", count=" + + getCount() + + ", trained=" + + isTrained() + + ", metricType=" + + getMetricType() + + '}'; + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java new file mode 100644 index 000000000000..09f081235a52 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** Factory for creating Faiss indexes. */ +public final class IndexFactory { + + /** + * Create a Faiss index using the index factory. + * + * @param dimension the dimension of the vectors + * @param description the index description string + * @param metricType the metric type for similarity computation + * @return the created index + */ + public static Index create(int dimension, String description, MetricType metricType) { + if (dimension <= 0) { + throw new IllegalArgumentException("Dimension must be positive: " + dimension); + } + if (description == null || description.isEmpty()) { + throw new IllegalArgumentException("Index description cannot be null or empty"); + } + if (metricType == null) { + throw new IllegalArgumentException("Metric type cannot be null"); + } + + long handle = FaissNative.indexFactoryCreate(dimension, description, metricType.getValue()); + return new Index(handle, dimension); + } + + /** + * Create a Faiss index with L2 (Euclidean) metric. + * + * @param dimension the dimension of the vectors + * @param description the index description string + * @return the created index + */ + public static Index create(int dimension, String description) { + return create(dimension, description, MetricType.L2); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java new file mode 100644 index 000000000000..1e4744f8e748 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexHNSW.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Utility class for HNSW (Hierarchical Navigable Small World) index operations. + * + *
HNSW indexes build a graph structure for fast approximate nearest neighbor search. The key + * parameters are: + * + *
This controls the size of the dynamic candidate list during search. Higher values give + * more accurate results but slower search. + * + * @param index the HNSW index + * @return the current efSearch value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static int getEfSearch(Index index) { + return FaissNative.hnswGetEfSearch(index.nativeHandle()); + } + + /** + * Set the efSearch parameter. + * + *
This should be at least k (the number of neighbors requested in search). Typical values + * range from 16 to 256. Higher values give more accurate results but slower search. + * + * @param index the HNSW index + * @param efSearch the efSearch value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static void setEfSearch(Index index, int efSearch) { + if (efSearch <= 0) { + throw new IllegalArgumentException("efSearch must be positive: " + efSearch); + } + FaissNative.hnswSetEfSearch(index.nativeHandle(), efSearch); + } + + /** + * Get the efConstruction parameter. + * + *
This was the size of the dynamic candidate list during index construction. It cannot be + * changed after the index is built. + * + * @param index the HNSW index + * @return the efConstruction value + * @throws IllegalArgumentException if the index is not an HNSW index + */ + public static int getEfConstruction(Index index) { + return FaissNative.hnswGetEfConstruction(index.nativeHandle()); + } + + /** + * Set the efConstruction parameter. + * + *
This controls the size of the dynamic candidate list during construction. It must be set + * before adding any vectors to the index. Higher values give more accurate results but slower + * construction. Typical values range from 40 to 400. + * + * @param index the HNSW index + * @param efConstruction the efConstruction value + * @throws IllegalArgumentException if the index is not an HNSW index or efConstruction is not + * positive + */ + public static void setEfConstruction(Index index, int efConstruction) { + if (efConstruction <= 0) { + throw new IllegalArgumentException( + "efConstruction must be positive: " + efConstruction); + } + FaissNative.hnswSetEfConstruction(index.nativeHandle(), efConstruction); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java new file mode 100644 index 000000000000..7cf98dfbd0f1 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/IndexIVF.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** Utility class for IVF (Inverted File) index operations. */ +public final class IndexIVF { + + /** + * Get the number of clusters to probe during search (nprobe). + * + * @param index the IVF index + * @return the current nprobe value + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static int getNprobe(Index index) { + return FaissNative.ivfGetNprobe(index.nativeHandle()); + } + + /** + * Set the number of clusters to probe during search (nprobe). + * + *
Higher values increase accuracy but decrease search speed. A good starting point is 1-10% + * of the total number of clusters. + * + * @param index the IVF index + * @param nprobe the number of clusters to probe + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static void setNprobe(Index index, int nprobe) { + if (nprobe <= 0) { + throw new IllegalArgumentException("nprobe must be positive: " + nprobe); + } + FaissNative.ivfSetNprobe(index.nativeHandle(), nprobe); + } + + /** + * Get the total number of clusters (nlist) in the index. + * + * @param index the IVF index + * @return the number of clusters + * @throws IllegalArgumentException if the index is not an IVF index + */ + public static int getNlist(Index index) { + return FaissNative.ivfGetNlist(index.nativeHandle()); + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java new file mode 100644 index 000000000000..cd20bef07cbe --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/MetricType.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +/** + * Metric type for similarity search. + * + *
Faiss supports two main metric types for measuring similarity between vectors: + * + *
This class is responsible for loading the native Faiss library from the JAR file or system + * path. It follows a similar pattern to RocksDB's native library loading mechanism. + * + *
The loader attempts to load the library in the following order: + * + *
Order matters! Libraries must be loaded before the libraries that depend on them. + */ + private static final String[] DEPENDENCY_LIBRARIES = { + // GCC runtime libraries (must be loaded first as others depend on them) + "libgcc_s.so.1", + // Quadmath library (needed by gfortran) + "libquadmath.so.0", + // Fortran runtime (needed by OpenBLAS) - try multiple versions + "libgfortran.so.5", + "libgfortran.so.4", + "libgfortran.so.3", + // OpenMP runtime + "libgomp.so.1", + // BLAS/LAPACK + "libblas.so.3", + "liblapack.so.3", + // OpenBLAS for FAISS (load last as it depends on above) + "libopenblas.so.0", + }; + + /** Whether the native library has been loaded. */ + private static volatile boolean libraryLoaded = false; + + /** Lock for thread-safe library loading. */ + private static final Object LOAD_LOCK = new Object(); + + /** Temporary directory for extracting native libraries. */ + private static Path tempDir; + + private NativeLibraryLoader() { + // Utility class, no instantiation + } + + /** + * Load the native library. + * + * @throws FaissException if the library cannot be loaded + */ + public static void load() throws FaissException { + if (libraryLoaded) { + return; + } + + synchronized (LOAD_LOCK) { + if (libraryLoaded) { + return; + } + + try { + loadNativeLibrary(); + libraryLoaded = true; + LOG.info("Faiss native library loaded successfully"); + } catch (Exception e) { + throw new FaissException("Failed to load Faiss native library", e); + } + } + } + + /** + * Check if the native library has been loaded. + * + * @return true if the library is loaded + */ + public static boolean isLoaded() { + return libraryLoaded; + } + + private static void loadNativeLibrary() throws IOException { + // First, try loading from custom path + String customPath = System.getProperty(LIBRARY_PATH_PROPERTY); + if (customPath != null && !customPath.isEmpty()) { + File customLibrary = new File(customPath); + if (customLibrary.exists()) { + System.load(customLibrary.getAbsolutePath()); + LOG.info("Loaded Faiss native library from custom path: {}", customPath); + return; + } else { + LOG.warn("Custom library path specified but file not found: {}", customPath); + } + } + + // Second, try loading from system library path + try { + System.loadLibrary(JNI_LIBRARY_NAME); + LOG.info("Loaded Faiss native library from system path"); + return; + } catch (UnsatisfiedLinkError e) { + LOG.debug( + "Could not load from system path, trying bundled library: {}", e.getMessage()); + } + + // Third, try loading from JAR + loadFromJar(); + } + + private static void loadFromJar() throws IOException { + String libraryPath = getLibraryResourcePath(); + LOG.debug("Attempting to load native library from JAR: {}", libraryPath); + + try (InputStream is = NativeLibraryLoader.class.getResourceAsStream(libraryPath)) { + if (is == null) { + throw new IOException( + "Native library not found in JAR: " + + libraryPath + + ". " + + "Make sure you are using the correct JAR for your platform (" + + getPlatformIdentifier() + + ")"); + } + + // Create temp directory if needed + if (tempDir == null) { + tempDir = Files.createTempDirectory("paimon-faiss-native"); + tempDir.toFile().deleteOnExit(); + } + + // First, extract and load dependency libraries (if bundled) + loadDependencyLibraries(); + + // Extract native library to temp file + String fileName = System.mapLibraryName(JNI_LIBRARY_NAME); + File tempFile = new File(tempDir.toFile(), fileName); + tempFile.deleteOnExit(); + + try (OutputStream os = new FileOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = is.read(buffer)) != -1) { + os.write(buffer, 0, bytesRead); + } + } + + // Make the file executable (for Unix-like systems) + if (!tempFile.setExecutable(true)) { + LOG.warn("Could not set executable permission on native library"); + } + + // Load the library + System.load(tempFile.getAbsolutePath()); + LOG.info("Loaded Faiss native library from JAR: {}", libraryPath); + } + } + + /** + * Extract and load dependency libraries that are bundled in the JAR. These must be loaded + * before the main JNI library to satisfy its dynamic linking requirements. + */ + private static void loadDependencyLibraries() { + String os = getOsName(); + String arch = getArchName(); + + for (String depLib : DEPENDENCY_LIBRARIES) { + String resourcePath = "/" + os + "/" + arch + "/" + depLib; + try (InputStream is = NativeLibraryLoader.class.getResourceAsStream(resourcePath)) { + if (is == null) { + LOG.debug("Dependency library not bundled: {}", depLib); + continue; + } + + File tempFile = new File(tempDir.toFile(), depLib); + tempFile.deleteOnExit(); + + try (OutputStream fos = new FileOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = is.read(buffer)) != -1) { + fos.write(buffer, 0, bytesRead); + } + } + + if (!tempFile.setExecutable(true)) { + LOG.warn("Could not set executable permission on: {}", depLib); + } + + // Load the dependency library + System.load(tempFile.getAbsolutePath()); + LOG.info("Loaded bundled dependency library: {}", depLib); + } catch (UnsatisfiedLinkError e) { + // Library might already be loaded or not needed + LOG.debug("Could not load dependency {}: {}", depLib, e.getMessage()); + } catch (IOException e) { + LOG.debug("Could not extract dependency {}: {}", depLib, e.getMessage()); + } + } + } + + private static String getLibraryResourcePath() { + String os = getOsName(); + String arch = getArchName(); + String libraryFileName = System.mapLibraryName(JNI_LIBRARY_NAME); + return "/" + os + "/" + arch + "/" + libraryFileName; + } + + /** + * Get the platform identifier for the current system. + * + * @return platform identifier string (e.g., "linux/amd64", "darwin/aarch64") + */ + static String getPlatformIdentifier() { + return getOsName() + "/" + getArchName(); + } + + /** + * Get the normalized OS name for the current system. + * + * @return OS name string (e.g., "linux", "darwin") + */ + private static String getOsName() { + String osName = System.getProperty("os.name").toLowerCase(); + + if (osName.contains("linux")) { + return "linux"; + } else if (osName.contains("mac") || osName.contains("darwin")) { + return "darwin"; + } else { + throw new UnsupportedOperationException( + "Unsupported operating system: " + + osName + + ". Only Linux and macOS are supported."); + } + } + + /** + * Get the normalized architecture name for the current system. + * + * @return architecture name string (e.g., "amd64", "aarch64") + */ + private static String getArchName() { + String osArch = System.getProperty("os.arch").toLowerCase(); + + if (osArch.equals("amd64") || osArch.equals("x86_64")) { + return "amd64"; + } else if (osArch.equals("aarch64") || osArch.equals("arm64")) { + return "aarch64"; + } else { + throw new UnsupportedOperationException("Unsupported architecture: " + osArch); + } + } + + /** + * Get the name of the JNI library. + * + * @return the library name + */ + public static String getLibraryName() { + return JNI_LIBRARY_NAME; + } +} diff --git a/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java new file mode 100644 index 000000000000..8cbecf568fa2 --- /dev/null +++ b/paimon-faiss/paimon-faiss-jni/src/main/java/org/apache/paimon/faiss/RangeSearchResult.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.faiss; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.nio.LongBuffer; + +/** + * Result of a range search operation with zero-copy support. + * + *
Unlike k-NN search which returns a fixed number of neighbors per query, range search returns + * all neighbors within a given radius, which can vary per query. + * + *
This class uses direct ByteBuffers to avoid memory copies when retrieving results from native + * code. + */ +public class RangeSearchResult implements AutoCloseable { + + private long nativeHandle; + private final int numQueries; + private ByteBuffer limitsBuffer; + private ByteBuffer labelsBuffer; + private ByteBuffer distancesBuffer; + private long totalSize = -1; + + /** + * Create a new RangeSearchResult from a native handle. + * + * @param nativeHandle the native handle + * @param numQueries the number of query vectors + */ + RangeSearchResult(long nativeHandle, int numQueries) { + this.nativeHandle = nativeHandle; + this.numQueries = numQueries; + } + + /** + * Get the number of query vectors. + * + * @return the number of queries + */ + public int getNumQueries() { + return numQueries; + } + + /** + * Get the number of results for a specific query. + * + * @param queryIndex the query index + * @return the number of results + */ + public long getResultCount(int queryIndex) { + ensureLimitsLoaded(); + if (queryIndex < 0 || queryIndex >= numQueries) { + throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex); + } + LongBuffer limits = limitsBuffer.asLongBuffer(); + return limits.get(queryIndex + 1) - limits.get(queryIndex); + } + + /** + * Get the total number of results across all queries. + * + * @return the total number of results + */ + public long getTotalResultCount() { + if (totalSize < 0 && nativeHandle != 0) { + totalSize = FaissNative.rangeSearchResultGetTotalSize(nativeHandle); + } + return totalSize; + } + + /** + * Get the limits buffer containing the start/end indices for each query. + * + *
The limits buffer has (numQueries + 1) longs. For query i, results are in the range
+ * [limits[i], limits[i+1]).
+ *
+ * @return the limits buffer as a LongBuffer view
+ */
+ public LongBuffer getLimitsBuffer() {
+ ensureLimitsLoaded();
+ limitsBuffer.rewind();
+ return limitsBuffer.asLongBuffer();
+ }
+
+ /**
+ * Get the labels buffer containing all result labels.
+ *
+ * @return the labels buffer as a LongBuffer view
+ */
+ public LongBuffer getLabelsBuffer() {
+ ensureFullyLoaded();
+ labelsBuffer.rewind();
+ return labelsBuffer.asLongBuffer();
+ }
+
+ /**
+ * Get the distances buffer containing all result distances.
+ *
+ * @return the distances buffer as a FloatBuffer view
+ */
+ public FloatBuffer getDistancesBuffer() {
+ ensureFullyLoaded();
+ distancesBuffer.rewind();
+ return distancesBuffer.asFloatBuffer();
+ }
+
+ /**
+ * Get the labels for a specific query.
+ *
+ * @param queryIndex the query index
+ * @return the labels for this query
+ */
+ public long[] getLabelsForQuery(int queryIndex) {
+ ensureFullyLoaded();
+ if (queryIndex < 0 || queryIndex >= numQueries) {
+ throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex);
+ }
+ LongBuffer limits = limitsBuffer.asLongBuffer();
+ int start = (int) limits.get(queryIndex);
+ int end = (int) limits.get(queryIndex + 1);
+ int count = end - start;
+
+ long[] result = new long[count];
+ LongBuffer labels = labelsBuffer.asLongBuffer();
+ labels.position(start);
+ labels.get(result);
+ return result;
+ }
+
+ /**
+ * Get the distances for a specific query.
+ *
+ * @param queryIndex the query index
+ * @return the distances for this query
+ */
+ public float[] getDistancesForQuery(int queryIndex) {
+ ensureFullyLoaded();
+ if (queryIndex < 0 || queryIndex >= numQueries) {
+ throw new IndexOutOfBoundsException("Query index out of bounds: " + queryIndex);
+ }
+ LongBuffer limits = limitsBuffer.asLongBuffer();
+ int start = (int) limits.get(queryIndex);
+ int end = (int) limits.get(queryIndex + 1);
+ int count = end - start;
+
+ float[] result = new float[count];
+ FloatBuffer distances = distancesBuffer.asFloatBuffer();
+ distances.position(start);
+ distances.get(result);
+ return result;
+ }
+
+ private void ensureLimitsLoaded() {
+ if (limitsBuffer == null && nativeHandle != 0) {
+ limitsBuffer =
+ ByteBuffer.allocateDirect((numQueries + 1) * Long.BYTES)
+ .order(ByteOrder.nativeOrder());
+ FaissNative.rangeSearchResultGetLimits(nativeHandle, limitsBuffer);
+ }
+ }
+
+ private void ensureFullyLoaded() {
+ ensureLimitsLoaded();
+ if (labelsBuffer == null && nativeHandle != 0) {
+ long total = getTotalResultCount();
+ labelsBuffer =
+ ByteBuffer.allocateDirect((int) total * Long.BYTES)
+ .order(ByteOrder.nativeOrder());
+ distancesBuffer =
+ ByteBuffer.allocateDirect((int) total * Float.BYTES)
+ .order(ByteOrder.nativeOrder());
+ FaissNative.rangeSearchResultGetLabels(nativeHandle, labelsBuffer);
+ FaissNative.rangeSearchResultGetDistances(nativeHandle, distancesBuffer);
+ }
+ }
+
+ @Override
+ public void close() {
+ if (nativeHandle != 0) {
+ FaissNative.rangeSearchResultDestroy(nativeHandle);
+ nativeHandle = 0;
+ }
+ }
+}
diff --git a/paimon-faiss/paimon-faiss-jni/src/main/native/CMakeLists.txt b/paimon-faiss/paimon-faiss-jni/src/main/native/CMakeLists.txt
new file mode 100644
index 000000000000..b7f3c8713975
--- /dev/null
+++ b/paimon-faiss/paimon-faiss-jni/src/main/native/CMakeLists.txt
@@ -0,0 +1,457 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.30.1)
+project(paimon_faiss_jni VERSION 0.1.0 LANGUAGES CXX)
+
+# FAISS version
+set(FAISS_VERSION "1.7.4" CACHE STRING "FAISS library version")
+add_definitions(-DFAISS_VERSION="${FAISS_VERSION}")
+
+# Check GCC version (must be >= 9.3.0)
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.3.0")
+ message(FATAL_ERROR "GCC version must be >= 9.3.0. Found: ${CMAKE_CXX_COMPILER_VERSION}")
+ endif()
+ message(STATUS "Using GCC ${CMAKE_CXX_COMPILER_VERSION}")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Options
+option(FAISS_ENABLE_GPU "Build with GPU support" OFF)
+option(FAISS_OPT_LEVEL "Optimization level (generic, avx2, avx512)" "generic")
+option(BUILD_FAT_LIB "Build fat library with all dependencies statically linked" ON)
+
+# Find JNI
+find_package(JNI REQUIRED)
+include_directories(${JNI_INCLUDE_DIRS})
+
+# Find OpenMP (with special handling for macOS)
+if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ # macOS requires special handling for OpenMP
+ # First try to find libomp from Homebrew
+ execute_process(
+ COMMAND brew --prefix libomp
+ OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_QUIET
+ )
+
+ if(HOMEBREW_LIBOMP_PREFIX)
+ message(STATUS "Found Homebrew libomp: ${HOMEBREW_LIBOMP_PREFIX}")
+ set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include")
+ set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_LIBOMP_PREFIX}/include")
+ set(OpenMP_C_LIB_NAMES "omp")
+ set(OpenMP_CXX_LIB_NAMES "omp")
+ set(OpenMP_omp_LIBRARY "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib")
+
+ # Create imported target manually
+ if(NOT TARGET OpenMP::OpenMP_CXX)
+ add_library(OpenMP::OpenMP_CXX SHARED IMPORTED)
+ set_target_properties(OpenMP::OpenMP_CXX PROPERTIES
+ IMPORTED_LOCATION "${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib"
+ INTERFACE_INCLUDE_DIRECTORIES "${HOMEBREW_LIBOMP_PREFIX}/include"
+ INTERFACE_COMPILE_OPTIONS "-Xpreprocessor;-fopenmp"
+ )
+ endif()
+ set(OpenMP_FOUND TRUE)
+ else()
+ message(WARNING "libomp not found via Homebrew. Trying standard OpenMP detection...")
+ find_package(OpenMP)
+ endif()
+else()
+ find_package(OpenMP REQUIRED)
+endif()
+
+if(NOT OpenMP_FOUND AND NOT TARGET OpenMP::OpenMP_CXX)
+ message(WARNING "OpenMP not found. Building without OpenMP support.")
+ message(WARNING "On macOS, install libomp: brew install libomp")
+endif()
+
+# Find Faiss
+# For fat lib, prefer static libraries
+if(BUILD_FAT_LIB)
+ message(STATUS "Building fat library - preferring static libraries")
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a" ".so" ".dylib")
+ set(FAISS_STATIC_PREFERRED TRUE)
+else()
+ set(FAISS_STATIC_PREFERRED FALSE)
+endif()
+
+# First try to find Faiss via CMake config
+find_package(faiss CONFIG QUIET)
+
+if(NOT faiss_FOUND)
+ # Try pkg-config
+ find_package(PkgConfig QUIET)
+ if(PKG_CONFIG_FOUND)
+ pkg_check_modules(FAISS QUIET faiss)
+ endif()
+
+ if(NOT FAISS_FOUND)
+ # Manual search - look in common locations
+ find_path(FAISS_INCLUDE_DIR
+ NAMES faiss/Index.h
+ PATHS
+ /usr/local/include
+ /usr/include
+ ${FAISS_ROOT}/include
+ $ENV{FAISS_ROOT}/include
+ )
+
+ # For fat lib, try to find static library first
+ if(BUILD_FAT_LIB)
+ find_library(FAISS_LIBRARY_STATIC
+ NAMES libfaiss.a faiss_static
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ ${FAISS_ROOT}/lib
+ ${FAISS_ROOT}/lib64
+ $ENV{FAISS_ROOT}/lib
+ $ENV{FAISS_ROOT}/lib64
+ )
+ if(FAISS_LIBRARY_STATIC)
+ set(FAISS_LIBRARY ${FAISS_LIBRARY_STATIC})
+ message(STATUS "Found Faiss static library: ${FAISS_LIBRARY}")
+ endif()
+ endif()
+
+ # If static not found or not building fat lib, find any library
+ if(NOT FAISS_LIBRARY)
+ find_library(FAISS_LIBRARY
+ NAMES faiss
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ ${FAISS_ROOT}/lib
+ ${FAISS_ROOT}/lib64
+ $ENV{FAISS_ROOT}/lib
+ $ENV{FAISS_ROOT}/lib64
+ )
+ endif()
+
+ if(FAISS_INCLUDE_DIR AND FAISS_LIBRARY)
+ set(FAISS_FOUND TRUE)
+ set(FAISS_INCLUDE_DIRS ${FAISS_INCLUDE_DIR})
+ set(FAISS_LIBRARIES ${FAISS_LIBRARY})
+ message(STATUS "Found Faiss: ${FAISS_LIBRARY}")
+ else()
+ message(FATAL_ERROR "Faiss not found. Please install Faiss or set FAISS_ROOT environment variable.")
+ endif()
+ endif()
+endif()
+
+# Find BLAS/LAPACK for static linking (Faiss depends on them)
+if(BUILD_FAT_LIB)
+ # Save original suffixes
+ set(_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
+
+ # Force static library search only
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+
+ # Try to find OpenBLAS static library
+ find_library(OPENBLAS_STATIC_LIBRARY
+ NAMES openblas openblas_static
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ /usr/lib/x86_64-linux-gnu
+ /usr/lib/aarch64-linux-gnu
+ ${OPENBLAS_ROOT}/lib
+ $ENV{OPENBLAS_ROOT}/lib
+ NO_DEFAULT_PATH
+ )
+
+ # Also try default paths
+ if(NOT OPENBLAS_STATIC_LIBRARY)
+ find_library(OPENBLAS_STATIC_LIBRARY
+ NAMES openblas openblas_static
+ )
+ endif()
+
+ if(OPENBLAS_STATIC_LIBRARY AND OPENBLAS_STATIC_LIBRARY MATCHES "\\.a$")
+ message(STATUS "Found OpenBLAS static library: ${OPENBLAS_STATIC_LIBRARY}")
+ set(OPENBLAS_USE_STATIC TRUE)
+ list(APPEND FAISS_STATIC_LIBS ${OPENBLAS_STATIC_LIBRARY})
+ else()
+ message(STATUS "OpenBLAS static library not found, trying shared library")
+ set(OPENBLAS_USE_STATIC FALSE)
+
+ # Restore suffixes and find shared library
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
+ find_library(OPENBLAS_SHARED_LIBRARY
+ NAMES openblas
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ /usr/lib/x86_64-linux-gnu
+ /usr/lib/aarch64-linux-gnu
+ ${OPENBLAS_ROOT}/lib
+ $ENV{OPENBLAS_ROOT}/lib
+ )
+ if(OPENBLAS_SHARED_LIBRARY)
+ message(STATUS "Found OpenBLAS shared library: ${OPENBLAS_SHARED_LIBRARY}")
+ list(APPEND FAISS_EXTRA_LIBS ${OPENBLAS_SHARED_LIBRARY})
+ # Mark that we need to bundle this library
+ set(BUNDLE_OPENBLAS TRUE)
+ set(BUNDLE_OPENBLAS_PATH ${OPENBLAS_SHARED_LIBRARY})
+ else()
+ # Try to find any BLAS
+ find_package(BLAS QUIET)
+ if(BLAS_FOUND)
+ list(APPEND FAISS_EXTRA_LIBS ${BLAS_LIBRARIES})
+ message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
+ endif()
+ endif()
+ endif()
+
+ # Restore suffixes for static search
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+
+ # Find LAPACK static library
+ find_library(LAPACK_STATIC_LIBRARY
+ NAMES lapack
+ PATHS
+ /usr/local/lib
+ /usr/lib
+ /usr/local/lib64
+ /usr/lib64
+ /usr/lib/x86_64-linux-gnu
+ /usr/lib/aarch64-linux-gnu
+ )
+ if(LAPACK_STATIC_LIBRARY AND LAPACK_STATIC_LIBRARY MATCHES "\\.a$")
+ message(STATUS "Found LAPACK static library: ${LAPACK_STATIC_LIBRARY}")
+ list(APPEND FAISS_STATIC_LIBS ${LAPACK_STATIC_LIBRARY})
+ endif()
+
+ # Find gfortran static library (needed by OpenBLAS)
+ find_library(GFORTRAN_STATIC_LIBRARY
+ NAMES gfortran
+ )
+ if(GFORTRAN_STATIC_LIBRARY AND GFORTRAN_STATIC_LIBRARY MATCHES "\\.a$")
+ message(STATUS "Found gfortran static library: ${GFORTRAN_STATIC_LIBRARY}")
+ list(APPEND FAISS_STATIC_LIBS ${GFORTRAN_STATIC_LIBRARY})
+ endif()
+
+ # Restore original suffixes
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
+
+ # On Linux, we may need pthread, dl, and m (these are typically dynamically linked)
+ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ list(APPEND FAISS_EXTRA_LIBS pthread dl m)
+
+ # Try to find gfortran shared if static not found
+ if(NOT GFORTRAN_STATIC_LIBRARY)
+ find_library(GFORTRAN_LIBRARY gfortran)
+ if(GFORTRAN_LIBRARY)
+ list(APPEND FAISS_EXTRA_LIBS ${GFORTRAN_LIBRARY})
+ endif()
+ endif()
+ endif()
+endif()
+
+# Platform detection - using {os}/{arch} directory structure
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ set(PLATFORM_OS "linux")
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+ set(PLATFORM_ARCH "amd64")
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+ set(PLATFORM_ARCH "aarch64")
+ else()
+ message(FATAL_ERROR "Unsupported Linux architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+ endif()
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ set(PLATFORM_OS "darwin")
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+ set(PLATFORM_ARCH "amd64")
+ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64")
+ set(PLATFORM_ARCH "aarch64")
+ else()
+ message(FATAL_ERROR "Unsupported macOS architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+ endif()
+else()
+ message(FATAL_ERROR "Unsupported operating system: ${CMAKE_SYSTEM_NAME}. Only Linux and macOS are supported.")
+endif()
+
+set(PLATFORM_DIR "${PLATFORM_OS}/${PLATFORM_ARCH}")
+message(STATUS "Building for platform: ${PLATFORM_DIR}")
+
+# Build the JNI library
+add_library(paimon_faiss_jni SHARED
+ paimon_faiss_jni.cpp
+)
+
+# Include directories
+if(TARGET faiss)
+ target_link_libraries(paimon_faiss_jni PRIVATE faiss)
+else()
+ target_include_directories(paimon_faiss_jni PRIVATE ${FAISS_INCLUDE_DIRS})
+ target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_LIBRARIES})
+endif()
+
+# Link extra libraries for fat lib (BLAS, LAPACK, etc.)
+if(BUILD_FAT_LIB)
+ # Link static libraries with --whole-archive to embed all symbols
+ if(FAISS_STATIC_LIBS AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ message(STATUS "Linking static libraries with --whole-archive: ${FAISS_STATIC_LIBS}")
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-Wl,--whole-archive"
+ )
+ target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_STATIC_LIBS})
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-Wl,--no-whole-archive"
+ )
+ elseif(FAISS_STATIC_LIBS)
+ # macOS doesn't use --whole-archive, use -force_load instead
+ foreach(static_lib ${FAISS_STATIC_LIBS})
+ target_link_options(paimon_faiss_jni PRIVATE "-Wl,-force_load,${static_lib}")
+ endforeach()
+ message(STATUS "Linking static libraries with -force_load: ${FAISS_STATIC_LIBS}")
+ endif()
+
+ # Link remaining shared libraries
+ if(FAISS_EXTRA_LIBS)
+ target_link_libraries(paimon_faiss_jni PRIVATE ${FAISS_EXTRA_LIBS})
+ message(STATUS "Linking extra libraries: ${FAISS_EXTRA_LIBS}")
+ endif()
+endif()
+
+# Link OpenMP - always use dynamic linking for OpenMP (static libgomp.a often lacks -fPIC)
+if(TARGET OpenMP::OpenMP_CXX)
+ target_link_libraries(paimon_faiss_jni PRIVATE OpenMP::OpenMP_CXX)
+ message(STATUS "Linking OpenMP via imported target")
+elseif(OpenMP_FOUND)
+ target_compile_options(paimon_faiss_jni PRIVATE ${OpenMP_CXX_FLAGS})
+ # Link against the shared gomp library
+ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ find_library(GOMP_SHARED_LIBRARY NAMES gomp PATHS /usr/lib /usr/lib64 /usr/lib/x86_64-linux-gnu)
+ if(GOMP_SHARED_LIBRARY)
+ target_link_libraries(paimon_faiss_jni PRIVATE ${GOMP_SHARED_LIBRARY})
+ message(STATUS "Linking OpenMP shared library: ${GOMP_SHARED_LIBRARY}")
+ else()
+ target_link_libraries(paimon_faiss_jni PRIVATE gomp)
+ message(STATUS "Linking OpenMP: gomp")
+ endif()
+ else()
+ target_link_libraries(paimon_faiss_jni PRIVATE ${OpenMP_CXX_FLAGS})
+ endif()
+endif()
+
+# Platform-specific settings
+if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ # macOS specific settings
+ set_target_properties(paimon_faiss_jni PROPERTIES
+ SUFFIX ".dylib"
+ INSTALL_NAME_DIR "@rpath"
+ BUILD_WITH_INSTALL_RPATH TRUE
+ )
+
+ # Link against libc++
+ target_link_libraries(paimon_faiss_jni PRIVATE c++)
+
+ # For fat lib on macOS, embed OpenMP library path
+ if(BUILD_FAT_LIB AND HOMEBREW_LIBOMP_PREFIX)
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-Wl,-rpath,@loader_path"
+ "-Wl,-rpath,${HOMEBREW_LIBOMP_PREFIX}/lib"
+ )
+ endif()
+
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ # Linux specific settings
+ set_target_properties(paimon_faiss_jni PROPERTIES
+ SUFFIX ".so"
+ )
+
+ if(BUILD_FAT_LIB)
+ # For fat lib, use static libstdc++ and libgcc
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-static-libstdc++"
+ "-static-libgcc"
+ "-Wl,--exclude-libs,ALL"
+ )
+ message(STATUS "Using static libstdc++ and libgcc for fat lib")
+ else()
+ target_link_libraries(paimon_faiss_jni PRIVATE stdc++)
+ endif()
+
+endif()
+
+# Set output directory - output to src/main/resources/{os}/{arch}/
+set(OUTPUT_DIR "${CMAKE_SOURCE_DIR}/../resources/${PLATFORM_DIR}")
+set_target_properties(paimon_faiss_jni PROPERTIES
+ LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_DIR}
+ RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_DIR}
+)
+
+# Optimization level
+if(FAISS_OPT_LEVEL STREQUAL "avx2")
+ target_compile_options(paimon_faiss_jni PRIVATE -mavx2 -mfma)
+ message(STATUS "Building with AVX2 optimizations")
+elseif(FAISS_OPT_LEVEL STREQUAL "avx512")
+ target_compile_options(paimon_faiss_jni PRIVATE -mavx512f -mavx512dq -mavx512bw -mavx512vl)
+ message(STATUS "Building with AVX-512 optimizations")
+else()
+ message(STATUS "Building with generic optimizations")
+endif()
+
+# Copy bundled shared libraries to output directory and set rpath
+if(BUILD_FAT_LIB AND BUNDLE_OPENBLAS AND BUNDLE_OPENBLAS_PATH)
+ message(STATUS "Will bundle OpenBLAS shared library: ${BUNDLE_OPENBLAS_PATH}")
+
+ # Get the actual library file (resolve symlinks)
+ get_filename_component(OPENBLAS_REALPATH ${BUNDLE_OPENBLAS_PATH} REALPATH)
+ get_filename_component(OPENBLAS_FILENAME ${OPENBLAS_REALPATH} NAME)
+
+ # Copy OpenBLAS to output directory after build
+ add_custom_command(TARGET paimon_faiss_jni POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different
+ ${OPENBLAS_REALPATH}
+ ${OUTPUT_DIR}/libopenblas.so.0
+ COMMENT "Bundling OpenBLAS shared library"
+ )
+
+ # Set rpath to look in the same directory as the library
+ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ target_link_options(paimon_faiss_jni PRIVATE
+ "-Wl,-rpath,$ORIGIN"
+ )
+ # Also patch the library after build to use the bundled libopenblas
+ add_custom_command(TARGET paimon_faiss_jni POST_BUILD
+ COMMAND patchelf --set-rpath "$$ORIGIN" ${OUTPUT_DIR}/libpaimon_faiss_jni.so || true
+ COMMENT "Setting rpath to $ORIGIN"
+ )
+ endif()
+endif()
+
+# Install target
+install(TARGETS paimon_faiss_jni
+ LIBRARY DESTINATION ${PLATFORM_DIR}
+ RUNTIME DESTINATION ${PLATFORM_DIR}
+)
+
diff --git a/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp b/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp
new file mode 100644
index 000000000000..72e8cc66808e
--- /dev/null
+++ b/paimon-faiss/paimon-faiss-jni/src/main/native/paimon_faiss_jni.cpp
@@ -0,0 +1,475 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon_faiss_jni.h"
+
+#include Note: These tests require the native library to be built and available. They will be skipped
+ * if the native library is not found.
+ */
+class IndexTest {
+
+ private static final int DIMENSION = 128;
+ private static final int NUM_VECTORS = 1000;
+ private static final int K = 10;
+
+ @Test
+ void testFlatIndexBasicOperations() {
+ try (Index index = createFlatIndexWithMetric(MetricType.L2)) {
+ assertEquals(DIMENSION, index.getDimension());
+ assertEquals(0, index.getCount());
+ assertTrue(index.isTrained());
+ assertEquals(MetricType.L2, index.getMetricType());
+
+ // Add vectors using zero-copy API
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ index.add(NUM_VECTORS, vectorBuffer);
+ assertEquals(NUM_VECTORS, index.getCount());
+
+ // Search using array API
+ float[] queryVectors = createQueryVectors(1, DIMENSION);
+ float[] distances = new float[K];
+ long[] labels = new long[K];
+
+ index.search(1, queryVectors, K, distances, labels);
+
+ // Verify labels are in valid range
+ for (int i = 0; i < K; i++) {
+ assertTrue(
+ labels[i] >= 0 && labels[i] < NUM_VECTORS,
+ "Label " + labels[i] + " out of range");
+ }
+
+ // Verify distances are non-negative for L2
+ for (int i = 0; i < K; i++) {
+ assertTrue(distances[i] >= 0, "Distance should be non-negative for L2");
+ }
+ }
+ }
+
+ @Test
+ void testFlatIndexWithIds() {
+ try (Index index = IndexFactory.create(DIMENSION, "IDMap,Flat", MetricType.L2)) {
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ ByteBuffer idBuffer = Index.allocateIdBuffer(NUM_VECTORS);
+ idBuffer.asLongBuffer();
+ for (int i = 0; i < NUM_VECTORS; i++) {
+ idBuffer.putLong(i * Long.BYTES, i * 100L); // Use custom IDs
+ }
+
+ index.addWithIds(NUM_VECTORS, vectorBuffer, idBuffer);
+ assertEquals(NUM_VECTORS, index.getCount());
+
+ // Search should return our custom IDs
+ float[] queryVectors = createQueryVectors(1, DIMENSION);
+ float[] distances = new float[K];
+ long[] labels = new long[K];
+
+ index.search(1, queryVectors, K, distances, labels);
+
+ for (int i = 0; i < K; i++) {
+ assertTrue(labels[i] % 100 == 0, "Label should be a multiple of 100");
+ }
+ }
+ }
+
+ @Test
+ void testBatchSearch() {
+ try (Index index = createFlatIndexWithMetric(MetricType.L2)) {
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ index.add(NUM_VECTORS, vectorBuffer);
+
+ int numQueries = 5;
+ float[] queryVectors = createQueryVectors(numQueries, DIMENSION);
+ float[] distances = new float[numQueries * K];
+ long[] labels = new long[numQueries * K];
+
+ index.search(numQueries, queryVectors, K, distances, labels);
+
+ // Read results for each query
+ for (int q = 0; q < numQueries; q++) {
+ for (int n = 0; n < K; n++) {
+ int idx = q * K + n;
+ assertTrue(labels[idx] >= 0 && labels[idx] < NUM_VECTORS);
+ assertTrue(distances[idx] >= 0);
+ }
+ }
+ }
+ }
+
+ @Test
+ void testInnerProductMetric() {
+ try (Index index = createFlatIndexWithMetric(MetricType.INNER_PRODUCT)) {
+ assertEquals(MetricType.INNER_PRODUCT, index.getMetricType());
+
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ index.add(NUM_VECTORS, vectorBuffer);
+
+ float[] queryVectors = createQueryVectors(1, DIMENSION);
+ float[] distances = new float[K];
+ long[] labels = new long[K];
+
+ index.search(1, queryVectors, K, distances, labels);
+
+ // For inner product, higher is better, so first result should have highest score
+ for (int i = 1; i < K; i++) {
+ assertTrue(
+ distances[i - 1] >= distances[i],
+ "Distances should be sorted in descending order for inner product");
+ }
+ }
+ }
+
+ @Test
+ void testIndexReset() {
+ try (Index index = createFlatIndex()) {
+ ByteBuffer vectorBuffer = createVectorBuffer(100, DIMENSION);
+ index.add(100, vectorBuffer);
+ assertEquals(100, index.getCount());
+
+ index.reset();
+ assertEquals(0, index.getCount());
+
+ // Can add again after reset
+ index.add(100, vectorBuffer);
+ assertEquals(100, index.getCount());
+ }
+ }
+
+ @Test
+ void testIndexSerialization(@TempDir Path tempDir) {
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ float[] queryVectors = createQueryVectors(1, DIMENSION);
+ long[] originalLabels = new long[K];
+ float[] originalDistances = new float[K];
+
+ // Create and populate index
+ try (Index index = createFlatIndex()) {
+ index.add(NUM_VECTORS, vectorBuffer);
+
+ // Perform search and save original results
+ index.search(1, queryVectors, K, originalDistances, originalLabels);
+
+ // Test file I/O
+ File indexFile = tempDir.resolve("test.index").toFile();
+ index.writeToFile(indexFile);
+
+ try (Index loadedIndex = Index.readFromFile(indexFile)) {
+ assertEquals(DIMENSION, loadedIndex.getDimension());
+ assertEquals(NUM_VECTORS, loadedIndex.getCount());
+
+ float[] loadedDistances = new float[K];
+ long[] loadedLabels = new long[K];
+ loadedIndex.search(1, queryVectors, K, loadedDistances, loadedLabels);
+
+ assertArrayEquals(originalLabels, loadedLabels);
+ }
+ }
+
+ // Test ByteBuffer serialization (zero-copy for write) and byte[] deserialization
+ try (Index index = createFlatIndex()) {
+ index.add(NUM_VECTORS, vectorBuffer);
+
+ long serializeSize = index.serializeSize();
+ assertTrue(serializeSize > 0);
+
+ ByteBuffer serialized =
+ ByteBuffer.allocateDirect((int) serializeSize).order(ByteOrder.nativeOrder());
+ long bytesWritten = index.serialize(serialized);
+ assertEquals(serializeSize, bytesWritten);
+
+ // Convert to byte array for deserialization
+ serialized.rewind();
+ byte[] serializedBytes = new byte[(int) bytesWritten];
+ serialized.get(serializedBytes);
+
+ try (Index deserializedIndex = Index.deserialize(serializedBytes)) {
+ assertEquals(DIMENSION, deserializedIndex.getDimension());
+ assertEquals(NUM_VECTORS, deserializedIndex.getCount());
+
+ float[] deserializedDistances = new float[K];
+ long[] deserializedLabels = new long[K];
+ deserializedIndex.search(
+ 1, queryVectors, K, deserializedDistances, deserializedLabels);
+
+ assertArrayEquals(originalLabels, deserializedLabels);
+ }
+ }
+ }
+
+ @Test
+ void testIndexFactoryDescriptions() {
+ // Test various index factory strings
+ String[] descriptions = {"Flat", "IDMap,Flat", "HNSW32", "HNSW32,Flat"};
+
+ for (String desc : descriptions) {
+ try (Index index = IndexFactory.create(DIMENSION, desc, MetricType.L2)) {
+ assertEquals(DIMENSION, index.getDimension());
+ assertNotNull(index.toString());
+ }
+ }
+ }
+
+ @Test
+ void testHNSWIndex() {
+ try (Index index = IndexFactory.create(DIMENSION, "HNSW" + 32, MetricType.L2)) {
+ assertTrue(index.isTrained()); // HNSW doesn't need training
+
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ index.add(NUM_VECTORS, vectorBuffer);
+
+ // Get and set efSearch
+ int efSearch = IndexHNSW.getEfSearch(index);
+ assertTrue(efSearch > 0);
+
+ IndexHNSW.setEfSearch(index, 64);
+ assertEquals(64, IndexHNSW.getEfSearch(index));
+
+ // Search
+ float[] queryVectors = createQueryVectors(1, DIMENSION);
+ float[] distances = new float[K];
+ long[] labels = new long[K];
+
+ index.search(1, queryVectors, K, distances, labels);
+
+ for (int i = 0; i < K; i++) {
+ assertTrue(labels[i] >= 0);
+ }
+ }
+ }
+
+ @Test
+ void testIVFSQ8Index() {
+ // IVF16384,SQ8 is a quantized index that needs training
+ try (Index index = IndexFactory.create(DIMENSION, "IVF16384,SQ8", MetricType.L2)) {
+ assertEquals(DIMENSION, index.getDimension());
+ assertEquals(MetricType.L2, index.getMetricType());
+
+ // IVF index needs training
+ assertTrue(!index.isTrained(), "IVF index should not be trained initially");
+
+ // Train the index with training vectors
+ int numTrainingVectors = 20000; // Should be >= nlist (16384) for good training
+ ByteBuffer trainingBuffer = createVectorBuffer(numTrainingVectors, DIMENSION);
+ index.train(numTrainingVectors, trainingBuffer);
+
+ assertTrue(index.isTrained(), "Index should be trained after training");
+
+ // Add vectors after training
+ ByteBuffer vectorBuffer = createVectorBuffer(NUM_VECTORS, DIMENSION);
+ index.add(NUM_VECTORS, vectorBuffer);
+ assertEquals(NUM_VECTORS, index.getCount());
+
+ // Set nprobe for search (number of clusters to visit)
+ IndexIVF.setNprobe(index, 64);
+ assertEquals(64, IndexIVF.getNprobe(index));
+
+ // Search
+ float[] queryVectors = createQueryVectors(1, DIMENSION);
+ float[] distances = new float[K];
+ long[] labels = new long[K];
+
+ index.search(1, queryVectors, K, distances, labels);
+
+ // Verify search results
+ for (int i = 0; i < K; i++) {
+ assertTrue(
+ labels[i] >= 0 && labels[i] < NUM_VECTORS,
+ "Label " + labels[i] + " out of range");
+ assertTrue(distances[i] >= 0, "Distance should be non-negative for L2");
+ }
+
+ // Test batch search
+ int numQueries = 3;
+ float[] batchQueryVectors = createQueryVectors(numQueries, DIMENSION);
+ float[] batchDistances = new float[numQueries * K];
+ long[] batchLabels = new long[numQueries * K];
+
+ index.search(numQueries, batchQueryVectors, K, batchDistances, batchLabels);
+
+ for (int q = 0; q < numQueries; q++) {
+ for (int n = 0; n < K; n++) {
+ int idx = q * K + n;
+ assertTrue(batchLabels[idx] >= 0 && batchLabels[idx] < NUM_VECTORS);
+ assertTrue(batchDistances[idx] >= 0);
+ }
+ }
+ }
+ }
+
+ @Test
+ void testErrorHandling() {
+ // Test invalid dimension
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> {
+ IndexFactory.create(0, "Flat", MetricType.L2);
+ });
+
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> {
+ IndexFactory.create(-1, "Flat", MetricType.L2);
+ });
+
+ // Test null description
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> {
+ IndexFactory.create(DIMENSION, null, MetricType.L2);
+ });
+
+ // Test buffer validation - wrong size buffer
+ try (Index index = createFlatIndex()) {
+ ByteBuffer wrongSizeBuffer =
+ ByteBuffer.allocateDirect(10).order(ByteOrder.nativeOrder());
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> {
+ index.add(1, wrongSizeBuffer); // Buffer too small for 1 vector
+ });
+ }
+
+ // Test non-direct buffer
+ try (Index index = createFlatIndex()) {
+ ByteBuffer heapBuffer = ByteBuffer.allocate(DIMENSION * Float.BYTES);
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> {
+ index.add(1, heapBuffer); // Not a direct buffer
+ });
+ }
+
+ // Test closed index
+ Index closedIndex = createFlatIndex();
+ closedIndex.close();
+ assertThrows(
+ IllegalStateException.class,
+ () -> {
+ closedIndex.getCount();
+ });
+ }
+
+ @Test
+ void testSearchResultArrays() {
+ try (Index index = createFlatIndex()) {
+ ByteBuffer vectorBuffer = createVectorBuffer(100, DIMENSION);
+ index.add(100, vectorBuffer);
+
+ int numQueries = 3;
+ int k = 5;
+ float[] queryVectors = createQueryVectors(numQueries, DIMENSION);
+ float[] distances = new float[numQueries * k];
+ long[] labels = new long[numQueries * k];
+
+ index.search(numQueries, queryVectors, k, distances, labels);
+
+ // Test reading individual results
+ for (int q = 0; q < numQueries; q++) {
+ for (int n = 0; n < k; n++) {
+ int idx = q * k + n;
+ assertTrue(labels[idx] >= 0 && labels[idx] < 100);
+ assertTrue(distances[idx] >= 0);
+ }
+ }
+ }
+ }
+
+ @Test
+ void testBufferAllocationHelpers() {
+ // Test vector buffer allocation
+ ByteBuffer vectorBuffer = Index.allocateVectorBuffer(10, DIMENSION);
+ assertTrue(vectorBuffer.isDirect());
+ assertEquals(ByteOrder.nativeOrder(), vectorBuffer.order());
+ assertEquals(10 * DIMENSION * Float.BYTES, vectorBuffer.capacity());
+
+ // Test ID buffer allocation
+ ByteBuffer idBuffer = Index.allocateIdBuffer(10);
+ assertTrue(idBuffer.isDirect());
+ assertEquals(ByteOrder.nativeOrder(), idBuffer.order());
+ assertEquals(10 * Long.BYTES, idBuffer.capacity());
+ }
+
+ private Index createFlatIndexWithMetric(MetricType metricType) {
+ return IndexFactory.create(DIMENSION, "Flat", metricType);
+ }
+
+ private Index createFlatIndex() {
+ return IndexFactory.create(DIMENSION, "Flat", MetricType.L2);
+ }
+
+ /** Create a direct ByteBuffer with random vectors. */
+ private ByteBuffer createVectorBuffer(int n, int d) {
+ ByteBuffer buffer = Index.allocateVectorBuffer(n, d);
+ FloatBuffer floatView = buffer.asFloatBuffer();
+
+ Random random = new Random(42);
+ for (int i = 0; i < n * d; i++) {
+ floatView.put(i, random.nextFloat());
+ }
+
+ return buffer;
+ }
+
+ /** Create a float array with random query vectors. */
+ private float[] createQueryVectors(int n, int d) {
+ float[] vectors = new float[n * d];
+ Random random = new Random(42);
+ for (int i = 0; i < n * d; i++) {
+ vectors[i] = random.nextFloat();
+ }
+ return vectors;
+ }
+}
diff --git a/paimon-faiss/pom.xml b/paimon-faiss/pom.xml
new file mode 100644
index 000000000000..43e22504a6e2
--- /dev/null
+++ b/paimon-faiss/pom.xml
@@ -0,0 +1,38 @@
+
+
+