From dbe7cbee32b8ac1d2d90ddb094e029fae4321540 Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 6 Nov 2025 12:52:56 +0530
Subject: [PATCH 1/3] Expose API to get free GPU memory

---
 c_api/gpu/DeviceUtils_c.cpp | 9 +++++++++
 c_api/gpu/DeviceUtils_c.h   | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/c_api/gpu/DeviceUtils_c.cpp b/c_api/gpu/DeviceUtils_c.cpp
index 37850d39a0..b375c9a3a1 100644
--- a/c_api/gpu/DeviceUtils_c.cpp
+++ b/c_api/gpu/DeviceUtils_c.cpp
@@ -44,3 +44,12 @@ int faiss_gpu_sync_all_devices() {
     }
     CATCH_AND_HANDLE
 }
+
+/// Returns the free memory (in bytes) on the specified device
+int faiss_get_free_memory(int device, size_t* p_free_bytes) {
+    try {
+        size_t freeBytes = faiss::gpu::getFreeMemory(device);
+        *p_free_bytes = freeBytes;
+    }
+    CATCH_AND_HANDLE
+}
\ No newline at end of file
diff --git a/c_api/gpu/DeviceUtils_c.h b/c_api/gpu/DeviceUtils_c.h
index b5c67e97ad..6755f24c97 100644
--- a/c_api/gpu/DeviceUtils_c.h
+++ b/c_api/gpu/DeviceUtils_c.h
@@ -31,6 +31,9 @@ int faiss_gpu_profiler_stop();
 /// cudaDeviceSynchronize for each device)
 int faiss_gpu_sync_all_devices();
 
+/// Returns the free memory (in bytes) on the specified device
+int faiss_get_free_memory(int device, size_t* p_free_bytes);
+
 #ifdef __cplusplus
 }
 #endif

From 9b1518990aa737d676ee0e6972c5949de9ec3bcb Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 13 Nov 2025 19:39:56 +0530
Subject: [PATCH 2/3] add newline

---
 c_api/gpu/DeviceUtils_c.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c_api/gpu/DeviceUtils_c.cpp b/c_api/gpu/DeviceUtils_c.cpp
index b375c9a3a1..859926964e 100644
--- a/c_api/gpu/DeviceUtils_c.cpp
+++ b/c_api/gpu/DeviceUtils_c.cpp
@@ -52,4 +52,4 @@ int faiss_get_free_memory(int device, size_t* p_free_bytes) {
         *p_free_bytes = freeBytes;
     }
     CATCH_AND_HANDLE
-}
\ No newline at end of file
+}
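For illustration, here is a minimal caller-side sketch of the C API added by the two patches above. It is not part of the patch series; the include paths are assumptions based on the usual c_api layout, and it relies on the existing c_api convention that functions return 0 on success and record a message retrievable via faiss_get_last_error() otherwise.

#include <stddef.h>
#include <stdio.h>
#include "c_api/error_c.h"
#include "c_api/gpu/DeviceUtils_c.h"

int main(void) {
    size_t free_bytes = 0;
    // 0 on success; non-zero if the underlying CUDA query threw
    if (faiss_get_free_memory(/*device=*/0, &free_bytes) != 0) {
        fprintf(stderr, "faiss_get_free_memory: %s\n", faiss_get_last_error());
        return 1;
    }
    printf("GPU 0: %zu bytes free\n", free_bytes);
    return 0;
}

A caller can use this, for example, to decide up front whether a batch will fit on the device or must be processed in pages, which is what patch 3 automates inside GpuIndexFlat.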
From de0f035c68b3dbc15816691538cb8d3a9986f8ce Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 8 Jan 2026 20:26:45 +0530
Subject: [PATCH 3/3] MB-59670: Add paging support for residual computation in GpuIndexFlat

---
 faiss/gpu/GpuIndex.cu     |   5 +-
 faiss/gpu/GpuIndex.h      |   3 +
 faiss/gpu/GpuIndexFlat.cu | 129 +++++++++++++++++++++++++++++++++-----
 faiss/gpu/GpuIndexFlat.h  |  20 ++++++
 4 files changed, 139 insertions(+), 18 deletions(-)

diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index 73f0867d38..def784342b 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -61,7 +61,8 @@ GpuIndex::GpuIndex(
         : Index(dims, metric),
          resources_(resources),
          config_(config),
-          minPagedSize_(kMinPageSize) {
+          minPagedSize_(kMinPageSize),
+          pageSize_(kNonPinnedPageSize) {
     FAISS_THROW_IF_NOT_FMT(
             config_.device < getNumDevices(),
             "Invalid GPU device %d",
@@ -483,7 +484,7 @@ void GpuIndex::searchFromCpuPaged_ex_(
     if (!pinnedAlloc.first || pageSizeInVecs < 1) {
         // Just page without overlapping copy with compute
         idx_t batchSize = utils::nextHighestPowerOf2(
-                (kNonPinnedPageSize /
+                (pageSize_ /
                  (get_numeric_type_size(numeric_type) * this->d)));
 
         for (idx_t cur = 0; cur < n; cur += batchSize) {
diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h
index 9dba6542bf..e2657b4f50 100644
--- a/faiss/gpu/GpuIndex.h
+++ b/faiss/gpu/GpuIndex.h
@@ -271,6 +271,9 @@ class GpuIndex : public faiss::Index {
 
     /// Size above which we page copies from the CPU to GPU
     size_t minPagedSize_;
+
+    /// Size of the pages we use to page copies from the CPU to GPU
+    size_t pageSize_;
 };
 
 /// If the given index is a GPU index, this returns the index instance
diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu
index eb87e082e9..47d1bde282 100644
--- a/faiss/gpu/GpuIndexFlat.cu
+++ b/faiss/gpu/GpuIndexFlat.cu
@@ -312,39 +312,136 @@ void GpuIndexFlat::compute_residual(const float* x, float* residual, idx_t key)
     compute_residual_n(1, x, residual, &key);
 }
 
-void GpuIndexFlat::compute_residual_n(
-        idx_t n,
+void GpuIndexFlat::compute_residual_n_batch(
+        idx_t batchSize,
         const float* xs,
         float* residuals,
-        const idx_t* keys) const {
-    DeviceScope scope(config_.device);
-    auto stream = resources_->getDefaultStream(config_.device);
-
-    if (n == 0) {
-        // nothing to do
-        return;
-    }
+        const idx_t* keys,
+        bool residualOnHost,
+        cudaStream_t stream) const {
     auto vecsDevice = toDeviceTemporary<float, 2>(
             resources_.get(),
             config_.device,
             const_cast<float*>(xs),
             stream,
-            {n, this->d});
+            {batchSize, d});
+
     auto idsDevice = toDeviceTemporary<idx_t, 1>(
             resources_.get(),
             config_.device,
             const_cast<idx_t*>(keys),
             stream,
-            {n});
+            {batchSize});
+
     auto residualDevice = toDeviceTemporary<float, 2>(
-            resources_.get(), config_.device, residuals, stream, {n, this->d});
+            resources_.get(),
+            config_.device,
+            residuals,
+            stream,
+            {batchSize, d});
 
-    FAISS_ASSERT(data_);
     data_->computeResidual(vecsDevice, idsDevice, residualDevice);
 
-    // If the output is on the host, copy back if needed
-    fromDevice<float, 2>(residualDevice, residuals, stream);
+    // Only copy the result back if the output lives on the host
+    if (residualOnHost) {
+        fromDevice<float, 2>(residualDevice, residuals, stream);
+    }
+}
+
+void GpuIndexFlat::compute_residual_n_paged(
+        idx_t n,
+        const float* xs,
+        float* residuals,
+        const idx_t* keys,
+        bool xsOnHost,
+        bool resOnHost,
+        cudaStream_t stream) const {
+    idx_t batchSize = utils::nextHighestPowerOf2(
+            pageSize_ / (d * sizeof(float)));
+
+    // If the residuals are already on the device, create one device
+    // wrapper and slice it per batch
+    if (!resOnHost) {
+        auto residualDevice = toDeviceTemporary<float, 2>(
+                resources_.get(),
+                config_.device,
+                residuals,
+                stream,
+                {n, d});
+
+        for (idx_t cur = 0; cur < n; cur += batchSize) {
+            idx_t thisBatch = std::min(batchSize, n - cur);
+
+            auto residualBatch =
+                    residualDevice.narrowOutermost(cur, thisBatch);
+
+            compute_residual_n_batch(
+                    thisBatch,
+                    xs + cur * d,
+                    residualBatch.data(),
+                    keys + cur,
+                    false,
+                    stream);
+        }
+    } else {
+        // The residuals are on the host, so copy back per batch
+        for (idx_t cur = 0; cur < n; cur += batchSize) {
+            idx_t thisBatch = std::min(batchSize, n - cur);
+
+            compute_residual_n_batch(
+                    thisBatch,
+                    xs + cur * d,
+                    residuals + cur * d,
+                    keys + cur,
+                    true,
+                    stream);
+        }
+    }
+}
+
+void GpuIndexFlat::compute_residual_n(
+        idx_t n,
+        const float* xs,
+        float* residuals,
+        const idx_t* keys) const {
+    if (n == 0) {
+        // nothing to do
+        return;
+    }
+    FAISS_ASSERT(data_);
+    DeviceScope scope(config_.device);
+    auto stream = resources_->getDefaultStream(config_.device);
+
+    // First, check whether we need to page the device transfers, since
+    // n * d may exceed device memory; if so, we compute the residuals
+    // in pages. Use the paged mode if:
+    // - dataSize >= minPagedSize_, AND
+    // - at least one of xs or residuals is on the host
+    size_t dataSize = (size_t)n * d * sizeof(float);
+    bool xsOnHost = getDeviceForAddress(xs) == -1;
+    bool resOnHost = getDeviceForAddress(residuals) == -1;
+    bool usePaged =
+            dataSize >= minPagedSize_ && (xsOnHost || resOnHost);
+
+    if (!usePaged) {
+        compute_residual_n_batch(
+                n,
+                xs,
+                residuals,
+                keys,
+                resOnHost,
+                stream);
+        return;
+    }
+
+    // Paged path
+    compute_residual_n_paged(
+            n,
+            xs,
+            residuals,
+            keys,
+            xsOnHost,
+            resOnHost,
+            stream);
 }
 
 //
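To make the paging arithmetic in the GpuIndexFlat.cu changes above concrete, the following standalone sketch mirrors the batch-size and paging decision from compute_residual_n and compute_residual_n_paged. It is not part of the patch; the 256 MiB constants are assumptions (upstream faiss initializes both kMinPageSize and kNonPinnedPageSize to 256 MiB, and the patch seeds pageSize_ from kNonPinnedPageSize).

#include <cstddef>
#include <cstdio>

int main() {
    const size_t minPagedSize = 256ULL << 20; // assumed kMinPageSize
    const size_t pageSize = 256ULL << 20;     // assumed kNonPinnedPageSize
    const size_t d = 128;                     // vector dimension
    const size_t n = 10'000'000;              // 10M host-resident vectors

    // ~4.8 GB of float32, well above minPagedSize, so the paged path
    // is taken whenever xs or residuals live on the host
    size_t dataSize = n * d * sizeof(float);

    // Vectors per page; the patch rounds this with
    // utils::nextHighestPowerOf2 (524288 is already 2^19 here)
    size_t batchSize = pageSize / (d * sizeof(float));
    size_t numPages = (n + batchSize - 1) / batchSize;

    std::printf("dataSize=%zu paged=%d batch=%zu pages=%zu\n",
                dataSize,
                (int)(dataSize >= minPagedSize),
                batchSize,
                numPages);
    return 0;
}

Each 524288-vector page moves 256 MiB per transfer, so the 10M-vector example runs in 20 pages instead of staging a single ~4.8 GB copy on the device. The corresponding declarations follow in GpuIndexFlat.h.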
diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h
index ee4c14466e..f4fcff6ec2 100644
--- a/faiss/gpu/GpuIndexFlat.h
+++ b/faiss/gpu/GpuIndexFlat.h
@@ -107,6 +107,26 @@ class GpuIndexFlat : public GpuIndex {
             float* residuals,
             const idx_t* keys) const override;
 
+    /// Compute residuals (batch mode), paging host/device transfers
+    void compute_residual_n_paged(
+            idx_t n,
+            const float* xs,
+            float* residuals,
+            const idx_t* keys,
+            bool xsOnHost,
+            bool resOnHost,
+            cudaStream_t stream) const;
+
+    /// Compute residuals for a single batch (one device transfer)
+    void compute_residual_n_batch(
+            idx_t batchSize,
+            const float* xs,
+            float* residuals,
+            const idx_t* keys,
+            bool residualOnHost,
+            cudaStream_t stream) const;
+
+    ///
     /// For internal access
     inline FlatIndex* getGpuData() {
         return data_.get();
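Finally, a caller-side sketch (not part of the patch series, and assuming a FAISS GPU build): compute_residual_n keeps its faiss::Index signature, so existing callers pick up the paged path transparently once the inputs are host-resident and large enough.

#include <numeric>
#include <vector>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    const int d = 128;
    const faiss::idx_t n = 1'000'000; // n * d * 4 bytes = 512 MB

    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexFlatL2 index(&res, d);

    std::vector<float> xs(n * d, 1.0f); // host-resident vectors
    index.add(n, xs.data());            // make keys 0..n-1 valid

    std::vector<faiss::idx_t> keys(n);
    std::iota(keys.begin(), keys.end(), faiss::idx_t(0));

    // xs and residuals are host pointers and the data size exceeds
    // minPagedSize_, so the transfers are paged internally
    std::vector<float> residuals(n * d);
    index.compute_residual_n(n, xs.data(), residuals.data(), keys.data());
    return 0;
}

Before this patch, the same call staged all of xs, keys, and residuals on the device at once, which can fail once n * d * sizeof(float) approaches the free device memory reported by faiss_get_free_memory.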