From dbe7cbee32b8ac1d2d90ddb094e029fae4321540 Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 6 Nov 2025 12:52:56 +0530
Subject: [PATCH 1/3] Expose API to get free GPU memory

---
 c_api/gpu/DeviceUtils_c.cpp | 9 +++++++++
 c_api/gpu/DeviceUtils_c.h   | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/c_api/gpu/DeviceUtils_c.cpp b/c_api/gpu/DeviceUtils_c.cpp
index 37850d39a0..b375c9a3a1 100644
--- a/c_api/gpu/DeviceUtils_c.cpp
+++ b/c_api/gpu/DeviceUtils_c.cpp
@@ -44,3 +44,12 @@ int faiss_gpu_sync_all_devices() {
     }
     CATCH_AND_HANDLE
 }
+
+/// Returns the free memory (in bytes) on the specified device
+int faiss_get_free_memory(int device, size_t* p_free_bytes) {
+    try {
+        size_t freeBytes = faiss::gpu::getFreeMemory(device);
+        *p_free_bytes = freeBytes;
+    }
+    CATCH_AND_HANDLE
+}
\ No newline at end of file
diff --git a/c_api/gpu/DeviceUtils_c.h b/c_api/gpu/DeviceUtils_c.h
index b5c67e97ad..6755f24c97 100644
--- a/c_api/gpu/DeviceUtils_c.h
+++ b/c_api/gpu/DeviceUtils_c.h
@@ -31,6 +31,9 @@ int faiss_gpu_profiler_stop();
 /// cudaDeviceSynchronize for each device)
 int faiss_gpu_sync_all_devices();
 
+/// Returns the free memory (in bytes) on the specified device
+int faiss_get_free_memory(int device, size_t* p_free_bytes);
+
 #ifdef __cplusplus
 }
 #endif

From 9b1518990aa737d676ee0e6972c5949de9ec3bcb Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 13 Nov 2025 19:39:56 +0530
Subject: [PATCH 2/3] add newline

---
 c_api/gpu/DeviceUtils_c.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c_api/gpu/DeviceUtils_c.cpp b/c_api/gpu/DeviceUtils_c.cpp
index b375c9a3a1..859926964e 100644
--- a/c_api/gpu/DeviceUtils_c.cpp
+++ b/c_api/gpu/DeviceUtils_c.cpp
@@ -52,4 +52,4 @@ int faiss_get_free_memory(int device, size_t* p_free_bytes) {
         *p_free_bytes = freeBytes;
     }
     CATCH_AND_HANDLE
-}
\ No newline at end of file
+}
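For illustration, here is a minimal caller-side sketch of the C API added by the two patches above. It is not part of the patch series; the include paths are assumptions based on the usual c_api layout, and it relies on the existing c_api convention that functions return 0 on success and record a message retrievable via faiss_get_last_error() otherwise.

#include <stddef.h>
#include <stdio.h>
#include "c_api/error_c.h"
#include "c_api/gpu/DeviceUtils_c.h"

int main(void) {
    size_t free_bytes = 0;
    // 0 on success; non-zero if the underlying CUDA query threw
    if (faiss_get_free_memory(/*device=*/0, &free_bytes) != 0) {
        fprintf(stderr, "faiss_get_free_memory: %s\n", faiss_get_last_error());
        return 1;
    }
    printf("GPU 0: %zu bytes free\n", free_bytes);
    return 0;
}

A caller can use this, for example, to decide up front whether a batch will fit on the device or must be processed in pages, which is what patch 3 automates inside GpuIndexFlat.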
From de0f035c68b3dbc15816691538cb8d3a9986f8ce Mon Sep 17 00:00:00 2001
From: Rahul Rampure
Date: Thu, 8 Jan 2026 20:26:45 +0530
Subject: [PATCH 3/3] MB-59670: Add paging support for residual computation in GpuIndexFlat

---
 faiss/gpu/GpuIndex.cu     |   5 +-
 faiss/gpu/GpuIndex.h      |   3 +
 faiss/gpu/GpuIndexFlat.cu | 129 +++++++++++++++++++++++++++++++++-----
 faiss/gpu/GpuIndexFlat.h  |  20 ++++++
 4 files changed, 139 insertions(+), 18 deletions(-)

diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index 73f0867d38..def784342b 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -61,7 +61,8 @@ GpuIndex::GpuIndex(
         : Index(dims, metric),
          resources_(resources),
          config_(config),
-          minPagedSize_(kMinPageSize) {
+          minPagedSize_(kMinPageSize),
+          pageSize_(kNonPinnedPageSize) {
     FAISS_THROW_IF_NOT_FMT(
             config_.device < getNumDevices(),
             "Invalid GPU device %d",
@@ -483,7 +484,7 @@ void GpuIndex::searchFromCpuPaged_ex_(
     if (!pinnedAlloc.first || pageSizeInVecs < 1) {
         // Just page without overlapping copy with compute
         idx_t batchSize = utils::nextHighestPowerOf2(
-                (kNonPinnedPageSize /
+                (pageSize_ /
                  (get_numeric_type_size(numeric_type) * this->d)));
 
         for (idx_t cur = 0; cur < n; cur += batchSize) {
diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h
index 9dba6542bf..e2657b4f50 100644
--- a/faiss/gpu/GpuIndex.h
+++ b/faiss/gpu/GpuIndex.h
@@ -271,6 +271,9 @@ class GpuIndex : public faiss::Index {
 
     /// Size above which we page copies from the CPU to GPU
     size_t minPagedSize_;
+
+    /// Size of the pages we use to page copies from the CPU to GPU
+    size_t pageSize_;
 };
 
 /// If the given index is a GPU index, this returns the index instance
diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu
index eb87e082e9..47d1bde282 100644
--- a/faiss/gpu/GpuIndexFlat.cu
+++ b/faiss/gpu/GpuIndexFlat.cu
@@ -312,39 +312,136 @@ void GpuIndexFlat::compute_residual(const float* x, float* residual, idx_t key)
     compute_residual_n(1, x, residual, &key);
 }
 
-void GpuIndexFlat::compute_residual_n(
-        idx_t n,
+void GpuIndexFlat::compute_residual_n_batch(
+        idx_t batchSize,
         const float* xs,
         float* residuals,
-        const idx_t* keys) const {
-    DeviceScope scope(config_.device);
-    auto stream = resources_->getDefaultStream(config_.device);
-
-    if (n == 0) {
-        // nothing to do
-        return;
-    }
+        const idx_t* keys,
+        bool residualOnHost,
+        cudaStream_t stream) const {
     auto vecsDevice = toDeviceTemporary<float, 2>(
             resources_.get(),
             config_.device,
             const_cast<float*>(xs),
             stream,
-            {n, this->d});
+            {batchSize, d});
+
     auto idsDevice = toDeviceTemporary<idx_t, 1>(
             resources_.get(),
             config_.device,
             const_cast<idx_t*>(keys),
             stream,
-            {n});
+            {batchSize});
+
     auto residualDevice = toDeviceTemporary<float, 2>(
-            resources_.get(), config_.device, residuals, stream, {n, this->d});
+            resources_.get(),
+            config_.device,
+            residuals,
+            stream,
+            {batchSize, d});
 
-    FAISS_ASSERT(data_);
     data_->computeResidual(vecsDevice, idsDevice, residualDevice);
 
-    // If the output is on the host, copy back if needed
-    fromDevice<float, 2>(residualDevice, residuals, stream);
+    // Only copy the result back if the output lives on the host
+    if (residualOnHost) {
+        fromDevice<float, 2>(residualDevice, residuals, stream);
+    }
+}
+
+void GpuIndexFlat::compute_residual_n_paged(
+        idx_t n,
+        const float* xs,
+        float* residuals,
+        const idx_t* keys,
+        bool xsOnHost,
+        bool resOnHost,
+        cudaStream_t stream) const {
+    idx_t batchSize = utils::nextHighestPowerOf2(
+            pageSize_ / (d * sizeof(float)));
+
+    // If the residuals are already on the device, create one device
+    // wrapper and slice it per batch
+    if (!resOnHost) {
+        auto residualDevice = toDeviceTemporary<float, 2>(
+                resources_.get(),
+                config_.device,
+                residuals,
+                stream,
+                {n, d});
+
+        for (idx_t cur = 0; cur < n; cur += batchSize) {
+            idx_t thisBatch = std::min(batchSize, n - cur);
+
+            auto residualBatch =
+                    residualDevice.narrowOutermost(cur, thisBatch);
+
+            compute_residual_n_batch(
+                    thisBatch,
+                    xs + cur * d,
+                    residualBatch.data(),
+                    keys + cur,
+                    false,
+                    stream);
+        }
+    } else {
+        // The residuals are on the host, so copy back per batch
+        for (idx_t cur = 0; cur < n; cur += batchSize) {
+            idx_t thisBatch = std::min(batchSize, n - cur);
+
+            compute_residual_n_batch(
+                    thisBatch,
+                    xs + cur * d,
+                    residuals + cur * d,
+                    keys + cur,
+                    true,
+                    stream);
+        }
+    }
+}
+
+void GpuIndexFlat::compute_residual_n(
+        idx_t n,
+        const float* xs,
+        float* residuals,
+        const idx_t* keys) const {
+    if (n == 0) {
+        // nothing to do
+        return;
+    }
+    FAISS_ASSERT(data_);
+    DeviceScope scope(config_.device);
+    auto stream = resources_->getDefaultStream(config_.device);
+
+    // First, check whether we need to page the device transfers, since
+    // n * d may exceed device memory; if so, we compute the residuals
+    // in pages. Use the paged mode if:
+    // - dataSize >= minPagedSize_, AND
+    // - at least one of xs or residuals is on the host
+    size_t dataSize = (size_t)n * d * sizeof(float);
+    bool xsOnHost = getDeviceForAddress(xs) == -1;
+    bool resOnHost = getDeviceForAddress(residuals) == -1;
+    bool usePaged =
+            dataSize >= minPagedSize_ && (xsOnHost || resOnHost);
+
+    if (!usePaged) {
+        compute_residual_n_batch(
+                n,
+                xs,
+                residuals,
+                keys,
+                resOnHost,
+                stream);
+        return;
+    }
+
+    // Paged path
+    compute_residual_n_paged(
+            n,
+            xs,
+            residuals,
+            keys,
+            xsOnHost,
+            resOnHost,
+            stream);
 }
 
 //
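To make the paging arithmetic in the GpuIndexFlat.cu changes above concrete, the following standalone sketch mirrors the batch-size and paging decision from compute_residual_n and compute_residual_n_paged. It is not part of the patch; the 256 MiB constants are assumptions (upstream faiss initializes both kMinPageSize and kNonPinnedPageSize to 256 MiB, and the patch seeds pageSize_ from kNonPinnedPageSize).

#include <cstddef>
#include <cstdio>

int main() {
    const size_t minPagedSize = 256ULL << 20; // assumed kMinPageSize
    const size_t pageSize = 256ULL << 20;     // assumed kNonPinnedPageSize
    const size_t d = 128;                     // vector dimension
    const size_t n = 10'000'000;              // 10M host-resident vectors

    // ~4.8 GB of float32, well above minPagedSize, so the paged path
    // is taken whenever xs or residuals live on the host
    size_t dataSize = n * d * sizeof(float);

    // Vectors per page; the patch rounds this with
    // utils::nextHighestPowerOf2 (524288 is already 2^19 here)
    size_t batchSize = pageSize / (d * sizeof(float));
    size_t numPages = (n + batchSize - 1) / batchSize;

    std::printf("dataSize=%zu paged=%d batch=%zu pages=%zu\n",
                dataSize,
                (int)(dataSize >= minPagedSize),
                batchSize,
                numPages);
    return 0;
}

Each 524288-vector page moves 256 MiB per transfer, so the 10M-vector example runs in 20 pages instead of staging a single ~4.8 GB copy on the device. The corresponding declarations follow in GpuIndexFlat.h.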
diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h
index ee4c14466e..f4fcff6ec2 100644
--- a/faiss/gpu/GpuIndexFlat.h
+++ b/faiss/gpu/GpuIndexFlat.h
@@ -107,6 +107,26 @@ class GpuIndexFlat : public GpuIndex {
             float* residuals,
             const idx_t* keys) const override;
 
+    /// Compute residuals (batch mode), paging host/device transfers
+    void compute_residual_n_paged(
+            idx_t n,
+            const float* xs,
+            float* residuals,
+            const idx_t* keys,
+            bool xsOnHost,
+            bool resOnHost,
+            cudaStream_t stream) const;
+
+    /// Compute residuals for a single batch (one device transfer)
+    void compute_residual_n_batch(
+            idx_t batchSize,
+            const float* xs,
+            float* residuals,
+            const idx_t* keys,
+            bool residualOnHost,
+            cudaStream_t stream) const;
+
+    ///
     /// For internal access
     inline FlatIndex* getGpuData() {
         return data_.get();
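Finally, a caller-side sketch (not part of the patch series, and assuming a FAISS GPU build): compute_residual_n keeps its faiss::Index signature, so existing callers pick up the paged path transparently once the inputs are host-resident and large enough.

#include <numeric>
#include <vector>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>

int main() {
    const int d = 128;
    const faiss::idx_t n = 1'000'000; // n * d * 4 bytes = 512 MB

    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexFlatL2 index(&res, d);

    std::vector<float> xs(n * d, 1.0f); // host-resident vectors
    index.add(n, xs.data());            // make keys 0..n-1 valid

    std::vector<faiss::idx_t> keys(n);
    std::iota(keys.begin(), keys.end(), faiss::idx_t(0));

    // xs and residuals are host pointers and the data size exceeds
    // minPagedSize_, so the transfers are paged internally
    std::vector<float> residuals(n * d);
    index.compute_residual_n(n, xs.data(), residuals.data(), keys.data());
    return 0;
}

Before this patch, the same call staged all of xs, keys, and residuals on the device at once, which can fail once n * d * sizeof(float) approaches the free device memory reported by faiss_get_free_memory.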