
Commit c7492cc: LAST CHECKPOINT
1 parent ea3c337

4 files changed: +132 -56 lines changed

src/databricks/sql/client.py
Lines changed: 21 additions & 30 deletions

@@ -31,6 +31,8 @@
     transform_paramstyle,
     ColumnTable,
     ColumnQueue,
+    concat_chunked_tables,
+    merge_columnar,
 )
 from databricks.sql.parameters.native import (
     DbsqlParameterBase,
@@ -1454,36 +1456,25 @@ def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
         results = self.results.next_n_rows(size)
         n_remaining_rows = size - results.num_rows
         self._next_row_index += results.num_rows
+        partial_result_chunks = [results]
 
+        TOTAL_SIZE = results.num_rows
         while (
             n_remaining_rows > 0
             and not self.has_been_closed_server_side
             and self.has_more_rows
         ):
+            print(f"TOTAL DATA ROWS {TOTAL_SIZE}")
             self._fill_results_buffer()
             partial_results = self.results.next_n_rows(n_remaining_rows)
-            results = pyarrow.concat_tables([results, partial_results])
+            partial_result_chunks.append(partial_results)
             n_remaining_rows -= partial_results.num_rows
             self._next_row_index += partial_results.num_rows
+            TOTAL_SIZE += partial_results.num_rows
 
-        return results
-
-    def merge_columnar(self, result1, result2):
-        """
-        Function to merge / combining the columnar results into a single result
-        :param result1:
-        :param result2:
-        :return:
-        """
-
-        if result1.column_names != result2.column_names:
-            raise ValueError("The columns in the results don't match")
-
-        merged_result = [
-            result1.column_table[i] + result2.column_table[i]
-            for i in range(result1.num_columns)
-        ]
-        return ColumnTable(merged_result, result1.column_names)
+        return concat_chunked_tables(partial_result_chunks)
+
+
 
     def fetchmany_columnar(self, size: int):
         """
@@ -1504,7 +1495,7 @@ def fetchmany_columnar(self, size: int):
         ):
             self._fill_results_buffer()
             partial_results = self.results.next_n_rows(n_remaining_rows)
-            results = self.merge_columnar(results, partial_results)
+            results = merge_columnar(results, partial_results)
             n_remaining_rows -= partial_results.num_rows
             self._next_row_index += partial_results.num_rows
 
@@ -1514,20 +1505,20 @@ def fetchall_arrow(self) -> "pyarrow.Table":
         """Fetch all (remaining) rows of a query result, returning them as a PyArrow table."""
         results = self.results.remaining_rows()
         self._next_row_index += results.num_rows
-
-        print("Server side has more rows", self.has_more_rows)
 
+        partial_result_chunks = [results]
+        print("Server side has more rows", self.has_more_rows)
+        TOTAL_SIZE = results.num_rows
+
         while not self.has_been_closed_server_side and self.has_more_rows:
-            print(f"RESULT SIZE TOTAL {results.num_rows}")
+            print(f"TOTAL DATA ROWS {TOTAL_SIZE}")
             self._fill_results_buffer()
             partial_results = self.results.remaining_rows()
-            if isinstance(results, ColumnTable) and isinstance(
-                partial_results, ColumnTable
-            ):
-                results = self.merge_columnar(results, partial_results)
-            else:
-                results = pyarrow.concat_tables([results, partial_results])
+            partial_result_chunks.append(partial_results)
             self._next_row_index += partial_results.num_rows
+            TOTAL_SIZE += partial_results.num_rows
+
+        results = concat_chunked_tables(partial_result_chunks)
 
         # If PyArrow is installed and we have a ColumnTable result, convert it to PyArrow Table
         # Valid only for metadata commands result set
@@ -1547,7 +1538,7 @@ def fetchall_columnar(self):
         while not self.has_been_closed_server_side and self.has_more_rows:
             self._fill_results_buffer()
             partial_results = self.results.remaining_rows()
-            results = self.merge_columnar(results, partial_results)
+            results = merge_columnar(results, partial_results)
             self._next_row_index += partial_results.num_rows
 
         return results
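
The fetch changes above all follow one pattern: instead of calling pyarrow.concat_tables pairwise inside the loop, which re-copies every previously fetched row on each iteration, partial results are buffered in a list and concatenated once at the end. A minimal sketch of the pattern in plain PyArrow; stream_of_batches is a hypothetical stand-in for the driver's _fill_results_buffer()/next_n_rows() loop, not part of the connector:

import pyarrow as pa

def stream_of_batches(n_chunks=3, rows_per_chunk=4):
    # Hypothetical stand-in for successive server-side result batches.
    for i in range(n_chunks):
        start = i * rows_per_chunk
        yield pa.table({"id": list(range(start, start + rows_per_chunk))})

# Anti-pattern: table = pa.concat_tables([table, batch]) inside the loop
# copies all rows collected so far on every iteration (quadratic cost).
# Pattern adopted by this commit: buffer chunks, concatenate once (linear).
chunks = []
for batch in stream_of_batches():
    chunks.append(batch)
result = pa.concat_tables(chunks)
print(result.num_rows)  # 12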

src/databricks/sql/cloudfetch/downloader.py
Lines changed: 69 additions & 20 deletions

@@ -9,6 +9,7 @@
 from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
 from databricks.sql.exc import Error
 from databricks.sql.types import SSLOptions
+from databricks.sql.common.http import DatabricksHttpClient, HttpMethod
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +71,7 @@ def __init__(
         self.settings = settings
         self.link = link
         self._ssl_options = ssl_options
+        self._http_client = DatabricksHttpClient.get_instance()
 
     def run(self) -> DownloadedFile:
         """
@@ -89,27 +91,20 @@ def run(self) -> DownloadedFile:
         ResultSetDownloadHandler._validate_link(
             self.link, self.settings.link_expiry_buffer_secs
         )
-
-        session = requests.Session()
-        session.mount("http://", HTTPAdapter(max_retries=retryPolicy))
-        session.mount("https://", HTTPAdapter(max_retries=retryPolicy))
-
-        try:
+
+        with self._http_client.execute(
+            method=HttpMethod.GET,
+            url=self.link.fileLink,
+            timeout=self.settings.download_timeout,
+            verify=self._ssl_options.tls_verify,
+            headers=self.link.httpHeaders
+        ) as response:
             print_text = [
 
             ]
-            start_time = time.time()
-            # Get the file via HTTP request
-            response = session.get(
-                self.link.fileLink,
-                timeout=self.settings.download_timeout,
-                verify=self._ssl_options.tls_verify,
-                headers=self.link.httpHeaders
-                # TODO: Pass cert from `self._ssl_options`
-            )
+
             response.raise_for_status()
-            end_time = time.time()
-            print_text.append(f"Downloaded file in {end_time - start_time} seconds")
+
             # Save (and decompress if needed) the downloaded file
             compressed_data = response.content
             decompressed_data = (
@@ -144,9 +139,63 @@ def run(self) -> DownloadedFile:
                 self.link.startRowOffset,
                 self.link.rowCount,
             )
-        finally:
-            if session:
-                session.close()
+        # session = requests.Session()
+        # session.mount("http://", HTTPAdapter(max_retries=retryPolicy))
+        # session.mount("https://", HTTPAdapter(max_retries=retryPolicy))
+
+        # try:
+        #     print_text = [
+
+        #     ]
+        #     start_time = time.time()
+        #     # Get the file via HTTP request
+        #     response = session.get(
+        #         self.link.fileLink,
+        #         timeout=self.settings.download_timeout,
+        #         verify=self._ssl_options.tls_verify,
+        #         headers=self.link.httpHeaders
+        #         # TODO: Pass cert from `self._ssl_options`
+        #     )
+        #     response.raise_for_status()
+        #     end_time = time.time()
+        #     print_text.append(f"Downloaded file in {end_time - start_time} seconds")
+        #     # Save (and decompress if needed) the downloaded file
+        #     compressed_data = response.content
+        #     decompressed_data = (
+        #         ResultSetDownloadHandler._decompress_data(compressed_data)
+        #         if self.settings.is_lz4_compressed
+        #         else compressed_data
+        #     )
+
+        #     # The size of the downloaded file should match the size specified from TSparkArrowResultLink
+        #     if len(decompressed_data) != self.link.bytesNum:
+        #         logger.debug(
+        #             "ResultSetDownloadHandler: downloaded file size {} does not match the expected value {}".format(
+        #                 len(decompressed_data), self.link.bytesNum
+        #             )
+        #         )
+
+        #     logger.debug(
+        #         "ResultSetDownloadHandler: successfully downloaded file, offset {}, row count {}".format(
+        #             self.link.startRowOffset, self.link.rowCount
+        #         )
+        #     )
+
+        #     print_text.append(
+        #         f"Downloaded file startRowOffset - {self.link.startRowOffset} - rowCount - {self.link.rowCount}"
+        #     )
+
+        #     for text in print_text:
+        #         print(text)
+
+        #     return DownloadedFile(
+        #         decompressed_data,
+        #         self.link.startRowOffset,
+        #         self.link.rowCount,
+        #     )
+        # finally:
+        #     if session:
+        #         session.close()
 
     @staticmethod
     def _validate_link(link: TSparkArrowResultLink, expiry_buffer_secs: int):
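
The downloader now routes requests through a process-wide DatabricksHttpClient singleton instead of building a fresh requests.Session (with retry adapters) per download. The sketch below is not the driver's actual client, only a simplified stand-in (SharedHttpClient is a hypothetical name) showing the shared-session-plus-context-manager shape the new code path relies on:

import threading
from contextlib import contextmanager

import requests

class SharedHttpClient:
    _instance = None
    _lock = threading.Lock()

    @classmethod
    def get_instance(cls):
        # Lock so concurrent downloader threads share a single session.
        with cls._lock:
            if cls._instance is None:
                cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.session = requests.Session()

    @contextmanager
    def execute(self, method, url, **kwargs):
        # Reuses pooled connections instead of creating a Session per download.
        response = self.session.request(method, url, **kwargs)
        try:
            yield response
        finally:
            response.close()

# Usage mirroring the new downloader code path (url is a placeholder):
# with SharedHttpClient.get_instance().execute("GET", url, timeout=60) as resp:
#     resp.raise_for_status()
#     data = resp.content

One upshot of this design: because the session is shared, per-request state such as headers and TLS verification must be passed on every execute() call, which is exactly what the new run() does with self.link.httpHeaders and self._ssl_options.tls_verify.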

src/databricks/sql/common/http.py
Lines changed: 4 additions & 1 deletion

@@ -7,7 +7,7 @@
 from contextlib import contextmanager
 from typing import Generator
 import logging
-
+import time
 logger = logging.getLogger(__name__)
 
 
@@ -70,7 +70,10 @@ def execute(
         logger.info("Executing HTTP request: %s with url: %s", method.value, url)
         response = None
         try:
+            start_time = time.time()
             response = self.session.request(method.value, url, **kwargs)
+            end_time = time.time()
+            print(f"Downloaded file in {end_time - start_time} seconds")
             yield response
         except Exception as e:
             logger.error("Error executing HTTP request in DatabricksHttpClient: %s", e)
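
The start_time/end_time instrumentation above prints straight to stdout, which reads as checkpoint scaffolding. The same measurement can instead be routed through the module's existing logger; a minimal sketch, where timed is a hypothetical helper and not part of the driver:

import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def timed(label):
    # Logs the wall-clock duration of the wrapped block at DEBUG level.
    start = time.time()
    try:
        yield
    finally:
        logger.debug("%s took %.3f seconds", label, time.time() - start)

# Usage (session and url are placeholders):
# with timed("download"):
#     response = session.request("GET", url)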

src/databricks/sql/utils.py
Lines changed: 38 additions & 5 deletions

@@ -137,6 +137,11 @@ def __eq__(self, other):
         )
 
 
+class ArrowStreamTable:
+    def __init__(self, arrow_stream, num_rows):
+        self.arrow_stream = arrow_stream
+        self.num_rows = num_rows
+
 class ColumnQueue(ResultSetQueue):
     def __init__(self, column_table: ColumnTable):
         self.column_table = column_table
@@ -263,11 +268,12 @@ def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
             return self._create_empty_table()
         logger.debug("CloudFetchQueue: trying to get {} next rows".format(num_rows))
         results = self.table.slice(0, 0)
+        partial_result_chunks = [results]
         while num_rows > 0 and self.table:
             # Get remaining of num_rows or the rest of the current table, whichever is smaller
             length = min(num_rows, self.table.num_rows - self.table_row_index)
             table_slice = self.table.slice(self.table_row_index, length)
-            results = pyarrow.concat_tables([results, table_slice])
+            partial_result_chunks.append(table_slice)
             self.table_row_index += table_slice.num_rows
 
             # Replace current table with the next table if we are at the end of the current table
@@ -277,7 +283,7 @@ def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
             num_rows -= table_slice.num_rows
 
         logger.debug("CloudFetchQueue: collected {} next rows".format(results.num_rows))
-        return results
+        return concat_chunked_tables(partial_result_chunks)
 
     def remaining_rows(self) -> "pyarrow.Table":
         """
@@ -290,19 +296,19 @@ def remaining_rows(self) -> "pyarrow.Table":
             # Return empty pyarrow table to cause retry of fetch
             return self._create_empty_table()
         results = self.table.slice(0, 0)
-
+        partial_result_chunks = [results]
         print("remaining_rows call")
         print(f"self.table.num_rows - {self.table.num_rows}")
         while self.table:
             table_slice = self.table.slice(
                 self.table_row_index, self.table.num_rows - self.table_row_index
             )
-            results = pyarrow.concat_tables([results, table_slice])
+            partial_result_chunks.append(table_slice)
             self.table_row_index += table_slice.num_rows
             self.table = self._create_next_table()
             self.table_row_index = 0
         print(f"results.num_rows - {results.num_rows}")
-        return results
+        return concat_chunked_tables(partial_result_chunks)
 
     def _create_next_table(self) -> Union["pyarrow.Table", None]:
         logger.debug(
@@ -771,3 +777,30 @@ def _create_python_tuple(t_col_value_wrapper):
             result[i] = None
 
     return tuple(result)
+
+
+def concat_chunked_tables(tables: List[Union["pyarrow.Table", ColumnTable]]) -> Union["pyarrow.Table", ColumnTable]:
+    if isinstance(tables[0], ColumnTable):
+        base_table = tables[0]
+        for table in tables[1:]:
+            base_table = merge_columnar(base_table, table)
+        return base_table
+    else:
+        return pyarrow.concat_tables(tables)
+
+
+def merge_columnar(result1: ColumnTable, result2: ColumnTable) -> ColumnTable:
+    """
+    Function to merge / combining the columnar results into a single result
+    :param result1:
+    :param result2:
+    :return:
+    """
+
+    if result1.column_names != result2.column_names:
+        raise ValueError("The columns in the results don't match")
+
+    merged_result = [
+        result1.column_table[i] + result2.column_table[i]
+        for i in range(result1.num_columns)
+    ]
+    return ColumnTable(merged_result, result1.column_names)
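
To make the relocated helper concrete, here is a self-contained sketch of merge_columnar at work. MiniColumnTable is a stripped-down, hypothetical stand-in for the driver's ColumnTable (a list of per-column value lists plus column names), used here only so the example runs on its own:

class MiniColumnTable:
    def __init__(self, column_table, column_names):
        self.column_table = column_table  # list of per-column value lists
        self.column_names = column_names

    @property
    def num_columns(self):
        return len(self.column_names)

def merge_columnar(result1, result2):
    # Same logic as the helper this commit moves into utils.py.
    if result1.column_names != result2.column_names:
        raise ValueError("The columns in the results don't match")
    merged = [
        result1.column_table[i] + result2.column_table[i]
        for i in range(result1.num_columns)
    ]
    return MiniColumnTable(merged, result1.column_names)

a = MiniColumnTable([[1, 2], ["x", "y"]], ["id", "val"])
b = MiniColumnTable([[3], ["z"]], ["id", "val"])
merged = merge_columnar(a, b)
print(merged.column_table)  # [[1, 2, 3], ['x', 'y', 'z']]

concat_chunked_tables simply dispatches on the first chunk's type: ColumnTable chunks are folded pairwise through merge_columnar, while PyArrow tables go through a single pyarrow.concat_tables call.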
