
Commit eabb6bf

Add test_large_query.ipynb
committed
1 parent 937d7d2 commit eabb6bf

File tree

3 files changed, +746 -0 lines changed

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
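
(The file above is the minimal nbformat 4 skeleton: an empty "cells" list plus the format-version fields. For reference, the nbformat package produces the same structure; a sketch, assuming nbformat is installed and with an illustrative output filename:

import nbformat

nb = nbformat.v4.new_notebook()             # empty "cells", format 4 metadata
nbformat.write(nb, "empty_notebook.ipynb")  # serializes to JSON like the hunk above
)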
Lines changed: 370 additions & 0 deletions
@@ -0,0 +1,370 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b8171e80-141a-4f5a-9427-9459b93f9103",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/kosiew/GitHub/datafusion-python/.venv/bin/python3\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "print(sys.executable)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6f6810fe-6cc5-4277-b314-ea277e61455d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import threading\n",
    "import pyarrow as pa\n",
    "from datafusion import SessionContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "22d46be4-49ff-4a12-93e3-4dfac34e293e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset created successfully!\n"
     ]
    }
   ],
   "source": [
    "def create_large_dataset():\n",
    "    \"\"\"Create a large dataset for testing interruption.\"\"\"\n",
    "    ctx = SessionContext()\n",
    "\n",
    "    # Create large record batches similar to the test\n",
    "    batches = []\n",
    "    for i in range(10):\n",
    "        batch = pa.RecordBatch.from_arrays(\n",
    "            [\n",
    "                pa.array(list(range(i * 1000, (i + 1) * 1000))),\n",
    "                pa.array([f\"value_{j}\" for j in range(i * 1000, (i + 1) * 1000)]),\n",
    "            ],\n",
    "            names=[\"a\", \"b\"],\n",
    "        )\n",
    "        batches.append(batch)\n",
    "\n",
    "    # Register tables\n",
    "    ctx.register_record_batches(\"t1\", [batches])\n",
    "    ctx.register_record_batches(\"t2\", [batches])\n",
    "\n",
    "    return ctx\n",
    "\n",
    "# Setup the test environment\n",
    "ctx = create_large_dataset()\n",
    "print(\"Dataset created successfully!\")"
   ]
  },
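  {
   "cell_type": "markdown",
   "id": "note-register-partitions",
   "metadata": {},
   "source": [
    "Why the extra list above: `register_record_batches` takes `partitions: list[list[pa.RecordBatch]]` (outer list: one entry per partition; inner lists: the batches of that partition), so wrapping `batches` in a list registers a single partition. A minimal sketch; the table name `t_small` is illustrative:\n",
    "\n",
    "```python\n",
    "small = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=[\"a\"])\n",
    "ctx.register_record_batches(\"t_small\", [[small]])  # one partition holding one batch\n",
    "```"
   ]
  },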
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8f31a017-41c5-4222-b63e-c942ddd4d002",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting long-running query...\n",
      "Press Ctrl+C to interrupt!\n",
      "Query completed successfully! Got 2996 batches\n"
     ]
    }
   ],
   "source": [
    "# Create a complex, long-running query\n",
    "df = ctx.sql(\"\"\"\n",
    "    WITH t1_expanded AS (\n",
    "        SELECT\n",
    "            a,\n",
    "            b,\n",
    "            CAST(a AS DOUBLE) / 1.5 AS c,\n",
    "            CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS d\n",
    "        FROM t1\n",
    "        CROSS JOIN (SELECT 1 AS dummy FROM t1 LIMIT 5)\n",
    "    ),\n",
    "    t2_expanded AS (\n",
    "        SELECT\n",
    "            a,\n",
    "            b,\n",
    "            CAST(a AS DOUBLE) * 2.5 AS e,\n",
    "            CAST(a AS DOUBLE) * CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS f\n",
    "        FROM t2\n",
    "        CROSS JOIN (SELECT 1 AS dummy FROM t2 LIMIT 5)\n",
    "    )\n",
    "    SELECT\n",
    "        t1.a, t1.b, t1.c, t1.d,\n",
    "        t2.a AS a2, t2.b AS b2, t2.e, t2.f\n",
    "    FROM t1_expanded t1\n",
    "    JOIN t2_expanded t2 ON t1.a % 100 = t2.a % 100\n",
    "    WHERE t1.a > 100 AND t2.a > 100\n",
    "\"\"\")\n",
    "\n",
    "print(\"Starting long-running query...\")\n",
    "print(\"Press Ctrl+C to interrupt!\")\n",
    "\n",
    "try:\n",
    "    result = df.collect()\n",
    "    print(f\"Query completed successfully! Got {len(result)} batches\")\n",
    "except KeyboardInterrupt:\n",
    "    print(\"✅ Query was successfully interrupted by Ctrl+C!\")\n",
    "except Exception as e:\n",
    "    print(f\"❌ Unexpected error: {e}\")"
   ]
  },
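  {
   "cell_type": "markdown",
   "id": "note-simulated-interrupt",
   "metadata": {},
   "source": [
    "The `threading` import earlier can be used to exercise the `KeyboardInterrupt` path without a human at the keyboard: the stdlib `_thread.interrupt_main()` raises `KeyboardInterrupt` in the main thread, so a timer can stand in for Ctrl+C. A sketch; the 0.5 s delay is arbitrary and a fast query may still finish first:\n",
    "\n",
    "```python\n",
    "import _thread\n",
    "import threading\n",
    "\n",
    "threading.Timer(0.5, _thread.interrupt_main).start()  # simulated Ctrl+C\n",
    "try:\n",
    "    df.collect()\n",
    "    print(\"query finished before the interrupt fired\")\n",
    "except KeyboardInterrupt:\n",
    "    print(\"interrupted as expected\")\n",
    "```"
   ]
  },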
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6ad8d10c-afc2-411b-ad2c-057e1f38c5ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = ctx.sql(\"\"\"\n",
    "    WITH t1_expanded AS (\n",
    "        SELECT\n",
    "            a,\n",
    "            b,\n",
    "            CAST(a AS DOUBLE) / 1.5 AS c,\n",
    "            CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS d\n",
    "        FROM t1\n",
    "        CROSS JOIN (SELECT 1 AS dummy FROM t1 LIMIT 5)\n",
    "    ),\n",
    "    t2_expanded AS (\n",
    "        SELECT\n",
    "            a,\n",
    "            b,\n",
    "            CAST(a AS DOUBLE) * 2.5 AS e,\n",
    "            CAST(a AS DOUBLE) * CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS f\n",
    "        FROM t2\n",
    "        CROSS JOIN (SELECT 1 AS dummy FROM t2 LIMIT 5)\n",
    "    )\n",
    "    SELECT\n",
    "        t1.a, t1.b, t1.c, t1.d,\n",
    "        t2.a AS a2, t2.b AS b2, t2.e, t2.f\n",
    "    FROM t1_expanded t1\n",
    "    JOIN t2_expanded t2 ON t1.a % 100 = t2.a % 100\n",
    "    WHERE t1.a > 100 AND t2.a > 100\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "de500c47-3c2a-4732-8763-82cd3ff69701",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'create_very_large_dataset' is not defined",
     "output_type": "error",
     "traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_very_large_dataset\u001b[49m()\n",
"\u001b[0;31mNameError\u001b[0m: name 'create_very_large_dataset' is not defined"
     ]
    }
   ],
   "source": [
    "ctx = create_very_large_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2bc83e68-c836-4a1e-9e6e-8e0e8212d8ee",
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "argument 'partitions': 'RecordBatch' object cannot be converted to 'PyList'",
     "output_type": "error",
     "traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[8], line 28\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ctx\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# Setup the test environment\u001b[39;00m\n\u001b[0;32m---> 28\u001b[0m ctx \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_very_large_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[8], line 20\u001b[0m, in \u001b[0;36mcreate_very_large_dataset\u001b[0;34m()\u001b[0m\n\u001b[1;32m 17\u001b[0m batches\u001b[38;5;241m.\u001b[39mappend(batch)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Register multiple large tables\u001b[39;00m\n\u001b[0;32m---> 20\u001b[0m \u001b[43mctx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister_record_batches\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlarge_table1\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatches\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m ctx\u001b[38;5;241m.\u001b[39mregister_record_batches(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlarge_table2\u001b[39m\u001b[38;5;124m\"\u001b[39m, batches)\n\u001b[1;32m 22\u001b[0m ctx\u001b[38;5;241m.\u001b[39mregister_record_batches(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlarge_table3\u001b[39m\u001b[38;5;124m\"\u001b[39m, batches)\n",
"File \u001b[0;32m~/GitHub/datafusion-python/python/datafusion/context.py:771\u001b[0m, in \u001b[0;36mSessionContext.register_record_batches\u001b[0;34m(self, name, partitions)\u001b[0m\n\u001b[1;32m 759\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mregister_record_batches\u001b[39m(\n\u001b[1;32m 760\u001b[0m \u001b[38;5;28mself\u001b[39m, name: \u001b[38;5;28mstr\u001b[39m, partitions: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mlist\u001b[39m[pa\u001b[38;5;241m.\u001b[39mRecordBatch]]\n\u001b[1;32m 761\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 762\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Register record batches as a table.\u001b[39;00m\n\u001b[1;32m 763\u001b[0m \n\u001b[1;32m 764\u001b[0m \u001b[38;5;124;03m This function will convert the provided partitions into a table and\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 769\u001b[0m \u001b[38;5;124;03m partitions: Record batches to register as a table.\u001b[39;00m\n\u001b[1;32m 770\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 771\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mctx\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister_record_batches\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpartitions\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mTypeError\u001b[0m: argument 'partitions': 'RecordBatch' object cannot be converted to 'PyList'"
     ]
    }
   ],
   "source": [
    "def create_very_large_dataset():\n",
    "    \"\"\"Create a much larger dataset that will take time to process.\"\"\"\n",
    "    ctx = SessionContext()\n",
    "\n",
    "    # Create much larger record batches\n",
    "    batches = []\n",
    "    for i in range(100):  # Increased from 10 to 100\n",
    "        batch = pa.RecordBatch.from_arrays(\n",
    "            [\n",
    "                pa.array(list(range(i * 10000, (i + 1) * 10000))),  # 10k rows per batch\n",
    "                pa.array([f\"value_{j}\" for j in range(i * 10000, (i + 1) * 10000)]),\n",
    "                pa.array([j * 1.5 for j in range(i * 10000, (i + 1) * 10000)]),  # Float column\n",
    "                pa.array([f\"category_{j % 1000}\" for j in range(i * 10000, (i + 1) * 10000)]),  # Categories\n",
    "            ],\n",
    "            names=[\"id\", \"text_col\", \"float_col\", \"category\"],\n",
    "        )\n",
    "        batches.append(batch)\n",
    "\n",
    "    # Register multiple large tables\n",
    "    ctx.register_record_batches(\"large_table1\", batches)\n",
    "    ctx.register_record_batches(\"large_table2\", batches)\n",
    "    ctx.register_record_batches(\"large_table3\", batches)\n",
    "\n",
    "    print(f\"Created dataset with {len(batches)} batches, ~{len(batches) * 10000:,} rows each\")\n",
    "    return ctx\n",
    "\n",
    "# Setup the test environment\n",
    "ctx = create_very_large_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "76686311-ea6f-4f24-9870-543837a387bf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created dataset with 100 batches, ~1,000,000 rows each\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import pyarrow as pa\n",
    "from datafusion import SessionContext\n",
    "\n",
    "def create_very_large_dataset():\n",
    "    \"\"\"Create a much larger dataset that will take time to process.\"\"\"\n",
    "    ctx = SessionContext()\n",
    "\n",
    "    # Create much larger record batches\n",
    "    batches = []\n",
    "    for i in range(100):  # Increased from 10 to 100\n",
    "        batch = pa.RecordBatch.from_arrays(\n",
    "            [\n",
    "                pa.array(list(range(i * 10000, (i + 1) * 10000))),  # 10k rows per batch\n",
    "                pa.array([f\"value_{j}\" for j in range(i * 10000, (i + 1) * 10000)]),\n",
    "                pa.array([j * 1.5 for j in range(i * 10000, (i + 1) * 10000)]),  # Float column\n",
    "                pa.array([f\"category_{j % 1000}\" for j in range(i * 10000, (i + 1) * 10000)]),  # Categories\n",
    "            ],\n",
    "            names=[\"id\", \"text_col\", \"float_col\", \"category\"],\n",
    "        )\n",
    "        batches.append(batch)\n",
    "\n",
    "    # Fix: Register multiple large tables - wrap batches in a list for partitions\n",
    "    ctx.register_record_batches(\"large_table1\", [batches])  # List of partitions\n",
    "    ctx.register_record_batches(\"large_table2\", [batches])  # List of partitions\n",
    "    ctx.register_record_batches(\"large_table3\", [batches])  # List of partitions\n",
    "\n",
    "    print(f\"Created dataset with {len(batches)} batches, ~{len(batches) * 10000:,} rows each\")\n",
    "    return ctx\n",
    "\n",
    "# Setup the test environment\n",
    "ctx = create_very_large_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a163d524-73e7-4905-b3b1-1c83c3001572",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting cartesian product query...\n",
      "Press Ctrl+C to interrupt!\n",
      "Query completed! Got 110 batches\n"
     ]
    }
   ],
   "source": [
    "# This will definitely be slow enough to interrupt\n",
    "df = ctx.sql(\"\"\"\n",
    "    SELECT\n",
    "        t1.id,\n",
    "        t2.id AS id2,\n",
    "        t1.float_col * t2.float_col AS product,\n",
    "        CONCAT(t1.text_col, '_', t2.text_col) AS combined_text,\n",
    "        SIN(t1.float_col) + COS(t2.float_col) AS trig_calc,\n",
    "        CASE\n",
    "            WHEN t1.id % 2 = 0 THEN 'even'\n",
    "            ELSE 'odd'\n",
    "        END AS parity\n",
    "    FROM large_table1 t1\n",
    "    CROSS JOIN large_table2 t2\n",
    "    WHERE t1.id BETWEEN 1000 AND 5000\n",
    "      AND t2.id BETWEEN 1500 AND 5500\n",
    "    ORDER BY product DESC\n",
    "    LIMIT 900000\n",
    "\"\"\")\n",
    "\n",
    "print(\"Starting cartesian product query...\")\n",
    "print(\"Press Ctrl+C to interrupt!\")\n",
    "\n",
    "try:\n",
    "    result = df.collect()\n",
    "    print(f\"Query completed! Got {len(result)} batches\")\n",
    "except KeyboardInterrupt:\n",
    "    print(\"✅ Query was successfully interrupted by Ctrl+C!\")\n",
    "except Exception as e:\n",
    "    print(f\"Error: {e}\")"
   ]
  },
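  {
   "cell_type": "markdown",
   "id": "note-join-size",
   "metadata": {},
   "source": [
    "A rough size check for the query above: `BETWEEN` is inclusive, so each side keeps 4001 distinct ids, and the `CROSS JOIN` must materialize about 16 million rows that all feed the `ORDER BY` before `LIMIT` can apply. A sketch of the arithmetic:\n",
    "\n",
    "```python\n",
    "t1_rows = 5000 - 1000 + 1  # 4001 ids from large_table1\n",
    "t2_rows = 5500 - 1500 + 1  # 4001 ids from large_table2\n",
    "print(f\"{t1_rows * t2_rows:,} joined rows feed the sort\")  # 16,008,001\n",
    "```"
   ]
  },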
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec71bab4-b79e-4ca0-b08e-bec540522784",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
