
Commit 31dfc1b

Merge branch 'main' into drop-python-39
2 parents ee5a486 + bc2aa00 commit 31dfc1b

27 files changed (+666 -250 lines)

.github/workflows/python-ci.yml

Lines changed: 3 additions & 0 deletions
@@ -75,6 +75,9 @@ jobs:
 
     steps:
       - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
       - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
      - name: Install

Makefile

Lines changed: 2 additions & 4 deletions
@@ -100,10 +100,8 @@ test-integration: test-integration-setup test-integration-exec test-integration-
 test-integration-setup: ## Start Docker services for integration tests
 	docker compose -f dev/docker-compose-integration.yml kill
 	docker compose -f dev/docker-compose-integration.yml rm -f
-	docker compose -f dev/docker-compose-integration.yml up -d
-	sleep 10
-	docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py
-	docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	docker compose -f dev/docker-compose-integration.yml up -d --wait
+	$(POETRY) run python dev/provision.py
 
 test-integration-exec: ## Run integration tests (excluding provision)
 	$(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS)

dev/Dockerfile

Lines changed: 0 additions & 98 deletions
This file was deleted.

dev/docker-compose-integration.yml

Lines changed: 8 additions & 3 deletions
@@ -17,9 +17,8 @@
 
 services:
   spark-iceberg:
-    image: python-integration
     container_name: pyiceberg-spark
-    build: .
+    build: spark/
     networks:
       iceberg_net:
     depends_on:
@@ -37,6 +36,12 @@ services:
       - rest:rest
       - hive:hive
       - minio:minio
+    healthcheck:
+      test: ["CMD", "sh", "-c", "netstat -an | grep 15002 | grep LISTEN"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 90s
   rest:
     image: apache/iceberg-rest-fixture
     container_name: pyiceberg-rest
@@ -87,7 +92,7 @@ services:
       "
  hive:
    build: hive/
-    container_name: hive
+    container_name: pyiceberg-hive
    hostname: hive
    networks:
      iceberg_net:
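
The healthcheck above is what "docker compose up -d --wait" in the Makefile blocks on: the spark-iceberg container only becomes healthy once something is listening on the Spark Connect port 15002. A minimal host-side sketch of the same readiness check (not part of this commit; it assumes the compose stack publishes 15002 on localhost):

# Hypothetical helper, not part of this commit: wait until the Spark Connect
# port published by the spark-iceberg service accepts TCP connections.
import socket
import time


def wait_for_spark_connect(host="localhost", port=15002, timeout=180.0):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # A successful connect means the server is listening, which is the
            # same condition the netstat healthcheck tests inside the container.
            with socket.create_connection((host, port), timeout=2):
                return
        except OSError:
            time.sleep(2)
    raise TimeoutError(f"Spark Connect not reachable on {host}:{port} after {timeout}s")


if __name__ == "__main__":
    wait_for_spark_connect()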

dev/entrypoint.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

dev/provision.py

Lines changed: 21 additions & 35 deletions
@@ -14,7 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import math
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
@@ -23,35 +22,26 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import FixedType, NestedField, UUIDType
 
-# The configuration is important, otherwise we get many small
-# parquet files with a single row. When a positional delete
-# hits the Parquet file with one row, the parquet file gets
-# dropped instead of having a merge-on-read delete file.
-spark = (
-    SparkSession
-    .builder
-    .config("spark.sql.shuffle.partitions", "1")
-    .config("spark.default.parallelism", "1")
-    .getOrCreate()
-)
+# Create SparkSession against the remote Spark Connect server
+spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
 
 catalogs = {
-    'rest': load_catalog(
+    "rest": load_catalog(
         "rest",
         **{
             "type": "rest",
-            "uri": "http://rest:8181",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "http://localhost:8181",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
     ),
-    'hive': load_catalog(
+    "hive": load_catalog(
         "hive",
         **{
             "type": "hive",
-            "uri": "thrift://hive:9083",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "thrift://localhost:9083",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
@@ -119,7 +109,7 @@
     # v3: Using deletion vectors
 
     for format_version in [2, 3]:
-        identifier = f'{catalog_name}.default.test_positional_mor_deletes_v{format_version}'
+        identifier = f"{catalog_name}.default.test_positional_mor_deletes_v{format_version}"
         spark.sql(
             f"""
             CREATE OR REPLACE TABLE {identifier} (
@@ -137,10 +127,8 @@
             """
         )
 
-        spark.sql(
-            f"""
-            INSERT INTO {identifier}
-            VALUES
+        spark.sql("""
+            SELECT * FROM VALUES
             (CAST('2023-03-01' AS date), 1, 'a'),
             (CAST('2023-03-02' AS date), 2, 'b'),
             (CAST('2023-03-03' AS date), 3, 'c'),
@@ -152,9 +140,9 @@
             (CAST('2023-03-09' AS date), 9, 'i'),
             (CAST('2023-03-10' AS date), 10, 'j'),
             (CAST('2023-03-11' AS date), 11, 'k'),
-            (CAST('2023-03-12' AS date), 12, 'l');
-            """
-        )
+            (CAST('2023-03-12' AS date), 12, 'l')
+            AS t(dt, number, letter)
+        """).coalesce(1).writeTo(identifier).append()
 
         spark.sql(f"ALTER TABLE {identifier} CREATE TAG tag_12")
 
@@ -164,7 +152,7 @@
 
         spark.sql(f"DELETE FROM {identifier} WHERE number = 9")
 
-        identifier = f'{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}'
+        identifier = f"{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}"
 
         spark.sql(
             f"""
@@ -178,15 +166,13 @@
             'write.delete.mode'='merge-on-read',
             'write.update.mode'='merge-on-read',
             'write.merge.mode'='merge-on-read',
-            'format-version'='2'
+            'format-version'='{format_version}'
             );
             """
         )
 
-        spark.sql(
-            f"""
-            INSERT INTO {identifier}
-            VALUES
+        spark.sql("""
+            SELECT * FROM VALUES
             (CAST('2023-03-01' AS date), 1, 'a'),
             (CAST('2023-03-02' AS date), 2, 'b'),
             (CAST('2023-03-03' AS date), 3, 'c'),
@@ -198,9 +184,9 @@
             (CAST('2023-03-09' AS date), 9, 'i'),
             (CAST('2023-03-10' AS date), 10, 'j'),
             (CAST('2023-03-11' AS date), 11, 'k'),
-            (CAST('2023-03-12' AS date), 12, 'l');
-            """
-        )
+            (CAST('2023-03-12' AS date), 12, 'l')
+            AS t(dt, number, letter)
+        """).coalesce(1).writeTo(identifier).append()
 
         # Perform two deletes, should produce:
         # v2: two positional delete files in v2
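
Because provisioning now runs on the host against localhost endpoints, the same endpoints can be used to sanity-check the result with PyIceberg once dev/provision.py finishes. A small read-back sketch (an assumption-laden illustration, not part of this commit; it assumes the compose stack is up and provisioning has completed, and reuses the table name created above):

from pyiceberg.catalog import load_catalog

# Same REST catalog properties that provision.py now uses from the host.
catalog = load_catalog(
    "rest",
    **{
        "type": "rest",
        "uri": "http://localhost:8181",
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
    },
)

# One of the merge-on-read tables created above; the scan applies the
# positional delete files written by Spark.
table = catalog.load_table("default.test_positional_mor_deletes_v2")
print(table.scan().to_arrow().num_rows)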

dev/spark/Dockerfile

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG BASE_IMAGE_SPARK_VERSION=4.0.1
+
+FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
+
+# Dependency versions - keep these compatible
+ARG ICEBERG_VERSION=1.10.0
+ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
+ARG SPARK_VERSION=4.0.1
+ARG HADOOP_VERSION=3.4.1
+ARG SCALA_VERSION=2.13
+ARG AWS_SDK_VERSION=2.24.6
+ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
+
+USER root
+WORKDIR ${SPARK_HOME}
+
+# Install curl for JAR downloads
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy configuration (early for better caching)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
+
+# Create event log directory
+RUN mkdir -p /home/iceberg/spark-events && \
+    chown -R spark:spark /home/iceberg
+
+# Required JAR dependencies
+ENV JARS_TO_DOWNLOAD="\
+    org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
+    org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
+    org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
+    org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
+    software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
+
+# Download JARs with retry logic
+RUN set -e && \
+    cd "${SPARK_HOME}/jars" && \
+    for jar_path in ${JARS_TO_DOWNLOAD}; do \
+        jar_name=$(basename "${jar_path}") && \
+        echo "Downloading ${jar_name}..." && \
+        curl -fsSL --retry 3 --retry-delay 5 \
+            -o "${jar_name}" \
+            "${MAVEN_MIRROR}/${jar_path}" && \
+        echo "✓ Downloaded ${jar_name}"; \
+    done && \
+    chown -R spark:spark "${SPARK_HOME}/jars"
+
+USER spark
+WORKDIR ${SPARK_HOME}
+
+# Start Spark Connect server
+CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,5 @@ spark.sql.defaultCatalog rest
4848
spark.ui.enabled true
4949
spark.eventLog.enabled true
5050
spark.eventLog.dir /home/iceberg/spark-events
51+
52+
spark.sql.ansi.enabled false
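
Spark 4.x enables ANSI SQL mode by default; this setting keeps the older, permissive behavior for the integration stack. A short illustration of what changes (a sketch assuming a session against the Spark Connect server above):

# Hedged illustration: with spark.sql.ansi.enabled=false, an invalid cast
# degrades to NULL instead of raising a runtime error.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", "false")
spark.sql("SELECT CAST('abc' AS INT) AS v").show()  # prints NULL rather than failing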
