
Commit 31dfc1b

Merge branch 'main' into drop-python-39
2 parents ee5a486 + bc2aa00 commit 31dfc1b

27 files changed (+666 -250 lines)

.github/workflows/python-ci.yml

Lines changed: 3 additions & 0 deletions
@@ -75,6 +75,9 @@ jobs:
 
     steps:
       - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
       - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
      - name: Install

Makefile

Lines changed: 2 additions & 4 deletions
@@ -100,10 +100,8 @@ test-integration: test-integration-setup test-integration-exec test-integration-
 test-integration-setup: ## Start Docker services for integration tests
 	docker compose -f dev/docker-compose-integration.yml kill
 	docker compose -f dev/docker-compose-integration.yml rm -f
-	docker compose -f dev/docker-compose-integration.yml up -d
-	sleep 10
-	docker compose -f dev/docker-compose-integration.yml cp ./dev/provision.py spark-iceberg:/opt/spark/provision.py
-	docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
+	docker compose -f dev/docker-compose-integration.yml up -d --wait
+	$(POETRY) run python dev/provision.py
 
 test-integration-exec: ## Run integration tests (excluding provision)
 	$(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS)

dev/Dockerfile

Lines changed: 0 additions & 98 deletions
This file was deleted.

dev/docker-compose-integration.yml

Lines changed: 8 additions & 3 deletions
@@ -17,9 +17,8 @@
 
 services:
   spark-iceberg:
-    image: python-integration
     container_name: pyiceberg-spark
-    build: .
+    build: spark/
     networks:
       iceberg_net:
     depends_on:
@@ -37,6 +36,12 @@ services:
       - rest:rest
       - hive:hive
       - minio:minio
+    healthcheck:
+      test: ["CMD", "sh", "-c", "netstat -an | grep 15002 | grep LISTEN"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 90s
   rest:
     image: apache/iceberg-rest-fixture
     container_name: pyiceberg-rest
@@ -87,7 +92,7 @@ services:
       "
  hive:
    build: hive/
-    container_name: hive
+    container_name: pyiceberg-hive
    hostname: hive
    networks:
      iceberg_net:
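
The healthcheck above is what "docker compose up -d --wait" in the Makefile blocks on: the spark-iceberg container only becomes healthy once something is listening on the Spark Connect port 15002. A minimal host-side sketch of the same readiness check (not part of this commit; it assumes the compose stack publishes 15002 on localhost):

# Hypothetical helper, not part of this commit: wait until the Spark Connect
# port published by the spark-iceberg service accepts TCP connections.
import socket
import time


def wait_for_spark_connect(host="localhost", port=15002, timeout=180.0):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # A successful connect means the server is listening, which is the
            # same condition the netstat healthcheck tests inside the container.
            with socket.create_connection((host, port), timeout=2):
                return
        except OSError:
            time.sleep(2)
    raise TimeoutError(f"Spark Connect not reachable on {host}:{port} after {timeout}s")


if __name__ == "__main__":
    wait_for_spark_connect()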

dev/entrypoint.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

dev/provision.py

Lines changed: 21 additions & 35 deletions
@@ -14,7 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import math
 
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import current_date, date_add, expr
@@ -23,35 +22,26 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import FixedType, NestedField, UUIDType
 
-# The configuration is important, otherwise we get many small
-# parquet files with a single row. When a positional delete
-# hits the Parquet file with one row, the parquet file gets
-# dropped instead of having a merge-on-read delete file.
-spark = (
-    SparkSession
-    .builder
-    .config("spark.sql.shuffle.partitions", "1")
-    .config("spark.default.parallelism", "1")
-    .getOrCreate()
-)
+# Create SparkSession against the remote Spark Connect server
+spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
 
 catalogs = {
-    'rest': load_catalog(
+    "rest": load_catalog(
         "rest",
         **{
             "type": "rest",
-            "uri": "http://rest:8181",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "http://localhost:8181",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
     ),
-    'hive': load_catalog(
+    "hive": load_catalog(
         "hive",
         **{
             "type": "hive",
-            "uri": "thrift://hive:9083",
-            "s3.endpoint": "http://minio:9000",
+            "uri": "thrift://localhost:9083",
+            "s3.endpoint": "http://localhost:9000",
             "s3.access-key-id": "admin",
             "s3.secret-access-key": "password",
         },
@@ -119,7 +109,7 @@
     # v3: Using deletion vectors
 
     for format_version in [2, 3]:
-        identifier = f'{catalog_name}.default.test_positional_mor_deletes_v{format_version}'
+        identifier = f"{catalog_name}.default.test_positional_mor_deletes_v{format_version}"
         spark.sql(
             f"""
             CREATE OR REPLACE TABLE {identifier} (
@@ -137,10 +127,8 @@
             """
         )
 
-        spark.sql(
-            f"""
-            INSERT INTO {identifier}
-            VALUES
+        spark.sql("""
+            SELECT * FROM VALUES
             (CAST('2023-03-01' AS date), 1, 'a'),
             (CAST('2023-03-02' AS date), 2, 'b'),
             (CAST('2023-03-03' AS date), 3, 'c'),
@@ -152,9 +140,9 @@
             (CAST('2023-03-09' AS date), 9, 'i'),
             (CAST('2023-03-10' AS date), 10, 'j'),
             (CAST('2023-03-11' AS date), 11, 'k'),
-            (CAST('2023-03-12' AS date), 12, 'l');
-            """
-        )
+            (CAST('2023-03-12' AS date), 12, 'l')
+            AS t(dt, number, letter)
+        """).coalesce(1).writeTo(identifier).append()
 
         spark.sql(f"ALTER TABLE {identifier} CREATE TAG tag_12")
 
@@ -164,7 +152,7 @@
 
         spark.sql(f"DELETE FROM {identifier} WHERE number = 9")
 
-        identifier = f'{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}'
+        identifier = f"{catalog_name}.default.test_positional_mor_double_deletes_v{format_version}"
 
         spark.sql(
             f"""
@@ -178,15 +166,13 @@
             'write.delete.mode'='merge-on-read',
             'write.update.mode'='merge-on-read',
             'write.merge.mode'='merge-on-read',
-            'format-version'='2'
+            'format-version'='{format_version}'
             );
             """
         )
 
-        spark.sql(
-            f"""
-            INSERT INTO {identifier}
-            VALUES
+        spark.sql("""
+            SELECT * FROM VALUES
             (CAST('2023-03-01' AS date), 1, 'a'),
             (CAST('2023-03-02' AS date), 2, 'b'),
             (CAST('2023-03-03' AS date), 3, 'c'),
@@ -198,9 +184,9 @@
             (CAST('2023-03-09' AS date), 9, 'i'),
             (CAST('2023-03-10' AS date), 10, 'j'),
             (CAST('2023-03-11' AS date), 11, 'k'),
-            (CAST('2023-03-12' AS date), 12, 'l');
-            """
-        )
+            (CAST('2023-03-12' AS date), 12, 'l')
+            AS t(dt, number, letter)
+        """).coalesce(1).writeTo(identifier).append()
 
         # Perform two deletes, should produce:
         # v2: two positional delete files in v2
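
Because provisioning now runs on the host against localhost endpoints, the same endpoints can be used to sanity-check the result with PyIceberg once dev/provision.py finishes. A small read-back sketch (an assumption-laden illustration, not part of this commit; it assumes the compose stack is up and provisioning has completed, and reuses the table name created above):

from pyiceberg.catalog import load_catalog

# Same REST catalog properties that provision.py now uses from the host.
catalog = load_catalog(
    "rest",
    **{
        "type": "rest",
        "uri": "http://localhost:8181",
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
    },
)

# One of the merge-on-read tables created above; the scan applies the
# positional delete files written by Spark.
table = catalog.load_table("default.test_positional_mor_deletes_v2")
print(table.scan().to_arrow().num_rows)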

dev/spark/Dockerfile

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG BASE_IMAGE_SPARK_VERSION=4.0.1
+
+FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
+
+# Dependency versions - keep these compatible
+ARG ICEBERG_VERSION=1.10.0
+ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
+ARG SPARK_VERSION=4.0.1
+ARG HADOOP_VERSION=3.4.1
+ARG SCALA_VERSION=2.13
+ARG AWS_SDK_VERSION=2.24.6
+ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
+
+USER root
+WORKDIR ${SPARK_HOME}
+
+# Install curl for JAR downloads
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy configuration (early for better caching)
+COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
+
+# Create event log directory
+RUN mkdir -p /home/iceberg/spark-events && \
+    chown -R spark:spark /home/iceberg
+
+# Required JAR dependencies
+ENV JARS_TO_DOWNLOAD="\
+    org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
+    org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
+    org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
+    org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
+    software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"
+
+# Download JARs with retry logic
+RUN set -e && \
+    cd "${SPARK_HOME}/jars" && \
+    for jar_path in ${JARS_TO_DOWNLOAD}; do \
+        jar_name=$(basename "${jar_path}") && \
+        echo "Downloading ${jar_name}..." && \
+        curl -fsSL --retry 3 --retry-delay 5 \
+            -o "${jar_name}" \
+            "${MAVEN_MIRROR}/${jar_path}" && \
+        echo "✓ Downloaded ${jar_name}"; \
+    done && \
+    chown -R spark:spark "${SPARK_HOME}/jars"
+
+USER spark
+WORKDIR ${SPARK_HOME}
+
+# Start Spark Connect server
+CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,5 @@ spark.sql.defaultCatalog rest
4848
spark.ui.enabled true
4949
spark.eventLog.enabled true
5050
spark.eventLog.dir /home/iceberg/spark-events
51+
52+
spark.sql.ansi.enabled false
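
Spark 4.x enables ANSI SQL mode by default; this setting keeps the older, permissive behavior for the integration stack. A short illustration of what changes (a sketch assuming a session against the Spark Connect server above):

# Hedged illustration: with spark.sql.ansi.enabled=false, an invalid cast
# degrades to NULL instead of raising a runtime error.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", "false")
spark.sql("SELECT CAST('abc' AS INT) AS v").show()  # prints NULL rather than failing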
