Skip to content

Commit 6c38260

Browse files
build: Uplift supported python version to 3.11
BREAKING CHANGE: - Uplifted pyspark to 3.4 - Uplifted polars to 0.20 - Uplifted boto3 and botocore to 1.34 - Uplifted delta-spark to 2.4 These upgrades have resulted in a number of code changes which cannot be supported in an older version of DVE (1.0).
1 parent 970f0ef commit 6c38260

File tree

11 files changed

+88
-62
lines changed

11 files changed

+88
-62
lines changed

.mise.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[tools]
2-
python="3.7.17"
3-
poetry="1.4.2"
2+
python="3.11"
3+
poetry="2.2"
44
java="liberica-1.8.0"

.tool-versions

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
python 3.7.17
2-
poetry 1.4.2
1+
python 3.11
2+
poetry 2.2
33
java liberica-1.8.0

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ activate = poetry run
33
# dev
44
install:
55
poetry lock
6-
poetry install --with dev,test
6+
poetry install --with dev
77

88
# dist
99
wheel:

pyproject.toml

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,45 +16,59 @@ classifiers = [
1616
]
1717

1818
[tool.poetry.dependencies]
19-
python = ">=3.7.2,<3.8"
20-
boto3 = "1.28.47" # Boto3 will no longer support Python 3.7 starting December 13, 2023
21-
botocore = "1.31.47"
22-
delta-spark = "1.1.0"
19+
python = ">=3.11,<3.12"
20+
boto3 = "1.34.162"
21+
botocore = "1.34.162"
22+
delta-spark = "2.4.0"
2323
duckdb = "1.1.0" # mitigates security vuln in < 1.1.0
2424
formulas = "1.2.4"
2525
idna = "3.7" # Downstream dep of requests but has security vuln < 3.7
2626
Jinja2 = "3.1.6" # mitigates security vuln in < 3.1.6
2727
lxml = "4.9.1"
2828
openpyxl = "3.1.0"
29-
pandas = "1.3.5"
30-
polars = "0.17.14"
31-
pyarrow = "7.0.0"
29+
pandas = "2.2.2"
30+
polars = "0.20.14"
31+
pyarrow = "17.0.0"
3232
pydantic = "1.10.15" # Mitigates security vuln in < 1.10.13
3333
pymongo = "4.6.3"
34-
pyspark = "3.2.1"
34+
pyspark = "3.4.4"
3535
pytz = "2022.1"
36-
PyYAML = "5.4"
37-
requests = "2.31.0"
36+
PyYAML = "6.0.3"
37+
requests = "2.32.4" # Mitigates security vuln in < 2.32.4
3838
schedula = "1.2.19"
3939
sqlalchemy = "2.0.19"
4040
typing_extensions = "4.6.2"
41-
urllib3 = "1.26.19" # Used transiently, but has security vuln < 1.26.19
41+
urllib3 = "2.5.0" # Mitigates security vuln in < 2.5.0
4242
xmltodict = "0.13.0"
4343

44+
[tool.poetry.group.dev]
45+
optional = true
46+
include-groups = [
47+
"test",
48+
"lint"
49+
]
50+
4451
[tool.poetry.group.dev.dependencies]
45-
commitizen = "3.9.1" # latest version to support Python 3.7.17
46-
pre-commit = "2.21.0" # latest version to support Python 3.7.17
52+
commitizen = "4.9.1"
53+
pre-commit = "4.3.0"
54+
55+
[tool.poetry.group.test]
56+
optional = true
4757

4858
[tool.poetry.group.test.dependencies]
4959
faker = "18.11.1"
50-
behave = "1.2.6"
51-
coverage = "6.4.3"
52-
moto = {extras = ["s3"], version = "3.1.18"}
60+
behave = "1.3.3"
61+
coverage = "7.11.0"
62+
moto = {extras = ["s3"], version = "4.0.13"}
63+
Werkzeug = "3.0.6" # Dependency of moto which needs 3.0.6 for security vuln mitigation
5364
mongomock = "4.1.2"
54-
pytest = "7.4.4"
55-
pytest-lazy-fixture = "0.6.3"
65+
pytest = "8.4.2"
66+
pytest-lazy-fixtures = "1.4.0" # switched from https://github.com/TvoroG/pytest-lazy-fixture as it's no longer supported
5667
xlsx2csv = "0.8.2"
5768

69+
[tool.poetry.group.lint]
70+
optional = true
71+
5872
[tool.poetry.group.lint.dependencies]
5973
black = "22.6.0"
6074
astroid = "2.11.7"

src/dve/pipeline/pipeline.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -639,9 +639,10 @@ def _get_error_dataframes(self, submission_id: str):
639639

640640
df = pl.DataFrame(errors, schema={key: pl.Utf8() for key in errors[0]}) # type: ignore
641641
df = df.with_columns(
642-
error_type=pl.when(pl.col("Status") == "error") # type: ignore
643-
.then("Submission Failure")
644-
.otherwise("Warning")
642+
pl.when(pl.col("Status") == pl.lit("error")) # type: ignore
643+
.then(pl.lit("Submission Failure"))
644+
.otherwise(pl.lit("Warning"))
645+
.alias("error_type")
645646
)
646647
df = df.select(
647648
pl.col("Entity").alias("Table"), # type: ignore
@@ -677,7 +678,7 @@ def error_report(self, submission_info: SubmissionInfo, status: SubmissionStatus
677678
else:
678679
err_types = {
679680
rw.get("Type"): rw.get("Count")
680-
for rw in aggregates.groupby(pl.col("Type")) # type: ignore
681+
for rw in aggregates.group_by(pl.col("Type")) # type: ignore
681682
.agg(pl.col("Count").sum()) # type: ignore
682683
.iter_rows(named=True)
683684
}

src/dve/reporting/error_report.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,11 @@ def conditional_cast(value, primary_keys: List[str], value_separator: str) -> Un
6666

6767
def _convert_inner_dict(error: FeedbackMessage, key_fields):
6868
return {
69-
key: str(conditional_cast(value, key_fields.get(error.entity, ""), " -- "))
70-
if value is not None
71-
else None
69+
key: (
70+
str(conditional_cast(value, key_fields.get(error.entity, ""), " -- "))
71+
if value is not None
72+
else None
73+
)
7274
for key, value in error.to_dict(
7375
key_fields.get(error.entity),
7476
max_number_of_values=10,
@@ -97,9 +99,10 @@ def create_error_dataframe(errors: Deque[FeedbackMessage], key_fields):
9799
)
98100

99101
df = df.with_columns(
100-
error_type=pl.when(col("Status") == "error") # type: ignore
101-
.then("Submission Failure")
102-
.otherwise("Warning")
102+
pl.when(pl.col("Status") == pl.lit("error"))
103+
.then(pl.lit("Submission Failure"))
104+
.otherwise(pl.lit("Warning"))
105+
.alias("error_type")
103106
)
104107
df = df.select(
105108
col("Entity").alias("Table"),
@@ -128,20 +131,27 @@ def calculate_aggregates(error_frame: DataFrame) -> DataFrame:
128131
if error_frame.is_empty():
129132
return DataFrame({}, schema=AGGREGATE_SCHEMA)
130133
aggregates = (
131-
error_frame.lazy() # type: ignore
132-
.groupby(["Table", "Type", "Data_Item", "Error_Code", "Category"])
133-
.agg(count("*"))
134+
error_frame.group_by(
135+
[
136+
pl.col("Table"),
137+
pl.col("Type"),
138+
pl.col("Data_Item"),
139+
pl.col("Error_Code"),
140+
pl.col("Category"),
141+
]
142+
)
143+
.agg(pl.len())
134144
.select( # type: ignore
135-
"Type",
136-
"Table",
137-
"Data_Item",
138-
"Category",
139-
"Error_Code",
140-
col("Value").alias("Count"),
145+
pl.col("Type"),
146+
pl.col("Table"),
147+
pl.col("Data_Item"),
148+
pl.col("Category"),
149+
pl.col("Error_Code"),
150+
pl.col("len").alias("Count"),
141151
)
142-
.sort("Type", "Count", descending=[False, True])
152+
.sort(pl.col("Type"), pl.col("Count"), descending=[False, True])
143153
)
144-
return aggregates.collect() # type: ignore
154+
return aggregates
145155

146156

147157
def generate_report_dataframes(

src/dve/reporting/excel_report.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def create_summary_sheet(
6666

6767
error_summary = (
6868
# chaining methods on dataframes seems to confuse mypy
69-
aggregates.groupby(groups).agg(*self.aggregations) # type: ignore
69+
aggregates.group_by(groups).agg(*self.aggregations) # type: ignore
7070
)
7171

7272
try:
@@ -207,7 +207,7 @@ def create_summary_sheet(
207207

208208
error_summary = (
209209
# chaining methods on dataframes seems to confuse mypy
210-
aggregates.groupby(groups).agg(*self.aggregations) # type: ignore
210+
aggregates.group_by(groups).agg(*self.aggregations) # type: ignore
211211
)
212212
tables = [table for table in tables if table is not None]
213213
column = self.partition_key

tests/features/patches.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def get_spark_session() -> SparkSession:
8989
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
9090
[
9191
"--packages",
92-
"com.databricks:spark-xml_2.12:0.16.0,io.delta:delta-core_2.12:1.1.0",
92+
"com.databricks:spark-xml_2.12:0.16.0,io.delta:delta-core_2.12:2.4.0",
9393
"pyspark-shell",
9494
]
9595
)

tests/test_parser/test_file_handling.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import boto3
1212
import pytest
13+
from pytest_lazy_fixtures import lf as lazy_fixture
1314
from typing_extensions import Literal
1415

1516
from dve.parser.exceptions import FileAccessError, LogDataLossWarning
@@ -68,10 +69,10 @@ def test_s3_uri_raises_missing_bucket():
6869
@pytest.mark.parametrize(
6970
"prefix",
7071
[
71-
pytest.lazy_fixture("temp_prefix"),
72-
pytest.lazy_fixture("temp_s3_prefix"),
73-
pytest.lazy_fixture("temp_dbfs_prefix"),
74-
], # type: ignore
72+
lazy_fixture("temp_prefix"),
73+
lazy_fixture("temp_s3_prefix"),
74+
lazy_fixture("temp_dbfs_prefix"),
75+
], # type: ignore # pylint: disable=E1102
7576
)
7677
class TestParametrizedFileInteractions:
7778
"""Tests which involve S3 and local filesystem."""
@@ -436,10 +437,10 @@ def test_filename_resolver_linux(uri, expected):
436437
@pytest.mark.parametrize(
437438
["source_prefix", "target_prefix"],
438439
[
439-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
440-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
441-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
442-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
440+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_prefix")), # type: ignore
441+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
442+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
443+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_prefix")), # type: ignore
443444
],
444445
)
445446
def test_copy_move_resource(
@@ -476,11 +477,11 @@ def test_copy_move_resource(
476477
@pytest.mark.parametrize(
477478
["source_prefix", "target_prefix"],
478479
[
479-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
480-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
481-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
482-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
483-
],
480+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_prefix")), # type: ignore
481+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
482+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
483+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_prefix")), # type: ignore
484+
], # pylint: disable=E1102
484485
)
485486
def test_copy_move_prefix(source_prefix: str, target_prefix: str, action: Literal["copy", "move"]):
486487
"""Test that resources can be copied and moved."""

tests/test_pipeline/pipeline_helpers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,4 +393,4 @@ def error_data_after_business_rules() -> Iterator[Tuple[SubmissionInfo, str]]:
393393

394394

395395
def pl_row_count(df: pl.DataFrame) -> int:
396-
return df.select(pl.count()).to_dicts()[0]["count"]
396+
return df.select(pl.len()).to_dicts()[0]["len"]

0 commit comments

Comments
 (0)