Skip to content

Commit 6c38260

Browse files
build: Uplift supported python version to 3.11
BREAKING CHANGE: - Uplifted pyspark to 3.4 - Uplifted polars to 0.20 - Uplifted boto3 and botocore to 1.34 - Uplifted delta-spark to 2.4 These upgrades have resulted in a number of code changes which cannot be supported in an older version of DVE (1.0).
1 parent 970f0ef commit 6c38260

File tree

11 files changed

+88
-62
lines changed

11 files changed

+88
-62
lines changed

.mise.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[tools]
2-
python="3.7.17"
3-
poetry="1.4.2"
2+
python="3.11"
3+
poetry="2.2"
44
java="liberica-1.8.0"

.tool-versions

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
python 3.7.17
2-
poetry 1.4.2
1+
python 3.11
2+
poetry 2.2
33
java liberica-1.8.0

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ activate = poetry run
33
# dev
44
install:
55
poetry lock
6-
poetry install --with dev,test
6+
poetry install --with dev
77

88
# dist
99
wheel:

pyproject.toml

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,45 +16,59 @@ classifiers = [
1616
]
1717

1818
[tool.poetry.dependencies]
19-
python = ">=3.7.2,<3.8"
20-
boto3 = "1.28.47" # Boto3 will no longer support Python 3.7 starting December 13, 2023
21-
botocore = "1.31.47"
22-
delta-spark = "1.1.0"
19+
python = ">=3.11,<3.12"
20+
boto3 = "1.34.162"
21+
botocore = "1.34.162"
22+
delta-spark = "2.4.0"
2323
duckdb = "1.1.0" # mitigates security vuln in < 1.1.0
2424
formulas = "1.2.4"
2525
idna = "3.7" # Downstream dep of requests but has security vuln < 3.7
2626
Jinja2 = "3.1.6" # mitigates security vuln in < 3.1.6
2727
lxml = "4.9.1"
2828
openpyxl = "3.1.0"
29-
pandas = "1.3.5"
30-
polars = "0.17.14"
31-
pyarrow = "7.0.0"
29+
pandas = "2.2.2"
30+
polars = "0.20.14"
31+
pyarrow = "17.0.0"
3232
pydantic = "1.10.15" # Mitigates security vuln in < 1.10.13
3333
pymongo = "4.6.3"
34-
pyspark = "3.2.1"
34+
pyspark = "3.4.4"
3535
pytz = "2022.1"
36-
PyYAML = "5.4"
37-
requests = "2.31.0"
36+
PyYAML = "6.0.3"
37+
requests = "2.32.4" # Mitigates security vuln in < 2.32.4
3838
schedula = "1.2.19"
3939
sqlalchemy = "2.0.19"
4040
typing_extensions = "4.6.2"
41-
urllib3 = "1.26.19" # Used transiently, but has security vuln < 1.26.19
41+
urllib3 = "2.5.0" # Mitigates security vuln in < 2.5.0
4242
xmltodict = "0.13.0"
4343

44+
[tool.poetry.group.dev]
45+
optional = true
46+
include-groups = [
47+
"test",
48+
"lint"
49+
]
50+
4451
[tool.poetry.group.dev.dependencies]
45-
commitizen = "3.9.1" # latest version to support Python 3.7.17
46-
pre-commit = "2.21.0" # latest version to support Python 3.7.17
52+
commitizen = "4.9.1"
53+
pre-commit = "4.3.0"
54+
55+
[tool.poetry.group.test]
56+
optional = true
4757

4858
[tool.poetry.group.test.dependencies]
4959
faker = "18.11.1"
50-
behave = "1.2.6"
51-
coverage = "6.4.3"
52-
moto = {extras = ["s3"], version = "3.1.18"}
60+
behave = "1.3.3"
61+
coverage = "7.11.0"
62+
moto = {extras = ["s3"], version = "4.0.13"}
63+
Werkzeug = "3.0.6" # Dependency of moto which needs 3.0.6 for security vuln mitigation
5364
mongomock = "4.1.2"
54-
pytest = "7.4.4"
55-
pytest-lazy-fixture = "0.6.3"
65+
pytest = "8.4.2"
66+
pytest-lazy-fixtures = "1.4.0" # switched from https://github.com/TvoroG/pytest-lazy-fixture as it's no longer supported
5667
xlsx2csv = "0.8.2"
5768

69+
[tool.poetry.group.lint]
70+
optional = true
71+
5872
[tool.poetry.group.lint.dependencies]
5973
black = "22.6.0"
6074
astroid = "2.11.7"

src/dve/pipeline/pipeline.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -639,9 +639,10 @@ def _get_error_dataframes(self, submission_id: str):
639639

640640
df = pl.DataFrame(errors, schema={key: pl.Utf8() for key in errors[0]}) # type: ignore
641641
df = df.with_columns(
642-
error_type=pl.when(pl.col("Status") == "error") # type: ignore
643-
.then("Submission Failure")
644-
.otherwise("Warning")
642+
pl.when(pl.col("Status") == pl.lit("error")) # type: ignore
643+
.then(pl.lit("Submission Failure"))
644+
.otherwise(pl.lit("Warning"))
645+
.alias("error_type")
645646
)
646647
df = df.select(
647648
pl.col("Entity").alias("Table"), # type: ignore
@@ -677,7 +678,7 @@ def error_report(self, submission_info: SubmissionInfo, status: SubmissionStatus
677678
else:
678679
err_types = {
679680
rw.get("Type"): rw.get("Count")
680-
for rw in aggregates.groupby(pl.col("Type")) # type: ignore
681+
for rw in aggregates.group_by(pl.col("Type")) # type: ignore
681682
.agg(pl.col("Count").sum()) # type: ignore
682683
.iter_rows(named=True)
683684
}

src/dve/reporting/error_report.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,11 @@ def conditional_cast(value, primary_keys: List[str], value_separator: str) -> Un
6666

6767
def _convert_inner_dict(error: FeedbackMessage, key_fields):
6868
return {
69-
key: str(conditional_cast(value, key_fields.get(error.entity, ""), " -- "))
70-
if value is not None
71-
else None
69+
key: (
70+
str(conditional_cast(value, key_fields.get(error.entity, ""), " -- "))
71+
if value is not None
72+
else None
73+
)
7274
for key, value in error.to_dict(
7375
key_fields.get(error.entity),
7476
max_number_of_values=10,
@@ -97,9 +99,10 @@ def create_error_dataframe(errors: Deque[FeedbackMessage], key_fields):
9799
)
98100

99101
df = df.with_columns(
100-
error_type=pl.when(col("Status") == "error") # type: ignore
101-
.then("Submission Failure")
102-
.otherwise("Warning")
102+
pl.when(pl.col("Status") == pl.lit("error"))
103+
.then(pl.lit("Submission Failure"))
104+
.otherwise(pl.lit("Warning"))
105+
.alias("error_type")
103106
)
104107
df = df.select(
105108
col("Entity").alias("Table"),
@@ -128,20 +131,27 @@ def calculate_aggregates(error_frame: DataFrame) -> DataFrame:
128131
if error_frame.is_empty():
129132
return DataFrame({}, schema=AGGREGATE_SCHEMA)
130133
aggregates = (
131-
error_frame.lazy() # type: ignore
132-
.groupby(["Table", "Type", "Data_Item", "Error_Code", "Category"])
133-
.agg(count("*"))
134+
error_frame.group_by(
135+
[
136+
pl.col("Table"),
137+
pl.col("Type"),
138+
pl.col("Data_Item"),
139+
pl.col("Error_Code"),
140+
pl.col("Category"),
141+
]
142+
)
143+
.agg(pl.len())
134144
.select( # type: ignore
135-
"Type",
136-
"Table",
137-
"Data_Item",
138-
"Category",
139-
"Error_Code",
140-
col("Value").alias("Count"),
145+
pl.col("Type"),
146+
pl.col("Table"),
147+
pl.col("Data_Item"),
148+
pl.col("Category"),
149+
pl.col("Error_Code"),
150+
pl.col("len").alias("Count"),
141151
)
142-
.sort("Type", "Count", descending=[False, True])
152+
.sort(pl.col("Type"), pl.col("Count"), descending=[False, True])
143153
)
144-
return aggregates.collect() # type: ignore
154+
return aggregates
145155

146156

147157
def generate_report_dataframes(

src/dve/reporting/excel_report.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def create_summary_sheet(
6666

6767
error_summary = (
6868
# chaining methods on dataframes seems to confuse mypy
69-
aggregates.groupby(groups).agg(*self.aggregations) # type: ignore
69+
aggregates.group_by(groups).agg(*self.aggregations) # type: ignore
7070
)
7171

7272
try:
@@ -207,7 +207,7 @@ def create_summary_sheet(
207207

208208
error_summary = (
209209
# chaining methods on dataframes seems to confuse mypy
210-
aggregates.groupby(groups).agg(*self.aggregations) # type: ignore
210+
aggregates.group_by(groups).agg(*self.aggregations) # type: ignore
211211
)
212212
tables = [table for table in tables if table is not None]
213213
column = self.partition_key

tests/features/patches.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def get_spark_session() -> SparkSession:
8989
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
9090
[
9191
"--packages",
92-
"com.databricks:spark-xml_2.12:0.16.0,io.delta:delta-core_2.12:1.1.0",
92+
"com.databricks:spark-xml_2.12:0.16.0,io.delta:delta-core_2.12:2.4.0",
9393
"pyspark-shell",
9494
]
9595
)

tests/test_parser/test_file_handling.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import boto3
1212
import pytest
13+
from pytest_lazy_fixtures import lf as lazy_fixture
1314
from typing_extensions import Literal
1415

1516
from dve.parser.exceptions import FileAccessError, LogDataLossWarning
@@ -68,10 +69,10 @@ def test_s3_uri_raises_missing_bucket():
6869
@pytest.mark.parametrize(
6970
"prefix",
7071
[
71-
pytest.lazy_fixture("temp_prefix"),
72-
pytest.lazy_fixture("temp_s3_prefix"),
73-
pytest.lazy_fixture("temp_dbfs_prefix"),
74-
], # type: ignore
72+
lazy_fixture("temp_prefix"),
73+
lazy_fixture("temp_s3_prefix"),
74+
lazy_fixture("temp_dbfs_prefix"),
75+
], # type: ignore # pylint: disable=E1102
7576
)
7677
class TestParametrizedFileInteractions:
7778
"""Tests which involve S3 and local filesystem."""
@@ -436,10 +437,10 @@ def test_filename_resolver_linux(uri, expected):
436437
@pytest.mark.parametrize(
437438
["source_prefix", "target_prefix"],
438439
[
439-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
440-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
441-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
442-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
440+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_prefix")), # type: ignore
441+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
442+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
443+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_prefix")), # type: ignore
443444
],
444445
)
445446
def test_copy_move_resource(
@@ -476,11 +477,11 @@ def test_copy_move_resource(
476477
@pytest.mark.parametrize(
477478
["source_prefix", "target_prefix"],
478479
[
479-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
480-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
481-
(pytest.lazy_fixture("temp_prefix"), pytest.lazy_fixture("temp_s3_prefix")), # type: ignore
482-
(pytest.lazy_fixture("temp_s3_prefix"), pytest.lazy_fixture("temp_prefix")), # type: ignore
483-
],
480+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_prefix")), # type: ignore
481+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
482+
(lazy_fixture("temp_prefix"), lazy_fixture("temp_s3_prefix")), # type: ignore
483+
(lazy_fixture("temp_s3_prefix"), lazy_fixture("temp_prefix")), # type: ignore
484+
], # pylint: disable=E1102
484485
)
485486
def test_copy_move_prefix(source_prefix: str, target_prefix: str, action: Literal["copy", "move"]):
486487
"""Test that resources can be copied and moved."""

tests/test_pipeline/pipeline_helpers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,4 +393,4 @@ def error_data_after_business_rules() -> Iterator[Tuple[SubmissionInfo, str]]:
393393

394394

395395
def pl_row_count(df: pl.DataFrame) -> int:
396-
return df.select(pl.count()).to_dicts()[0]["count"]
396+
return df.select(pl.len()).to_dicts()[0]["len"]

0 commit comments

Comments
 (0)