
Commit 4499041

feat: small fixes and movies dataset working up to end of data contract

1 parent: 8c4eab1

5 files changed (+41, -26)

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 2 additions & 2 deletions

@@ -94,8 +94,8 @@ def generate_ddb_cast_statement(
         Current duckdb python API doesn't play well with this currently.
         """
         if not null_flag:
-            return f"try_cast({column_name} AS {dtype}) AS {column_name}"
-        return f"cast(NULL AS {dtype}) AS {column_name}"
+            return f'try_cast("{column_name}" AS {dtype}) AS "{column_name}"'
+        return f'cast(NULL AS {dtype}) AS "{column_name}"'

     def apply_data_contract(
         self, entities: DuckDBEntities, contract_metadata: DataContractMetadata
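
Quoting the column name makes DuckDB treat it as an identifier, so names with mixed case, spaces, or reserved words survive the cast. A minimal sketch of the resulting behaviour using only the duckdb package (the table and column names are invented for illustration):

import duckdb

con = duckdb.connect()  # in-memory database
con.execute('CREATE TABLE movies ("Release Year" VARCHAR)')
con.execute("INSERT INTO movies VALUES ('1999'), ('NOT_A_NUMBER')")

# Mirrors the quoted statement the commit now generates: try_cast yields NULL
# instead of raising when a value cannot be converted.
rows = con.execute(
    'SELECT try_cast("Release Year" AS INTEGER) AS "Release Year" FROM movies'
).fetchall()
print(rows)  # [(1999,), (None,)]

An unquoted column name containing a space would be a parse error here, which is what the change guards against.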

src/dve/core_engine/configuration/v1/__init__.py

Lines changed: 3 additions & 3 deletions

@@ -137,7 +137,7 @@ class V1DataContractConfig(BaseModel):

     cache_originals: bool = False
     """Whether to cache the original entities after loading."""
-    contract_error_message_info: Optional[URI] = None
+    error_details: Optional[URI] = None
     """Optional URI containing custom data contract error codes and messages"""
     types: Dict[TypeName, TypeOrDef] = Field(default_factory=dict)
     """Dataset specific types defined within the config."""

@@ -304,9 +304,9 @@ def get_contract_metadata(self) -> DataContractMetadata:

         contract_dict = self.contract.dict()
         error_info = {}
-        if self.contract.contract_error_message_info:
+        if self.contract.error_details:
             error_info = self.load_error_message_info(
-                self.contract.contract_error_message_info
+                self.contract.error_details
             )
         for entity_name, dataset_config in self.contract.datasets.items():
             reader_metadata[entity_name] = {
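
The rename to error_details shortens the key users write in configs. A minimal sketch of the field's behaviour on a pydantic v1 model, treating URI as a plain string alias (an assumption; the real dve URI type is not shown in this diff):

from typing import Optional

from pydantic import BaseModel

# URI is assumed to be string-like here; dve's actual type may differ.
URI = str

class ContractConfigSketch(BaseModel):
    """Cut-down stand-in for V1DataContractConfig."""
    error_details: Optional[URI] = None

# The field is optional, so existing configs without it keep validating.
cfg = ContractConfigSketch.parse_obj(
    {"error_details": "movies_contract_error_details.json"}
)
assert cfg.error_details == "movies_contract_error_details.json"
assert ContractConfigSketch().error_details is None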

tests/features/movies.feature

Lines changed: 7 additions & 18 deletions

@@ -6,31 +6,20 @@ Feature: Pipeline tests using the movies dataset

     Some validation of entity attributes is performed: SQL expressions and Python filter
     functions are used, and templatable business rules feature in the transformations.
-
+
     Scenario: Validate and filter movies (duckdb)
         Given I submit the movies file movies.json for processing
         And A duckdb pipeline is configured
         And I add initial audit entries for the submission
         Then the latest audit record for the submission is marked with processing status file_transformation
         When I run the file transformation phase
-        Then the planets entity is stored as a parquet after the file_transformation phase
+        Then the movies entity is stored as a parquet after the file_transformation phase
         And the latest audit record for the submission is marked with processing status data_contract
         When I run the data contract phase
-        Then there is 1 record rejection from the data_contract phase
+        Then there are 2 record rejections from the data_contract phase
+        And there are errors with the following details and associated error_count from the data_contract phase
+            | ErrorCode | ErrorMessage                         | error_count |
+            | BLANKYEAR | year not provided                    | 1           |
+            | DODGYYEAR | year value (NOT_A_NUMBER) is invalid | 1           |
         And the movies entity is stored as a parquet after the data_contract phase
         And the latest audit record for the submission is marked with processing status business_rules
-        When I run the business rules phase
-        Then The rules restrict "planets" to 1 qualifying record
-        And At least one row from "planets" has generated error code "HIGH_DENSITY"
-        And At least one row from "planets" has generated error code "WEAK_ESCAPE"
-        And the planets entity is stored as a parquet after the business_rules phase
-        And the latest audit record for the submission is marked with processing status error_report
-        When I run the error report phase
-        Then An error report is produced
-        And The entity "planets" does not contain an entry for "Jupiter" in column "planet"
-        And The entity "planets" contains an entry for "Neptune" in column "planet"
-        And The statistics entry for the submission shows the following information
-            | parameter                | value |
-            | record_count             | 9     |
-            | number_record_rejections | 18    |
-            | number_warnings          | 0     |
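
The tightened expectations imply two bad year values in the test input. movies.json itself is not part of this diff, so the records below are only a plausible guess at what triggers the BLANKYEAR and DODGYYEAR rejections (titles and field names are invented):

# Hypothetical movies.json entries; shape and field names are assumptions.
bad_movie_records = [
    {"title": "Some Film", "year": None},             # BLANKYEAR: year not provided
    {"title": "Other Film", "year": "NOT_A_NUMBER"},  # DODGYYEAR: year value is invalid
]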

tests/features/steps/steps_pipeline.py

Lines changed: 27 additions & 2 deletions

@@ -7,10 +7,11 @@
 """
 # pylint: disable=no-name-in-module
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial
+from functools import partial, reduce
 from itertools import chain
+import operator
 from pathlib import Path
-from typing import Callable, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional, Tuple
 from uuid import uuid4
 from behave import given, then, when  # type: ignore
 from behave.model import Row, Table

@@ -163,6 +164,30 @@ def get_record_rejects_from_service(context: Context, service: str, expected_num_errors
     message_df = load_errors_from_service(processing_path, service)
     num_rejections = message_df.filter(pl.col("FailureType").eq("record")).shape[0]
     assert num_rejections == expected_num_errors, f"Got {num_rejections} actual rejections"
+
+
+@then("there are errors with the following details and associated error_count from the {service} phase")
+def check_error_record_details_from_service(context: Context, service: str):
+    processing_path = ctxt.get_processing_location(context)
+    table: Optional[Table] = context.table
+    if table is None:
+        raise ValueError("No table supplied in step")
+    error_details: List[Tuple[pl.Expr, int]] = []
+    row: Row
+    for row in table:
+        record = row.as_dict()
+        error_count = int(record.pop("error_count"))
+        filter_expr = reduce(operator.and_,
+                             [pl.col(k).eq(v) for k, v in record.items()])
+        error_details.append((filter_expr, error_count))
+
+    message_df = load_errors_from_service(processing_path, service)
+    for err_details in error_details:
+        filter_expr, error_count = err_details
+        assert message_df.filter(filter_expr).shape[0] == error_count
+
+
+

 @given("A {implementation} pipeline is configured")
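
The heart of the new step is the reduce over operator.and_, which folds one polars equality expression per behave-table column into a single conjunctive filter. A standalone sketch of that pattern against a fabricated error DataFrame (column names mirror the feature table; load_errors_from_service is not reproduced here):

import operator
from functools import reduce

import polars as pl

# Fabricated stand-in for the DataFrame that load_errors_from_service returns.
message_df = pl.DataFrame(
    {
        "ErrorCode": ["BLANKYEAR", "DODGYYEAR", "DODGYYEAR"],
        "ErrorMessage": [
            "year not provided",
            "year value (NOT_A_NUMBER) is invalid",
            "year value (???) is invalid",
        ],
    }
)

# One .eq() expression per column, AND-ed together, exactly as the step
# definition builds its filter from each table row.
record = {"ErrorCode": "DODGYYEAR", "ErrorMessage": "year value (NOT_A_NUMBER) is invalid"}
filter_expr = reduce(operator.and_, [pl.col(k).eq(v) for k, v in record.items()])

assert message_df.filter(filter_expr).shape[0] == 1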

tests/testdata/movies/movies.dischema.json

Lines changed: 2 additions & 1 deletion

@@ -3,7 +3,7 @@
     "schemas": {
         "ratings": {
             "fields": {
-                "IMDB": "NonNegativeFloat",
+                "IMDb": "NonNegativeFloat",
                 "RottenTomatoes": "str"

             }

@@ -15,6 +15,7 @@
             }
         }
     },
+    "error_details": "movies_contract_error_details.json",
     "datasets": {
         "movies": {
             "fields": {
