Skip to content

Commit 5b5a2eb

Browse files
authored
Merge pull request #24 from NHSDigital/develop_v05
Develop v05
2 parents e745050 + 079891b commit 5b5a2eb

File tree

24 files changed

+221
-94
lines changed

24 files changed

+221
-94
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
## v0.5.0 (2026-01-16)
2+
3+
### Feat
4+
5+
- added entity name override option in data contract error details to align with business rules
6+
7+
### Fix
8+
9+
- Amend relation to python dictionaries approach as using polars (… (#25)
10+
- fix issue where reporting_entity resulted in key fields being removed from error reports (#23)
11+
12+
### Refactor
13+
14+
- added reporting_period_start and reporting_period_end attributes to submission_info model (#28)
15+
- rename "Grouping" to "Group"
16+
- rename the column headers for elements of the error report
17+
118
## v0.4.0 (2025-12-17)
219

320
### Feat

docs/json_schemas/contract/components/field_error_detail.schema.json

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,11 @@
1111
},
1212
"error_message": {
1313
"description": "The message to be used for the field and error type specified. This can include templating (specified using jinja2 conventions). During templating, the full record will be available with an additional __error_value to easily obtain nested offending values.",
14-
"type": "string",
15-
"enum": [
16-
"record_rejection",
17-
"file_rejection",
18-
"warning"
19-
]
14+
"type": "string"
15+
},
16+
"reporting_entity": {
17+
"description": "The entity name to use for grouping in the error report. If left blank, this defaults to the contract entity name.",
18+
"type": "string"
2019
}
2120
},
2221
"required": [
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"$schema": "https://json-schema.org/draft-07/schema",
3+
"$id": "data-ingest:contract/components/field_error_type.schema.json",
4+
"title": "field_error_type",
5+
"description": "The error type for a field when a validation error is raised during the data contract phase",
6+
"type": "object",
7+
"properties": {
8+
"error_type": {
9+
"description": "The type of error the details are for",
10+
"type": "string",
11+
"enum": [
12+
"Blank",
13+
"Bad value",
14+
"Wrong format"
15+
],
16+
"additionalProperties": {
17+
"$ref": "field_error_detail.schema.json"
18+
}
19+
}
20+
}
21+
}

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "nhs_dve"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
description = "`nhs data validation engine` is a framework used to validate data"
55
authors = ["NHS England <england.contactus@nhs.net>"]
66
readme = "README.md"
@@ -39,7 +39,7 @@ requests = "2.32.4" # Mitigates security vuln in < 2.31.0
3939
schedula = "1.2.19"
4040
sqlalchemy = "2.0.19"
4141
typing_extensions = "4.6.2"
42-
urllib3 = "2.6.0" # Mitigates security vuln in < 2.5.0
42+
urllib3 = "2.6.3" # Mitigates security vuln in < 2.6.0
4343
xmltodict = "0.13.0"
4444

4545
[tool.poetry.group.dev]

src/dve/core_engine/backends/implementations/duckdb/duckdb_helpers.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,3 +273,16 @@ def get_all_registered_udfs(connection: DuckDBPyConnection) -> set[str]:
273273
"""
274274
connection.sql("CREATE TEMP TABLE IF NOT EXISTS dve_udfs (function_name VARCHAR)")
275275
return {rw[0] for rw in connection.sql("SELECT * FROM dve_udfs").fetchall()}
276+
277+
278+
def duckdb_rel_to_dictionaries(
279+
entity: DuckDBPyRelation, batch_size=1000
280+
) -> Iterator[dict[str, Any]]:
281+
"""Iterator converting a DuckDBPyRelation to dictionaries, one per row.
282+
Avoids dates being converted to datetimes, which happens when polars is used as an intermediate."""
283+
# TODO - look into float conversion - floats that can't be stored exactly in binary
284+
# TODO - are given to nearest approximation. Tried Decimal, causes issues in arrays
285+
# TODO - with templating (as in complex fields, repr used when str called in jinja templating).
286+
cols: tuple[str] = tuple(entity.columns) # type: ignore
287+
while rows := entity.fetchmany(batch_size):
288+
yield from (dict(zip(cols, rw)) for rw in rows)

src/dve/core_engine/backends/implementations/duckdb/rules.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from dve.core_engine.backends.implementations.duckdb.duckdb_helpers import (
2424
DDBStruct,
2525
duckdb_read_parquet,
26+
duckdb_rel_to_dictionaries,
2627
duckdb_write_parquet,
2728
get_all_registered_udfs,
2829
get_duckdb_type_from_annotation,
@@ -511,12 +512,13 @@ def notify(self, entities: DuckDBEntities, *, config: Notification) -> Messages:
511512
if config.excluded_columns:
512513
matched = matched.select(StarExpression(exclude=config.excluded_columns))
513514

514-
for record in matched.df().to_dict(orient="records"):
515+
for record in duckdb_rel_to_dictionaries(matched):
515516
# NOTE: only templates using values directly accessible in record - nothing nested
516517
# more complex extraction done in reporting module
517518
messages.append(
518519
FeedbackMessage(
519520
entity=config.reporting.reporting_entity_override or config.entity_name,
521+
original_entity=config.entity_name,
520522
record=record, # type: ignore
521523
error_location=config.reporting.legacy_location,
522524
error_message=template_object(config.reporting.message, record), # type: ignore

src/dve/core_engine/backends/implementations/spark/rules.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@ def notify(self, entities: SparkEntities, *, config: Notification) -> Messages:
412412
# more complex extraction done in reporting module
413413
FeedbackMessage(
414414
entity=config.reporting.reporting_entity_override or config.entity_name,
415+
original_entity=config.entity_name,
415416
record=record.asDict(recursive=True),
416417
error_location=config.reporting.legacy_location,
417418
error_message=template_object(

src/dve/core_engine/message.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class DataContractErrorDetail(BaseModel):
3030

3131
error_code: str
3232
error_message: Optional[str] = None
33+
reporting_entity: Optional[str] = None
3334

3435
def template_message(
3536
self,
@@ -105,6 +106,8 @@ class FeedbackMessage: # pylint: disable=too-many-instance-attributes
105106
still be completed (i.e. filters and joins can still be applied).
106107
107108
"""
109+
original_entity: Optional[EntityName] = None
110+
"""The original entity before any modifications to the name (if applicable)."""
108111
is_informational: bool = False
109112
"""Whether the message is simply for information or has affected the outputs."""
110113
error_type: Optional[str] = None
@@ -230,7 +233,8 @@ def from_pydantic_error(
230233

231234
messages.append(
232235
cls(
233-
entity=entity,
236+
entity=error_detail.reporting_entity or entity,
237+
original_entity=entity,
234238
record=record,
235239
failure_type=failure_type,
236240
is_informational=is_informational,

src/dve/core_engine/models.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,18 @@ class SubmissionInfo(AuditRecord):
5151
"""The name of the submitted file."""
5252
file_extension: str
5353
"""The extension of the file received."""
54-
submission_method: str = None # type: ignore
54+
submission_method: Optional[str] = None # type: ignore
5555
"""The method by which the file was submitted."""
56-
submitting_org: str = None # type: ignore
56+
submitting_org: Optional[str] = None # type: ignore
5757
"""The organisation who submitted the file."""
58-
reporting_period: str = None # type: ignore
59-
"""The reporting period the submission relates to."""
60-
file_size: int = None # type: ignore
58+
reporting_period_start: Optional[str] = None # type: ignore
59+
"""The start of the reporting period the submission relates to."""
60+
reporting_period_end: Optional[str] = None # type: ignore
61+
"""The end of the reporting period the submission relates to."""
62+
file_size: Optional[int] = None # type: ignore
6163
"""The size (in bytes) of the file received."""
62-
datetime_received: dt.datetime = None # type: ignore
63-
"""The datetime the SEFT transfer finished."""
64+
datetime_received: Optional[dt.datetime] = None # type: ignore
65+
"""The datetime the file was received."""
6466

6567
@validator("file_name")
6668
def _ensure_metadata_extension_removed(cls, filename): # pylint: disable=no-self-argument

src/dve/reporting/excel_report.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,13 @@ def _text_length(value):
443443

444444
@staticmethod
445445
def _format_headings(headings: list[str]) -> list[str]:
446+
# TODO - ideally this would be config driven to allow customisation.
447+
_renames = {
448+
"Table": "Group",
449+
"Data Item": "Data Item Submission Name",
450+
"Error": "Errors and Warnings",
451+
}
446452
headings = [heading.title() if heading[0].islower() else heading for heading in headings]
447453
headings = [heading.replace("_", " ") for heading in headings]
454+
headings = [_renames.get(heading, heading) for heading in headings]
448455
return headings

0 commit comments

Comments
 (0)