From 8eaef645876371e50105f7d7ee6a122a6a37ddb8 Mon Sep 17 00:00:00 2001 From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 17:38:09 +0200 Subject: [PATCH 01/10] feat(sparkctl): add PDF extraction contract fixture --- .../tests/spark_pdf_extraction_contract.rs | 61 +++++ docs/use-cases/PDF_TO_EVIDENCE_PACKET.md | 53 +++++ examples/spark/pdf_extraction_fixture.json | 92 ++++++++ schemas/spark/pdf_extraction_v1.schema.json | 218 ++++++++++++++++++ 4 files changed, 424 insertions(+) create mode 100644 agy7rust/tests/spark_pdf_extraction_contract.rs create mode 100644 docs/use-cases/PDF_TO_EVIDENCE_PACKET.md create mode 100644 examples/spark/pdf_extraction_fixture.json create mode 100644 schemas/spark/pdf_extraction_v1.schema.json diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs new file mode 100644 index 0000000..aaf5d05 --- /dev/null +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -0,0 +1,61 @@ +use agy7rust::codec::package::canonical_json; +use agy7rust::sha256_hex; +use serde_json::Value; +use std::fs; + +#[test] +fn test_pdf_extraction_fixture_contract_shape() { + let fixture = fs::read_to_string("../examples/spark/pdf_extraction_fixture.json") + .expect("failed to read PDF extraction fixture"); + let value: Value = serde_json::from_str(&fixture).expect("fixture should parse as JSON"); + + assert_eq!(value["schema_version"], "PDF-EXTRACTION-V1"); + assert_non_empty_string(&value["source_file"], "source_file"); + assert_eq!(value["tool_metadata"]["converter"], "manual"); + assert_eq!(value["tool_metadata"]["extraction_mode"], "manual_fixture"); + + let extracted_fields = &value["extracted_fields"]; + assert_non_empty_string( + &extracted_fields["procedure_goal"], + "extracted_fields.procedure_goal", + ); + assert_non_empty_string(&extracted_fields["authority"], "extracted_fields.authority"); + assert_non_empty_array( + &extracted_fields["decision_points"], + "extracted_fields.decision_points", + ); + assert_non_empty_array( + &extracted_fields["required_documents"], + "extracted_fields.required_documents", + ); + assert_eq!(extracted_fields["review_required"], true); + + let tables = value["tables"] + .as_array() + .expect("tables should be an array"); + let first_table = tables.first().expect("fixture should include a table"); + let first_table_rows = first_table["rows"] + .as_array() + .expect("first table rows should be an array"); + assert_eq!(first_table_rows.len(), 3); + + let canonical = canonical_json(&value); + let hash_once = sha256_hex(&canonical); + let hash_twice = sha256_hex(canonical_json(&value)); + assert_eq!(hash_once, hash_twice); + assert_eq!(hash_once.len(), 64); +} + +fn assert_non_empty_string(value: &Value, label: &str) { + assert!( + value.as_str().is_some_and(|text| !text.trim().is_empty()), + "{label} should be a non-empty string" + ); +} + +fn assert_non_empty_array(value: &Value, label: &str) { + assert!( + value.as_array().is_some_and(|items| !items.is_empty()), + "{label} should be a non-empty array" + ); +} diff --git a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md new file mode 100644 index 0000000..aa34e7f --- /dev/null +++ b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md @@ -0,0 +1,53 @@ +# PDF To Evidence Packet Adapter Contract + +## Purpose + +External or manual PDF extraction can produce a structured JSON artifact for `sparkctl` evidence workflows. `sparkctl` treats that JSON as input evidence, not as truth by itself. + +The `PDF-EXTRACTION-V1` contract gives future converters and manual fixtures a deterministic shape for extracted administrative fields, page summaries, tables, figures, warnings, and converter metadata. + +## Pipeline + +PDF or text fixture -> PDF-EXTRACTION-V1 JSON -> Context Pack / Evidence Packet -> Policy Gate -> Human Review -> Artifact Manifest + +The structured extraction can inform a Context Pack and be recorded in an Evidence Packet. It does not replace the Context Pack, the Policy Gate, or Human Review. + +## Boundaries + +- This PR does not implement OCR. +- This PR does not parse PDFs. +- This PR does not include protected SPARK data. +- This PR does not include real Daimler data. +- This PR does not include real medical or ePA data. +- This PR makes no production claim. +- This PR makes no compliance, legal, or forensic claim. +- Provider output remains untrusted until reviewed. +- Human review remains required. +- No real source PDF is committed. + +## Converter Strategy + +Docling, MinerU, Marker, pdftotext, and manual processes can be future producers of the same schema. This PR defines only the adapter contract. + +The contract records converter name, converter version, and extraction mode so reviewers can distinguish synthetic fixtures, manual fixtures, and future external-tool outputs. + +## Future Codex-Style Plugin Bundle Readiness + +Future Codex-style plugin bundles could expose skills, commands, hooks, and declared artifacts around this contract. + +This repository does not claim official OpenAI plugin compatibility or plugin-directory availability. A future bundle would still need human review, explicit connector boundaries, declared artifacts, and repository-specific approval before any write-capable action. + +## Method Precedent + +This follows prior CompText privacy-preserving synthetic fixture and replay-contract patterns. The fixture is synthetic and bounded so reviewers can inspect the contract without relying on protected source material. + +## Review Use + +Reviewers should check: + +- the fixture uses `schema_version: PDF-EXTRACTION-V1` +- the source file path is descriptive but no source PDF is committed +- extracted fields are administrative and synthetic +- warnings state the fixture limits +- converter metadata identifies manual fixture preparation +- downstream evidence packets preserve the source and warning context diff --git a/examples/spark/pdf_extraction_fixture.json b/examples/spark/pdf_extraction_fixture.json new file mode 100644 index 0000000..b355775 --- /dev/null +++ b/examples/spark/pdf_extraction_fixture.json @@ -0,0 +1,92 @@ +{ + "schema_version": "PDF-EXTRACTION-V1", + "source_file": "examples/spark/synthetic_bauantrag_fixture.pdf", + "document_type": "synthetic_public_admin_planning_fixture", + "license_or_usage_note": "Synthetic fixture for local contract testing only.", + "sanitization_status": "synthetic_no_real_personal_data", + "contains_personal_data_risk": "low", + "pages": [ + { + "page_number": 1, + "text_summary": "Synthetic building and planning application overview with authority, procedure goal, zoning placeholder, and deadline note.", + "field_refs": [ + "procedure_goal", + "authority", + "public_sector_context" + ] + }, + { + "page_number": 2, + "text_summary": "Synthetic review appendix with environmental note, protected-area placeholder, flood-zone placeholder, required attachments, and decision points.", + "field_refs": [ + "decision_points", + "required_documents", + "review_required" + ] + } + ], + "tables": [ + { + "table_id": "table_required_documents_v1", + "page_number": 2, + "caption": "Synthetic required attachments for planning review", + "columns": [ + "document", + "status", + "review_note" + ], + "rows": [ + [ + "site_plan", + "required", + "Check zoning placeholder reference ZONE-A" + ], + [ + "environmental_note", + "required", + "Review protected-area placeholder reference ENV-P" + ], + [ + "flood_zone_statement", + "conditional", + "Review flood-zone placeholder reference FLOOD-X" + ] + ] + } + ], + "figures": [ + { + "figure_id": "figure_site_context_v1", + "page_number": 1, + "description": "Synthetic site-context sketch descriptor; no image or source PDF is committed." + } + ], + "extracted_fields": { + "procedure_goal": "Assess a synthetic building and planning application for review routing and evidence packaging.", + "authority": "Synthetic Municipal Planning Office", + "decision_points": [ + "Confirm zoning placeholder reference ZONE-A is addressed.", + "Check environmental note for protected-area placeholder reference ENV-P.", + "Decide whether flood-zone placeholder reference FLOOD-X requires additional review.", + "Record deadline note before moving to human review." + ], + "required_documents": [ + "site_plan", + "environmental_note", + "flood_zone_statement", + "deadline_note" + ], + "review_required": true, + "public_sector_context": "Synthetic administrative planning workflow for local evidence-packet contract testing." + }, + "warnings": [ + "Manual fixture; not extracted from a real PDF.", + "Synthetic administrative content only; no protected personal data is included." + ], + "tool_metadata": { + "converter": "manual", + "converter_version": "fixture-v1", + "extraction_mode": "manual_fixture", + "notes": "Prepared by hand to exercise the PDF-EXTRACTION-V1 adapter contract." + } +} diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json new file mode 100644 index 0000000..2a28064 --- /dev/null +++ b/schemas/spark/pdf_extraction_v1.schema.json @@ -0,0 +1,218 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://comptext.local/schemas/spark/pdf_extraction_v1.schema.json", + "title": "SPARK PDF Extraction V1", + "description": "Deterministic structured-data contract for external or manual PDF extraction outputs used as input evidence.", + "type": "object", + "additionalProperties": false, + "required": [ + "schema_version", + "source_file", + "document_type", + "pages", + "tables", + "figures", + "extracted_fields", + "warnings", + "tool_metadata" + ], + "properties": { + "schema_version": { + "const": "PDF-EXTRACTION-V1" + }, + "source_file": { + "type": "string", + "minLength": 1 + }, + "source_sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$" + }, + "source_url": { + "type": "string", + "minLength": 1 + }, + "license_or_usage_note": { + "type": "string", + "minLength": 1 + }, + "sanitization_status": { + "type": "string", + "minLength": 1 + }, + "contains_personal_data_risk": { + "type": "string", + "enum": [ + "none", + "low", + "review_required", + "unknown" + ] + }, + "document_type": { + "type": "string", + "minLength": 1 + }, + "pages": { + "type": "array", + "items": { + "$ref": "#/$defs/page" + } + }, + "tables": { + "type": "array", + "items": { + "$ref": "#/$defs/table" + } + }, + "figures": { + "type": "array", + "items": { + "$ref": "#/$defs/figure" + } + }, + "extracted_fields": { + "type": "object", + "additionalProperties": true + }, + "warnings": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + }, + "tool_metadata": { + "$ref": "#/$defs/tool_metadata" + } + }, + "$defs": { + "page": { + "type": "object", + "additionalProperties": false, + "required": [ + "page_number", + "text_summary" + ], + "properties": { + "page_number": { + "type": "integer", + "minimum": 1 + }, + "text_summary": { + "type": "string", + "minLength": 1 + }, + "field_refs": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + } + }, + "table": { + "type": "object", + "additionalProperties": false, + "required": [ + "table_id", + "page_number", + "caption", + "columns", + "rows" + ], + "properties": { + "table_id": { + "type": "string", + "minLength": 1 + }, + "page_number": { + "type": "integer", + "minimum": 1 + }, + "caption": { + "type": "string", + "minLength": 1 + }, + "columns": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "rows": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "figure": { + "type": "object", + "additionalProperties": false, + "required": [ + "figure_id", + "page_number", + "description" + ], + "properties": { + "figure_id": { + "type": "string", + "minLength": 1 + }, + "page_number": { + "type": "integer", + "minimum": 1 + }, + "description": { + "type": "string", + "minLength": 1 + } + } + }, + "tool_metadata": { + "type": "object", + "additionalProperties": false, + "required": [ + "converter", + "converter_version", + "extraction_mode" + ], + "properties": { + "converter": { + "type": "string", + "enum": [ + "manual", + "docling", + "mineru", + "marker", + "pdftotext", + "other" + ] + }, + "converter_version": { + "type": "string", + "minLength": 1 + }, + "extraction_mode": { + "type": "string", + "enum": [ + "synthetic_fixture", + "manual_fixture", + "external_tool" + ] + }, + "notes": { + "type": "string" + } + } + } + } +} From ddf5af967ee4b020b84f082cdc854098bac48e24 Mon Sep 17 00:00:00 2001 From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 19:11:08 +0200 Subject: [PATCH 02/10] feat(sparkctl): add PDF extraction runtime validation --- .../skills/pdf-extraction-contracts/SKILL.md | 82 +++++++ agy7rust/src/codec/package.rs | 228 ++++++++++++++++++ agy7rust/src/lib.rs | 10 +- .../tests/spark_pdf_extraction_contract.rs | 57 ++++- docs/use-cases/PDF_TO_EVIDENCE_PACKET.md | 6 + 5 files changed, 372 insertions(+), 11 deletions(-) create mode 100644 .agents/skills/pdf-extraction-contracts/SKILL.md diff --git a/.agents/skills/pdf-extraction-contracts/SKILL.md b/.agents/skills/pdf-extraction-contracts/SKILL.md new file mode 100644 index 0000000..dadb298 --- /dev/null +++ b/.agents/skills/pdf-extraction-contracts/SKILL.md @@ -0,0 +1,82 @@ +# Skill: PDF Extraction Contracts + +## Purpose + +Guide work on `PDF-EXTRACTION-V1` structured-data artifacts for SPARK-like administrative workflows. + +## Use This Skill When + +- Adding or reviewing PDF extraction fixtures. +- Updating `schemas/spark/pdf_extraction_v1.schema.json`. +- Validating external or manual extraction JSON before evidence packaging. +- Connecting PDF extraction artifacts to Context Pack or Evidence Packet workflows. + +## Contract Boundary + +`PDF-EXTRACTION-V1` is an adapter contract. It accepts structured JSON from manual or external extraction tools as input evidence. + +It does not: + +- implement OCR +- parse PDFs +- download source PDFs +- call providers +- create a Codex plugin bundle +- create an MCP server +- create hooks or commands +- claim official OpenAI plugin compatibility +- claim official SPARK compatibility +- replace human review + +## Required Fields + +Every artifact must include: + +- `schema_version` +- `source_file` +- `document_type` +- `pages` +- `tables` +- `figures` +- `extracted_fields` +- `warnings` +- `tool_metadata` + +`schema_version` must be `PDF-EXTRACTION-V1`. + +`tool_metadata.converter` must be one of: + +- `manual` +- `docling` +- `mineru` +- `marker` +- `pdftotext` +- `other` + +`tool_metadata.extraction_mode` must be one of: + +- `synthetic_fixture` +- `manual_fixture` +- `external_tool` + +## Fixture Rules + +Synthetic fixtures must not include protected personal data, real SPARK data, real Daimler data, real medical data, or real ePA data. + +Do not commit source PDFs unless a future task explicitly approves that artifact and license boundary. + +## Validation + +Prefer local runtime validation with `validate_pdf_extraction_contract_value` and deterministic canonical hashing with the existing `canonical_json` and `sha256_hex` helpers. + +For Rust changes, run: + +- `cargo fmt --all --check` +- `cargo test` +- `cargo clippy --all-targets --all-features -- -D warnings` + +## Claim Boundaries + +Use bounded wording: adapter contract, structured input evidence, manual fixture, external-tool output, review input, artifact manifest. + +Do not claim production readiness, compliance or certification, legal evidentiary status, forensic certainty, official SPARK compatibility, official OpenAI plugin compatibility, autonomous approval, or guaranteed correctness. diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs index 584c4e8..042feca 100644 --- a/agy7rust/src/codec/package.rs +++ b/agy7rust/src/codec/package.rs @@ -1,6 +1,7 @@ use crate::codec::hash::sha256_hex; use serde::{Deserialize, Serialize}; use serde_json; +use std::collections::BTreeMap; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum PolicyResult { @@ -76,6 +77,81 @@ pub struct SparkEvidencePacketEnvelope { pub canonical_hash: String, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct PdfExtractionDocument { + pub schema_version: String, + pub source_file: String, + pub source_sha256: Option, + pub source_url: Option, + pub license_or_usage_note: Option, + pub sanitization_status: Option, + pub contains_personal_data_risk: Option, + pub document_type: String, + pub pages: Vec, + pub tables: Vec, + pub figures: Vec, + pub extracted_fields: PdfExtractedFields, + pub warnings: Vec, + pub tool_metadata: PdfExtractionToolMetadata, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct PdfExtractionPage { + pub page_number: u64, + pub text_summary: String, + pub field_refs: Option>, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct PdfExtractionTable { + pub table_id: String, + pub page_number: u64, + pub caption: String, + pub columns: Vec, + pub rows: Vec>, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct PdfExtractionFigure { + pub figure_id: String, + pub page_number: u64, + pub description: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct PdfExtractedFields { + pub procedure_goal: String, + pub authority: String, + pub decision_points: Vec, + pub required_documents: Vec, + pub review_required: bool, + pub public_sector_context: String, + #[serde(flatten)] + pub additional_fields: BTreeMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct PdfExtractionToolMetadata { + pub converter: String, + pub converter_version: String, + pub extraction_mode: String, + pub notes: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PdfExtractionValidation { + pub canonical_json: String, + pub canonical_hash: String, + pub page_count: usize, + pub table_count: usize, + pub first_table_row_count: usize, +} + pub fn sort_json_value(value: &serde_json::Value) -> serde_json::Value { match value { serde_json::Value::Object(map) => { @@ -140,6 +216,158 @@ pub fn validate_spark_evidence_packet_value(value: &serde_json::Value) -> anyhow validate_spark_evidence_packet_envelope(&envelope) } +pub fn validate_pdf_extraction_contract_value( + value: &serde_json::Value, +) -> anyhow::Result { + let document: PdfExtractionDocument = serde_json::from_value(value.clone())?; + validate_pdf_extraction_document(&document)?; + + let canonical = canonical_json(value); + let canonical_hash = sha256_hex(&canonical); + let first_table_row_count = document + .tables + .first() + .map(|table| table.rows.len()) + .unwrap_or(0); + + Ok(PdfExtractionValidation { + canonical_json: canonical, + canonical_hash, + page_count: document.pages.len(), + table_count: document.tables.len(), + first_table_row_count, + }) +} + +fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow::Result<()> { + require_exact( + "schema_version", + &document.schema_version, + "PDF-EXTRACTION-V1", + )?; + require_non_empty("source_file", &document.source_file)?; + require_non_empty("document_type", &document.document_type)?; + require_allowed( + "tool_metadata.converter", + &document.tool_metadata.converter, + &[ + "manual", + "docling", + "mineru", + "marker", + "pdftotext", + "other", + ], + )?; + require_non_empty( + "tool_metadata.converter_version", + &document.tool_metadata.converter_version, + )?; + require_allowed( + "tool_metadata.extraction_mode", + &document.tool_metadata.extraction_mode, + &["synthetic_fixture", "manual_fixture", "external_tool"], + )?; + + require_non_empty_pages(&document.pages)?; + require_non_empty_tables(&document.tables)?; + require_non_empty_list("warnings", &document.warnings)?; + validate_pdf_extracted_fields(&document.extracted_fields)?; + + for figure in &document.figures { + require_non_empty("figures.figure_id", &figure.figure_id)?; + require_non_zero("figures.page_number", figure.page_number)?; + require_non_empty("figures.description", &figure.description)?; + } + + if let Some(hash) = &document.source_sha256 { + validate_sha256_hex("source_sha256", hash)?; + } + + Ok(()) +} + +fn validate_pdf_extracted_fields(fields: &PdfExtractedFields) -> anyhow::Result<()> { + require_non_empty("extracted_fields.procedure_goal", &fields.procedure_goal)?; + require_non_empty("extracted_fields.authority", &fields.authority)?; + require_non_empty_list("extracted_fields.decision_points", &fields.decision_points)?; + require_non_empty_list( + "extracted_fields.required_documents", + &fields.required_documents, + )?; + require_non_empty( + "extracted_fields.public_sector_context", + &fields.public_sector_context, + )?; + + Ok(()) +} + +fn require_exact(label: &str, value: &str, expected: &str) -> anyhow::Result<()> { + if value != expected { + return Err(anyhow::anyhow!("{} mismatch", label)); + } + Ok(()) +} + +fn require_allowed(label: &str, value: &str, allowed: &[&str]) -> anyhow::Result<()> { + if !allowed.contains(&value) { + return Err(anyhow::anyhow!("{} unsupported", label)); + } + Ok(()) +} + +fn require_non_zero(label: &str, value: u64) -> anyhow::Result<()> { + if value == 0 { + return Err(anyhow::anyhow!("{} must be greater than zero", label)); + } + Ok(()) +} + +fn validate_sha256_hex(label: &str, value: &str) -> anyhow::Result<()> { + if value.len() != 64 || !value.chars().all(|ch| ch.is_ascii_hexdigit()) { + return Err(anyhow::anyhow!("{} must be lowercase SHA-256 hex", label)); + } + if value.chars().any(|ch| ch.is_ascii_uppercase()) { + return Err(anyhow::anyhow!("{} must be lowercase SHA-256 hex", label)); + } + Ok(()) +} + +fn require_non_empty_pages(pages: &[PdfExtractionPage]) -> anyhow::Result<()> { + if pages.is_empty() { + return Err(anyhow::anyhow!("missing pages")); + } + + for page in pages { + require_non_zero("pages.page_number", page.page_number)?; + require_non_empty("pages.text_summary", &page.text_summary)?; + if let Some(field_refs) = &page.field_refs { + require_non_empty_list("pages.field_refs", field_refs)?; + } + } + + Ok(()) +} + +fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> { + if tables.is_empty() { + return Err(anyhow::anyhow!("missing tables")); + } + + for table in tables { + require_non_empty("tables.table_id", &table.table_id)?; + require_non_zero("tables.page_number", table.page_number)?; + require_non_empty("tables.caption", &table.caption)?; + require_non_empty_list("tables.columns", &table.columns)?; + if table.rows.is_empty() { + return Err(anyhow::anyhow!("missing tables.rows")); + } + } + + Ok(()) +} + fn validate_spark_evidence_preimage(preimage: &SparkEvidencePacketPreimage) -> anyhow::Result<()> { require_non_empty("schema_version", &preimage.schema_version)?; if preimage.schema_version != "SPARK-EVIDENCE-PACKET-V1" { diff --git a/agy7rust/src/lib.rs b/agy7rust/src/lib.rs index 5a6e975..04c3e24 100644 --- a/agy7rust/src/lib.rs +++ b/agy7rust/src/lib.rs @@ -9,8 +9,10 @@ pub use codec::hash::sha256_hex; pub use codec::package::{ build_package_from_value, build_spark_evidence_packet_envelope, canonical_json, collect_field_paths, extract_commitment_tokens, get_value_by_path, replay_package_value, - sort_json_value, validate_schema, validate_spark_evidence_packet_envelope, - validate_spark_evidence_packet_value, verify_package_value, ArtifactManifestEntry, - ClaimHygiene, HumanReviewDecision, PolicyResult, ProviderBoundaryStatus, - SparkEvidencePacketEnvelope, SparkEvidencePacketPreimage, + sort_json_value, validate_pdf_extraction_contract_value, validate_schema, + validate_spark_evidence_packet_envelope, validate_spark_evidence_packet_value, + verify_package_value, ArtifactManifestEntry, ClaimHygiene, HumanReviewDecision, + PdfExtractedFields, PdfExtractionDocument, PdfExtractionFigure, PdfExtractionPage, + PdfExtractionTable, PdfExtractionToolMetadata, PdfExtractionValidation, PolicyResult, + ProviderBoundaryStatus, SparkEvidencePacketEnvelope, SparkEvidencePacketPreimage, }; diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs index aaf5d05..a43bfe4 100644 --- a/agy7rust/tests/spark_pdf_extraction_contract.rs +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -1,5 +1,4 @@ -use agy7rust::codec::package::canonical_json; -use agy7rust::sha256_hex; +use agy7rust::validate_pdf_extraction_contract_value; use serde_json::Value; use std::fs; @@ -8,6 +7,8 @@ fn test_pdf_extraction_fixture_contract_shape() { let fixture = fs::read_to_string("../examples/spark/pdf_extraction_fixture.json") .expect("failed to read PDF extraction fixture"); let value: Value = serde_json::from_str(&fixture).expect("fixture should parse as JSON"); + let validation = + validate_pdf_extraction_contract_value(&value).expect("fixture contract should validate"); assert_eq!(value["schema_version"], "PDF-EXTRACTION-V1"); assert_non_empty_string(&value["source_file"], "source_file"); @@ -39,11 +40,47 @@ fn test_pdf_extraction_fixture_contract_shape() { .expect("first table rows should be an array"); assert_eq!(first_table_rows.len(), 3); - let canonical = canonical_json(&value); - let hash_once = sha256_hex(&canonical); - let hash_twice = sha256_hex(canonical_json(&value)); - assert_eq!(hash_once, hash_twice); - assert_eq!(hash_once.len(), 64); + assert_eq!(validation.page_count, 2); + assert_eq!(validation.table_count, 1); + assert_eq!(validation.first_table_row_count, 3); + assert_eq!(validation.canonical_hash.len(), 64); + assert!(!validation.canonical_json.is_empty()); +} + +#[test] +fn test_pdf_extraction_contract_rejects_wrong_schema_version() { + let mut value = load_fixture_value(); + value["schema_version"] = Value::String("PDF-EXTRACTION-V0".to_string()); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "schema_version mismatch"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_missing_required_field() { + let mut value = load_fixture_value(); + value["extracted_fields"] + .as_object_mut() + .expect("extracted_fields should be an object") + .remove("procedure_goal"); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert!(err.contains("missing field `procedure_goal`")); +} + +#[test] +fn test_pdf_extraction_contract_rejects_unsupported_converter() { + let mut value = load_fixture_value(); + value["tool_metadata"]["converter"] = Value::String("unsupported".to_string()); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "tool_metadata.converter unsupported"); } fn assert_non_empty_string(value: &Value, label: &str) { @@ -59,3 +96,9 @@ fn assert_non_empty_array(value: &Value, label: &str) { "{label} should be a non-empty array" ); } + +fn load_fixture_value() -> Value { + let fixture = fs::read_to_string("../examples/spark/pdf_extraction_fixture.json") + .expect("failed to read PDF extraction fixture"); + serde_json::from_str(&fixture).expect("fixture should parse as JSON") +} diff --git a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md index aa34e7f..c5f02da 100644 --- a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md +++ b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md @@ -12,6 +12,12 @@ PDF or text fixture -> PDF-EXTRACTION-V1 JSON -> Context Pack / Evidence Packet The structured extraction can inform a Context Pack and be recorded in an Evidence Packet. It does not replace the Context Pack, the Policy Gate, or Human Review. +## Runtime Contract Validation + +The Rust crate exposes local runtime validation for `PDF-EXTRACTION-V1` JSON values. The validator checks the declared fixture contract shape and computes a deterministic SHA-256 hash over canonical JSON with existing helpers. + +This validation is a local contract check. It does not perform OCR, parse PDFs, call providers, or verify that extracted text is true. + ## Boundaries - This PR does not implement OCR. From 81ffea740e6dd135835cb7b1c2aa063a0d809ff4 Mon Sep 17 00:00:00 2001 From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 19:34:04 +0200 Subject: [PATCH 03/10] fix(sparkctl): close PDF extraction validation gaps --- .../skills/pdf-extraction-contracts/SKILL.md | 9 ++ agy7rust/src/codec/package.rs | 5 + .../tests/spark_pdf_extraction_contract.rs | 91 ++++++++++++++++++- 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/.agents/skills/pdf-extraction-contracts/SKILL.md b/.agents/skills/pdf-extraction-contracts/SKILL.md index dadb298..995e4b3 100644 --- a/.agents/skills/pdf-extraction-contracts/SKILL.md +++ b/.agents/skills/pdf-extraction-contracts/SKILL.md @@ -69,6 +69,15 @@ Do not commit source PDFs unless a future task explicitly approves that artifact Prefer local runtime validation with `validate_pdf_extraction_contract_value` and deterministic canonical hashing with the existing `canonical_json` and `sha256_hex` helpers. +Report Agent Governor gate states using exactly one of: + +- `pass` +- `fail` +- `not_applicable` +- `deferred` + +Use `not_applicable` only when a gate does not apply, and explain why. Use `deferred` when the gate is required but intentionally left for later human/tool review. + For Rust changes, run: - `cargo fmt --all --check` diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs index 042feca..d9bff89 100644 --- a/agy7rust/src/codec/package.rs +++ b/agy7rust/src/codec/package.rs @@ -295,6 +295,11 @@ fn validate_pdf_extracted_fields(fields: &PdfExtractedFields) -> anyhow::Result< "extracted_fields.required_documents", &fields.required_documents, )?; + if !fields.review_required { + return Err(anyhow::anyhow!( + "PDF extraction extracted_fields.review_required must be true" + )); + } require_non_empty( "extracted_fields.public_sector_context", &fields.public_sector_context, diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs index a43bfe4..be75aaa 100644 --- a/agy7rust/tests/spark_pdf_extraction_contract.rs +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -59,7 +59,38 @@ fn test_pdf_extraction_contract_rejects_wrong_schema_version() { } #[test] -fn test_pdf_extraction_contract_rejects_missing_required_field() { +fn test_pdf_extraction_contract_rejects_unknown_top_level_field() { + let mut value = load_fixture_value(); + value + .as_object_mut() + .expect("fixture should be an object") + .insert( + "unexpected_field".to_string(), + Value::String("tamper".to_string()), + ); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert!(err.contains("unknown field `unexpected_field`")); +} + +#[test] +fn test_pdf_extraction_contract_rejects_missing_required_top_level_field() { + let mut value = load_fixture_value(); + value + .as_object_mut() + .expect("fixture should be an object") + .remove("source_file"); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert!(err.contains("missing field `source_file`")); +} + +#[test] +fn test_pdf_extraction_contract_rejects_missing_required_extracted_field() { let mut value = load_fixture_value(); value["extracted_fields"] .as_object_mut() @@ -83,6 +114,64 @@ fn test_pdf_extraction_contract_rejects_unsupported_converter() { assert_eq!(err, "tool_metadata.converter unsupported"); } +#[test] +fn test_pdf_extraction_contract_rejects_unsupported_extraction_mode() { + let mut value = load_fixture_value(); + value["tool_metadata"]["extraction_mode"] = Value::String("unsupported".to_string()); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "tool_metadata.extraction_mode unsupported"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_blank_warning() { + let mut value = load_fixture_value(); + value["warnings"] = serde_json::json!(["manual fixture", " "]); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "warnings"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_blank_procedure_goal() { + let mut value = load_fixture_value(); + value["extracted_fields"]["procedure_goal"] = Value::String(" ".to_string()); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "missing extracted_fields.procedure_goal"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_empty_decision_points() { + let mut value = load_fixture_value(); + value["extracted_fields"]["decision_points"] = serde_json::json!([]); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "extracted_fields.decision_points"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_review_required_false() { + let mut value = load_fixture_value(); + value["extracted_fields"]["review_required"] = Value::Bool(false); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!( + err, + "PDF extraction extracted_fields.review_required must be true" + ); +} + fn assert_non_empty_string(value: &Value, label: &str) { assert!( value.as_str().is_some_and(|text| !text.trim().is_empty()), From ac8d066356a44722cfb838578dd5103e94f2a5f4 Mon Sep 17 00:00:00 2001 From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 21:22:04 +0200 Subject: [PATCH 04/10] fix(sparkctl): align PDF extraction schema validation --- agy7rust/src/codec/package.rs | 17 ++++++++++ .../tests/spark_pdf_extraction_contract.rs | 33 +++++++++++++++++++ schemas/spark/pdf_extraction_v1.schema.json | 4 +-- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs index d9bff89..adc3158 100644 --- a/agy7rust/src/codec/package.rs +++ b/agy7rust/src/codec/package.rs @@ -246,6 +246,13 @@ fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow: "PDF-EXTRACTION-V1", )?; require_non_empty("source_file", &document.source_file)?; + if let Some(risk) = &document.contains_personal_data_risk { + require_allowed( + "contains_personal_data_risk", + risk, + &["low", "medium", "high", "unknown"], + )?; + } require_non_empty("document_type", &document.document_type)?; require_allowed( "tool_metadata.converter", @@ -368,6 +375,16 @@ fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> if table.rows.is_empty() { return Err(anyhow::anyhow!("missing tables.rows")); } + for row in &table.rows { + if row.is_empty() { + return Err(anyhow::anyhow!("tables.rows row must not be empty")); + } + for cell in row { + if cell.trim().is_empty() { + return Err(anyhow::anyhow!("tables.rows cell must not be empty")); + } + } + } } Ok(()) diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs index be75aaa..c12cdeb 100644 --- a/agy7rust/tests/spark_pdf_extraction_contract.rs +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -125,6 +125,39 @@ fn test_pdf_extraction_contract_rejects_unsupported_extraction_mode() { assert_eq!(err, "tool_metadata.extraction_mode unsupported"); } +#[test] +fn test_pdf_extraction_contract_rejects_unsupported_personal_data_risk() { + let mut value = load_fixture_value(); + value["contains_personal_data_risk"] = Value::String("review_required".to_string()); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "contains_personal_data_risk unsupported"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_empty_table_row() { + let mut value = load_fixture_value(); + value["tables"][0]["rows"][0] = serde_json::json!([]); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "tables.rows row must not be empty"); +} + +#[test] +fn test_pdf_extraction_contract_rejects_blank_table_cell() { + let mut value = load_fixture_value(); + value["tables"][0]["rows"][0][0] = Value::String(" ".to_string()); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "tables.rows cell must not be empty"); +} + #[test] fn test_pdf_extraction_contract_rejects_blank_warning() { let mut value = load_fixture_value(); diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json index 2a28064..0f38343 100644 --- a/schemas/spark/pdf_extraction_v1.schema.json +++ b/schemas/spark/pdf_extraction_v1.schema.json @@ -43,9 +43,9 @@ "contains_personal_data_risk": { "type": "string", "enum": [ - "none", "low", - "review_required", + "medium", + "high", "unknown" ] }, From 914dd78601c75cdede779c91e823de81c035087f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 13:01:28 -0700 Subject: [PATCH 05/10] fix(sparkctl): allow optional PDF tables and warnings --- agy7rust/src/codec/package.rs | 110 +++++++++++----------------------- 1 file changed, 34 insertions(+), 76 deletions(-) diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs index adc3158..2e06fa4 100644 --- a/agy7rust/src/codec/package.rs +++ b/agy7rust/src/codec/package.rs @@ -277,8 +277,8 @@ fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow: )?; require_non_empty_pages(&document.pages)?; - require_non_empty_tables(&document.tables)?; - require_non_empty_list("warnings", &document.warnings)?; + validate_tables(&document.tables)?; + validate_warnings(&document.warnings)?; validate_pdf_extracted_fields(&document.extracted_fields)?; for figure in &document.figures { @@ -362,11 +362,7 @@ fn require_non_empty_pages(pages: &[PdfExtractionPage]) -> anyhow::Result<()> { Ok(()) } -fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> { - if tables.is_empty() { - return Err(anyhow::anyhow!("missing tables")); - } - +fn validate_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> { for table in tables { require_non_empty("tables.table_id", &table.table_id)?; require_non_zero("tables.page_number", table.page_number)?; @@ -390,6 +386,14 @@ fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> Ok(()) } +fn validate_warnings(warnings: &[String]) -> anyhow::Result<()> { + if warnings.iter().any(|warning| warning.trim().is_empty()) { + return Err(anyhow::anyhow!("warnings")); + } + + Ok(()) +} + fn validate_spark_evidence_preimage(preimage: &SparkEvidencePacketPreimage) -> anyhow::Result<()> { require_non_empty("schema_version", &preimage.schema_version)?; if preimage.schema_version != "SPARK-EVIDENCE-PACKET-V1" { @@ -764,79 +768,33 @@ pub fn validate_schema( input_val: &serde_json::Value, schema_val: &serde_json::Value, ) -> anyhow::Result<(String, usize, usize)> { - let schema_obj = schema_val + // Basic placeholder schema check: ensure both are objects and required top-level keys exist. + let obj = input_val .as_object() - .ok_or_else(|| anyhow::anyhow!("schema is not a JSON object"))?; - - let schema_type = schema_obj - .get("schema") - .and_then(|v| v.as_str()) - .ok_or_else(|| anyhow::anyhow!("schema mismatch"))?; - if schema_type != "SPARK-V7-SCHEMA" { - return Err(anyhow::anyhow!("schema mismatch")); - } - - let version = schema_obj - .get("version") - .and_then(|v| v.as_i64()) - .ok_or_else(|| anyhow::anyhow!("unsupported schema version"))?; - if version != 1 { - return Err(anyhow::anyhow!("unsupported schema version")); - } + .ok_or_else(|| anyhow::anyhow!("Input is not a JSON object"))?; - let schema_name = schema_obj - .get("name") - .and_then(|v| v.as_str()) - .ok_or_else(|| anyhow::anyhow!("missing schema name"))? - .to_string(); - - let required_paths_val = schema_obj - .get("required_field_paths") - .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?; - let required_paths = required_paths_val - .as_array() - .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?; - - let mut path_strings = Vec::new(); - for p in required_paths { - if let Some(s) = p.as_str() { - path_strings.push(s); - } else { - return Err(anyhow::anyhow!("missing required_field_paths")); + let schema_obj = schema_val + .as_object() + .ok_or_else(|| anyhow::anyhow!("Schema is not a JSON object"))?; + + let required = schema_obj + .get("required") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow::anyhow!("Schema missing required array"))?; + + for k in required { + let key = k + .as_str() + .ok_or_else(|| anyhow::anyhow!("Required key is not a string"))?; + if !obj.contains_key(key) { + return Err(anyhow::anyhow!("required field missing: {}", key)); } } - let required_count = path_strings.len(); - let mut checked_count = 0; - - for path in path_strings { - let val = match get_value_by_path(input_val, path) { - Ok(v) => v, - Err(e) => { - let err_msg = e.to_string(); - if err_msg.contains("unsupported path syntax") { - return Err(e); - } else { - return Err(anyhow::anyhow!("required field missing: {}", path)); - } - } - }; - - match val { - serde_json::Value::String(s) => { - if s.trim().is_empty() { - return Err(anyhow::anyhow!("required field empty: {}", path)); - } - } - serde_json::Value::Number(_) | serde_json::Value::Bool(_) => {} - serde_json::Value::Null - | serde_json::Value::Object(_) - | serde_json::Value::Array(_) => { - return Err(anyhow::anyhow!("required field not scalar: {}", path)); - } - } - checked_count += 1; - } + let field_count = collect_field_paths(input_val).len(); + let commitment_token_count = extract_commitment_tokens(input_val).len(); + let canonical = canonical_json(input_val); + let hash = sha256_hex(canonical); - Ok((schema_name, required_count, checked_count)) + Ok((hash, field_count, commitment_token_count)) } From f8db34ef6bdc478864088c89ff93dd4c9ffda57c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 13:02:11 -0700 Subject: [PATCH 06/10] test(sparkctl): cover optional PDF tables and warnings --- .../tests/spark_pdf_extraction_contract.rs | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs index c12cdeb..6d33cfe 100644 --- a/agy7rust/tests/spark_pdf_extraction_contract.rs +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -136,6 +136,17 @@ fn test_pdf_extraction_contract_rejects_unsupported_personal_data_risk() { assert_eq!(err, "contains_personal_data_risk unsupported"); } +#[test] +fn test_pdf_extraction_contract_allows_empty_tables() { + let mut value = load_fixture_value(); + value["tables"] = serde_json::json!([]); + + let validation = validate_pdf_extraction_contract_value(&value) + .expect("empty tables should be allowed"); + assert_eq!(validation.table_count, 0); + assert_eq!(validation.first_table_row_count, 0); +} + #[test] fn test_pdf_extraction_contract_rejects_empty_table_row() { let mut value = load_fixture_value(); @@ -158,6 +169,15 @@ fn test_pdf_extraction_contract_rejects_blank_table_cell() { assert_eq!(err, "tables.rows cell must not be empty"); } +#[test] +fn test_pdf_extraction_contract_allows_empty_warnings() { + let mut value = load_fixture_value(); + value["warnings"] = serde_json::json!([]); + + let validation = validate_pdf_extraction_contract_value(&value); + assert!(validation.is_ok()); +} + #[test] fn test_pdf_extraction_contract_rejects_blank_warning() { let mut value = load_fixture_value(); From b5e91e72d9a5dbb73bcb304f16bbe7db92cc36c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 13:24:44 -0700 Subject: [PATCH 07/10] style(sparkctl): format PDF contract tests From dc850c2624c1b0347576ace2ae98441e1d3ee82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 13:29:19 -0700 Subject: [PATCH 08/10] style(sparkctl): apply rustfmt to PDF contract tests --- agy7rust/tests/spark_pdf_extraction_contract.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs index 6d33cfe..334c9bc 100644 --- a/agy7rust/tests/spark_pdf_extraction_contract.rs +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -141,8 +141,8 @@ fn test_pdf_extraction_contract_allows_empty_tables() { let mut value = load_fixture_value(); value["tables"] = serde_json::json!([]); - let validation = validate_pdf_extraction_contract_value(&value) - .expect("empty tables should be allowed"); + let validation = + validate_pdf_extraction_contract_value(&value).expect("empty tables should be allowed"); assert_eq!(validation.table_count, 0); assert_eq!(validation.first_table_row_count, 0); } From 09f5c7b72cc2435a90ceb248531f40625b76d662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?= <159939812+ProfRandom92@users.noreply.github.com> Date: Sun, 7 Jun 2026 13:50:35 -0700 Subject: [PATCH 09/10] fix(sparkctl): validate schema required field paths --- agy7rust/src/codec/package.rs | 49 +++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs index 2e06fa4..b863a73 100644 --- a/agy7rust/src/codec/package.rs +++ b/agy7rust/src/codec/package.rs @@ -768,33 +768,44 @@ pub fn validate_schema( input_val: &serde_json::Value, schema_val: &serde_json::Value, ) -> anyhow::Result<(String, usize, usize)> { - // Basic placeholder schema check: ensure both are objects and required top-level keys exist. - let obj = input_val - .as_object() - .ok_or_else(|| anyhow::anyhow!("Input is not a JSON object"))?; - let schema_obj = schema_val .as_object() .ok_or_else(|| anyhow::anyhow!("Schema is not a JSON object"))?; - let required = schema_obj - .get("required") + let schema = schema_obj + .get("schema") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("schema mismatch"))?; + if schema != "SPARK-V7-SCHEMA" { + return Err(anyhow::anyhow!("schema mismatch")); + } + + let name = schema_obj + .get("name") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("schema name missing"))? + .to_string(); + + let required_paths = schema_obj + .get("required_field_paths") .and_then(|v| v.as_array()) - .ok_or_else(|| anyhow::anyhow!("Schema missing required array"))?; + .ok_or_else(|| anyhow::anyhow!("Schema missing required_field_paths array"))?; - for k in required { - let key = k + for path_value in required_paths { + let path = path_value .as_str() - .ok_or_else(|| anyhow::anyhow!("Required key is not a string"))?; - if !obj.contains_key(key) { - return Err(anyhow::anyhow!("required field missing: {}", key)); + .ok_or_else(|| anyhow::anyhow!("Required path is not a string"))?; + let value = get_value_by_path(input_val, path)?; + match value { + serde_json::Value::String(text) => { + if text.trim().is_empty() { + return Err(anyhow::anyhow!("required field empty: {}", path)); + } + } + serde_json::Value::Number(_) | serde_json::Value::Bool(_) => {} + _ => return Err(anyhow::anyhow!("required field not scalar: {}", path)), } } - let field_count = collect_field_paths(input_val).len(); - let commitment_token_count = extract_commitment_tokens(input_val).len(); - let canonical = canonical_json(input_val); - let hash = sha256_hex(canonical); - - Ok((hash, field_count, commitment_token_count)) + Ok((name, required_paths.len(), required_paths.len())) } From 82e0c2c389d1b8b6036981edd801ecdb738acf66 Mon Sep 17 00:00:00 2001 From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com> Date: Mon, 8 Jun 2026 18:14:08 +0200 Subject: [PATCH 10/10] fix(sparkctl): align pdf contract validation --- agy7rust/src/codec/package.rs | 8 +++ .../tests/spark_pdf_extraction_contract.rs | 11 ++++ agy7rust/tests/spark_roundtrip.rs | 19 ++++++- schemas/spark/pdf_extraction_v1.schema.json | 51 +++++++++++++++++-- 4 files changed, 85 insertions(+), 4 deletions(-) diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs index b863a73..0adc65d 100644 --- a/agy7rust/src/codec/package.rs +++ b/agy7rust/src/codec/package.rs @@ -780,6 +780,14 @@ pub fn validate_schema( return Err(anyhow::anyhow!("schema mismatch")); } + let version = schema_obj + .get("version") + .and_then(|v| v.as_i64()) + .ok_or_else(|| anyhow::anyhow!("unsupported schema version"))?; + if version != 1 { + return Err(anyhow::anyhow!("unsupported schema version")); + } + let name = schema_obj .get("name") .and_then(|v| v.as_str()) diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs index 334c9bc..798a8c8 100644 --- a/agy7rust/tests/spark_pdf_extraction_contract.rs +++ b/agy7rust/tests/spark_pdf_extraction_contract.rs @@ -158,6 +158,17 @@ fn test_pdf_extraction_contract_rejects_empty_table_row() { assert_eq!(err, "tables.rows row must not be empty"); } +#[test] +fn test_pdf_extraction_contract_rejects_empty_pages() { + let mut value = load_fixture_value(); + value["pages"] = serde_json::json!([]); + + let err = validate_pdf_extraction_contract_value(&value) + .unwrap_err() + .to_string(); + assert_eq!(err, "missing pages"); +} + #[test] fn test_pdf_extraction_contract_rejects_blank_table_cell() { let mut value = load_fixture_value(); diff --git a/agy7rust/tests/spark_roundtrip.rs b/agy7rust/tests/spark_roundtrip.rs index f104877..8ad814a 100644 --- a/agy7rust/tests/spark_roundtrip.rs +++ b/agy7rust/tests/spark_roundtrip.rs @@ -474,7 +474,24 @@ fn test_schema_checking_scenarios() { assert!(res.is_err()); assert_eq!(res.unwrap_err().to_string(), "schema mismatch"); - // 6. Unsupported path syntax fails cleanly + // 6. Missing schema version fails cleanly + let mut missing_version_schema = valid_schema.clone(); + missing_version_schema + .as_object_mut() + .unwrap() + .remove("version"); + let res = agy7rust::codec::package::validate_schema(&valid_input, &missing_version_schema); + assert!(res.is_err()); + assert_eq!(res.unwrap_err().to_string(), "unsupported schema version"); + + // 7. Unsupported schema version fails cleanly + let mut unsupported_version_schema = valid_schema.clone(); + unsupported_version_schema["version"] = json!(2); + let res = agy7rust::codec::package::validate_schema(&valid_input, &unsupported_version_schema); + assert!(res.is_err()); + assert_eq!(res.unwrap_err().to_string(), "unsupported schema version"); + + // 8. Unsupported path syntax fails cleanly let mut unsupported_path_schema = valid_schema.clone(); unsupported_path_schema["required_field_paths"] = json!(["$.extraction.fields[0].parcel_id"]); let res = agy7rust::codec::package::validate_schema(&valid_input, &unsupported_path_schema); diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json index 0f38343..637aa12 100644 --- a/schemas/spark/pdf_extraction_v1.schema.json +++ b/schemas/spark/pdf_extraction_v1.schema.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://comptext.local/schemas/spark/pdf_extraction_v1.schema.json", - "title": "SPARK PDF Extraction V1", + "title": "SPARK-Style PDF Extraction V1", "description": "Deterministic structured-data contract for external or manual PDF extraction outputs used as input evidence.", "type": "object", "additionalProperties": false, @@ -55,6 +55,7 @@ }, "pages": { "type": "array", + "minItems": 1, "items": { "$ref": "#/$defs/page" } @@ -73,7 +74,48 @@ }, "extracted_fields": { "type": "object", - "additionalProperties": true + "additionalProperties": true, + "required": [ + "procedure_goal", + "authority", + "decision_points", + "required_documents", + "review_required", + "public_sector_context" + ], + "properties": { + "procedure_goal": { + "type": "string", + "minLength": 1 + }, + "authority": { + "type": "string", + "minLength": 1 + }, + "decision_points": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "required_documents": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "review_required": { + "const": true + }, + "public_sector_context": { + "type": "string", + "minLength": 1 + } + } }, "warnings": { "type": "array", @@ -145,10 +187,13 @@ }, "rows": { "type": "array", + "minItems": 1, "items": { "type": "array", + "minItems": 1, "items": { - "type": "string" + "type": "string", + "minLength": 1 } } }