From 8eaef645876371e50105f7d7ee6a122a6a37ddb8 Mon Sep 17 00:00:00 2001
From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 17:38:09 +0200
Subject: [PATCH 01/10] feat(sparkctl): add PDF extraction contract fixture

---
 .../tests/spark_pdf_extraction_contract.rs    |  61 +++++
 docs/use-cases/PDF_TO_EVIDENCE_PACKET.md      |  53 +++++
 examples/spark/pdf_extraction_fixture.json    |  92 ++++++++
 schemas/spark/pdf_extraction_v1.schema.json   | 218 ++++++++++++++++++
 4 files changed, 424 insertions(+)
 create mode 100644 agy7rust/tests/spark_pdf_extraction_contract.rs
 create mode 100644 docs/use-cases/PDF_TO_EVIDENCE_PACKET.md
 create mode 100644 examples/spark/pdf_extraction_fixture.json
 create mode 100644 schemas/spark/pdf_extraction_v1.schema.json

diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
new file mode 100644
index 0000000..aaf5d05
--- /dev/null
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -0,0 +1,61 @@
+use agy7rust::codec::package::canonical_json;
+use agy7rust::sha256_hex;
+use serde_json::Value;
+use std::fs;
+
+#[test]
+fn test_pdf_extraction_fixture_contract_shape() {
+    let fixture = fs::read_to_string("../examples/spark/pdf_extraction_fixture.json")
+        .expect("failed to read PDF extraction fixture");
+    let value: Value = serde_json::from_str(&fixture).expect("fixture should parse as JSON");
+
+    assert_eq!(value["schema_version"], "PDF-EXTRACTION-V1");
+    assert_non_empty_string(&value["source_file"], "source_file");
+    assert_eq!(value["tool_metadata"]["converter"], "manual");
+    assert_eq!(value["tool_metadata"]["extraction_mode"], "manual_fixture");
+
+    let extracted_fields = &value["extracted_fields"];
+    assert_non_empty_string(
+        &extracted_fields["procedure_goal"],
+        "extracted_fields.procedure_goal",
+    );
+    assert_non_empty_string(&extracted_fields["authority"], "extracted_fields.authority");
+    assert_non_empty_array(
+        &extracted_fields["decision_points"],
+        "extracted_fields.decision_points",
+    );
+    assert_non_empty_array(
+        &extracted_fields["required_documents"],
+        "extracted_fields.required_documents",
+    );
+    assert_eq!(extracted_fields["review_required"], true);
+
+    let tables = value["tables"]
+        .as_array()
+        .expect("tables should be an array");
+    let first_table = tables.first().expect("fixture should include a table");
+    let first_table_rows = first_table["rows"]
+        .as_array()
+        .expect("first table rows should be an array");
+    assert_eq!(first_table_rows.len(), 3);
+
+    let canonical = canonical_json(&value);
+    let hash_once = sha256_hex(&canonical);
+    let hash_twice = sha256_hex(canonical_json(&value));
+    assert_eq!(hash_once, hash_twice);
+    assert_eq!(hash_once.len(), 64);
+}
+
+fn assert_non_empty_string(value: &Value, label: &str) {
+    assert!(
+        value.as_str().is_some_and(|text| !text.trim().is_empty()),
+        "{label} should be a non-empty string"
+    );
+}
+
+fn assert_non_empty_array(value: &Value, label: &str) {
+    assert!(
+        value.as_array().is_some_and(|items| !items.is_empty()),
+        "{label} should be a non-empty array"
+    );
+}
diff --git a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md
new file mode 100644
index 0000000..aa34e7f
--- /dev/null
+++ b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md
@@ -0,0 +1,53 @@
+# PDF To Evidence Packet Adapter Contract
+
+## Purpose
+
+External or manual PDF extraction can produce a structured JSON artifact for `sparkctl` evidence workflows. `sparkctl` treats that JSON as input evidence, not as truth by itself.
+
+The `PDF-EXTRACTION-V1` contract gives future converters and manual fixtures a deterministic shape for extracted administrative fields, page summaries, tables, figures, warnings, and converter metadata.
+
+## Pipeline
+
+PDF or text fixture -> PDF-EXTRACTION-V1 JSON -> Context Pack / Evidence Packet -> Policy Gate -> Human Review -> Artifact Manifest
+
+The structured extraction can inform a Context Pack and be recorded in an Evidence Packet. It does not replace the Context Pack, the Policy Gate, or Human Review.
+
+## Boundaries
+
+- This PR does not implement OCR.
+- This PR does not parse PDFs.
+- This PR does not include protected SPARK data.
+- This PR does not include real Daimler data.
+- This PR does not include real medical or ePA data.
+- This PR makes no production claim.
+- This PR makes no compliance, legal, or forensic claim.
+- Provider output remains untrusted until reviewed.
+- Human review remains required.
+- No real source PDF is committed.
+
+## Converter Strategy
+
+Docling, MinerU, Marker, pdftotext, and manual processes can be future producers of the same schema. This PR defines only the adapter contract.
+
+The contract records converter name, converter version, and extraction mode so reviewers can distinguish synthetic fixtures, manual fixtures, and future external-tool outputs.
+
+## Future Codex-Style Plugin Bundle Readiness
+
+Future Codex-style plugin bundles could expose skills, commands, hooks, and declared artifacts around this contract.
+
+This repository does not claim official OpenAI plugin compatibility or plugin-directory availability. A future bundle would still need human review, explicit connector boundaries, declared artifacts, and repository-specific approval before any write-capable action.
+
+## Method Precedent
+
+This follows prior CompText privacy-preserving synthetic fixture and replay-contract patterns. The fixture is synthetic and bounded so reviewers can inspect the contract without relying on protected source material.
+
+## Review Use
+
+Reviewers should check:
+
+- the fixture uses `schema_version: PDF-EXTRACTION-V1`
+- the source file path is descriptive but no source PDF is committed
+- extracted fields are administrative and synthetic
+- warnings state the fixture limits
+- converter metadata identifies manual fixture preparation
+- downstream evidence packets preserve the source and warning context
diff --git a/examples/spark/pdf_extraction_fixture.json b/examples/spark/pdf_extraction_fixture.json
new file mode 100644
index 0000000..b355775
--- /dev/null
+++ b/examples/spark/pdf_extraction_fixture.json
@@ -0,0 +1,92 @@
+{
+  "schema_version": "PDF-EXTRACTION-V1",
+  "source_file": "examples/spark/synthetic_bauantrag_fixture.pdf",
+  "document_type": "synthetic_public_admin_planning_fixture",
+  "license_or_usage_note": "Synthetic fixture for local contract testing only.",
+  "sanitization_status": "synthetic_no_real_personal_data",
+  "contains_personal_data_risk": "low",
+  "pages": [
+    {
+      "page_number": 1,
+      "text_summary": "Synthetic building and planning application overview with authority, procedure goal, zoning placeholder, and deadline note.",
+      "field_refs": [
+        "procedure_goal",
+        "authority",
+        "public_sector_context"
+      ]
+    },
+    {
+      "page_number": 2,
+      "text_summary": "Synthetic review appendix with environmental note, protected-area placeholder, flood-zone placeholder, required attachments, and decision points.",
+      "field_refs": [
+        "decision_points",
+        "required_documents",
+        "review_required"
+      ]
+    }
+  ],
+  "tables": [
+    {
+      "table_id": "table_required_documents_v1",
+      "page_number": 2,
+      "caption": "Synthetic required attachments for planning review",
+      "columns": [
+        "document",
+        "status",
+        "review_note"
+      ],
+      "rows": [
+        [
+          "site_plan",
+          "required",
+          "Check zoning placeholder reference ZONE-A"
+        ],
+        [
+          "environmental_note",
+          "required",
+          "Review protected-area placeholder reference ENV-P"
+        ],
+        [
+          "flood_zone_statement",
+          "conditional",
+          "Review flood-zone placeholder reference FLOOD-X"
+        ]
+      ]
+    }
+  ],
+  "figures": [
+    {
+      "figure_id": "figure_site_context_v1",
+      "page_number": 1,
+      "description": "Synthetic site-context sketch descriptor; no image or source PDF is committed."
+    }
+  ],
+  "extracted_fields": {
+    "procedure_goal": "Assess a synthetic building and planning application for review routing and evidence packaging.",
+    "authority": "Synthetic Municipal Planning Office",
+    "decision_points": [
+      "Confirm zoning placeholder reference ZONE-A is addressed.",
+      "Check environmental note for protected-area placeholder reference ENV-P.",
+      "Decide whether flood-zone placeholder reference FLOOD-X requires additional review.",
+      "Record deadline note before moving to human review."
+    ],
+    "required_documents": [
+      "site_plan",
+      "environmental_note",
+      "flood_zone_statement",
+      "deadline_note"
+    ],
+    "review_required": true,
+    "public_sector_context": "Synthetic administrative planning workflow for local evidence-packet contract testing."
+  },
+  "warnings": [
+    "Manual fixture; not extracted from a real PDF.",
+    "Synthetic administrative content only; no protected personal data is included."
+  ],
+  "tool_metadata": {
+    "converter": "manual",
+    "converter_version": "fixture-v1",
+    "extraction_mode": "manual_fixture",
+    "notes": "Prepared by hand to exercise the PDF-EXTRACTION-V1 adapter contract."
+  }
+}
diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json
new file mode 100644
index 0000000..2a28064
--- /dev/null
+++ b/schemas/spark/pdf_extraction_v1.schema.json
@@ -0,0 +1,218 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://comptext.local/schemas/spark/pdf_extraction_v1.schema.json",
+  "title": "SPARK PDF Extraction V1",
+  "description": "Deterministic structured-data contract for external or manual PDF extraction outputs used as input evidence.",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "schema_version",
+    "source_file",
+    "document_type",
+    "pages",
+    "tables",
+    "figures",
+    "extracted_fields",
+    "warnings",
+    "tool_metadata"
+  ],
+  "properties": {
+    "schema_version": {
+      "const": "PDF-EXTRACTION-V1"
+    },
+    "source_file": {
+      "type": "string",
+      "minLength": 1
+    },
+    "source_sha256": {
+      "type": "string",
+      "pattern": "^[a-f0-9]{64}$"
+    },
+    "source_url": {
+      "type": "string",
+      "minLength": 1
+    },
+    "license_or_usage_note": {
+      "type": "string",
+      "minLength": 1
+    },
+    "sanitization_status": {
+      "type": "string",
+      "minLength": 1
+    },
+    "contains_personal_data_risk": {
+      "type": "string",
+      "enum": [
+        "none",
+        "low",
+        "review_required",
+        "unknown"
+      ]
+    },
+    "document_type": {
+      "type": "string",
+      "minLength": 1
+    },
+    "pages": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/page"
+      }
+    },
+    "tables": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/table"
+      }
+    },
+    "figures": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/figure"
+      }
+    },
+    "extracted_fields": {
+      "type": "object",
+      "additionalProperties": true
+    },
+    "warnings": {
+      "type": "array",
+      "items": {
+        "type": "string",
+        "minLength": 1
+      }
+    },
+    "tool_metadata": {
+      "$ref": "#/$defs/tool_metadata"
+    }
+  },
+  "$defs": {
+    "page": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "page_number",
+        "text_summary"
+      ],
+      "properties": {
+        "page_number": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "text_summary": {
+          "type": "string",
+          "minLength": 1
+        },
+        "field_refs": {
+          "type": "array",
+          "items": {
+            "type": "string",
+            "minLength": 1
+          }
+        }
+      }
+    },
+    "table": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "table_id",
+        "page_number",
+        "caption",
+        "columns",
+        "rows"
+      ],
+      "properties": {
+        "table_id": {
+          "type": "string",
+          "minLength": 1
+        },
+        "page_number": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "caption": {
+          "type": "string",
+          "minLength": 1
+        },
+        "columns": {
+          "type": "array",
+          "minItems": 1,
+          "items": {
+            "type": "string",
+            "minLength": 1
+          }
+        },
+        "rows": {
+          "type": "array",
+          "items": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          }
+        }
+      }
+    },
+    "figure": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "figure_id",
+        "page_number",
+        "description"
+      ],
+      "properties": {
+        "figure_id": {
+          "type": "string",
+          "minLength": 1
+        },
+        "page_number": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "description": {
+          "type": "string",
+          "minLength": 1
+        }
+      }
+    },
+    "tool_metadata": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "converter",
+        "converter_version",
+        "extraction_mode"
+      ],
+      "properties": {
+        "converter": {
+          "type": "string",
+          "enum": [
+            "manual",
+            "docling",
+            "mineru",
+            "marker",
+            "pdftotext",
+            "other"
+          ]
+        },
+        "converter_version": {
+          "type": "string",
+          "minLength": 1
+        },
+        "extraction_mode": {
+          "type": "string",
+          "enum": [
+            "synthetic_fixture",
+            "manual_fixture",
+            "external_tool"
+          ]
+        },
+        "notes": {
+          "type": "string"
+        }
+      }
+    }
+  }
+}

From ddf5af967ee4b020b84f082cdc854098bac48e24 Mon Sep 17 00:00:00 2001
From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 19:11:08 +0200
Subject: [PATCH 02/10] feat(sparkctl): add PDF extraction runtime validation

---
 .../skills/pdf-extraction-contracts/SKILL.md  |  82 +++++++
 agy7rust/src/codec/package.rs                 | 228 ++++++++++++++++++
 agy7rust/src/lib.rs                           |  10 +-
 .../tests/spark_pdf_extraction_contract.rs    |  57 ++++-
 docs/use-cases/PDF_TO_EVIDENCE_PACKET.md      |   6 +
 5 files changed, 372 insertions(+), 11 deletions(-)
 create mode 100644 .agents/skills/pdf-extraction-contracts/SKILL.md

diff --git a/.agents/skills/pdf-extraction-contracts/SKILL.md b/.agents/skills/pdf-extraction-contracts/SKILL.md
new file mode 100644
index 0000000..dadb298
--- /dev/null
+++ b/.agents/skills/pdf-extraction-contracts/SKILL.md
@@ -0,0 +1,82 @@
+# Skill: PDF Extraction Contracts
+
+## Purpose
+
+Guide work on `PDF-EXTRACTION-V1` structured-data artifacts for SPARK-like administrative workflows.
+
+## Use This Skill When
+
+- Adding or reviewing PDF extraction fixtures.
+- Updating `schemas/spark/pdf_extraction_v1.schema.json`.
+- Validating external or manual extraction JSON before evidence packaging.
+- Connecting PDF extraction artifacts to Context Pack or Evidence Packet workflows.
+
+## Contract Boundary
+
+`PDF-EXTRACTION-V1` is an adapter contract. It accepts structured JSON from manual or external extraction tools as input evidence.
+
+It does not:
+
+- implement OCR
+- parse PDFs
+- download source PDFs
+- call providers
+- create a Codex plugin bundle
+- create an MCP server
+- create hooks or commands
+- claim official OpenAI plugin compatibility
+- claim official SPARK compatibility
+- replace human review
+
+## Required Fields
+
+Every artifact must include:
+
+- `schema_version`
+- `source_file`
+- `document_type`
+- `pages`
+- `tables`
+- `figures`
+- `extracted_fields`
+- `warnings`
+- `tool_metadata`
+
+`schema_version` must be `PDF-EXTRACTION-V1`.
+
+`tool_metadata.converter` must be one of:
+
+- `manual`
+- `docling`
+- `mineru`
+- `marker`
+- `pdftotext`
+- `other`
+
+`tool_metadata.extraction_mode` must be one of:
+
+- `synthetic_fixture`
+- `manual_fixture`
+- `external_tool`
+
+## Fixture Rules
+
+Synthetic fixtures must not include protected personal data, real SPARK data, real Daimler data, real medical data, or real ePA data.
+
+Do not commit source PDFs unless a future task explicitly approves that artifact and license boundary.
+
+## Validation
+
+Prefer local runtime validation with `validate_pdf_extraction_contract_value` and deterministic canonical hashing with the existing `canonical_json` and `sha256_hex` helpers.
+
+For Rust changes, run:
+
+- `cargo fmt --all --check`
+- `cargo test`
+- `cargo clippy --all-targets --all-features -- -D warnings`
+
+## Claim Boundaries
+
+Use bounded wording: adapter contract, structured input evidence, manual fixture, external-tool output, review input, artifact manifest.
+
+Do not claim production readiness, compliance or certification, legal evidentiary status, forensic certainty, official SPARK compatibility, official OpenAI plugin compatibility, autonomous approval, or guaranteed correctness.
diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index 584c4e8..042feca 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -1,6 +1,7 @@
 use crate::codec::hash::sha256_hex;
 use serde::{Deserialize, Serialize};
 use serde_json;
+use std::collections::BTreeMap;
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum PolicyResult {
@@ -76,6 +77,81 @@ pub struct SparkEvidencePacketEnvelope {
     pub canonical_hash: String,
 }
 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionDocument {
+    pub schema_version: String,
+    pub source_file: String,
+    pub source_sha256: Option<String>,
+    pub source_url: Option<String>,
+    pub license_or_usage_note: Option<String>,
+    pub sanitization_status: Option<String>,
+    pub contains_personal_data_risk: Option<String>,
+    pub document_type: String,
+    pub pages: Vec<PdfExtractionPage>,
+    pub tables: Vec<PdfExtractionTable>,
+    pub figures: Vec<PdfExtractionFigure>,
+    pub extracted_fields: PdfExtractedFields,
+    pub warnings: Vec<String>,
+    pub tool_metadata: PdfExtractionToolMetadata,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionPage {
+    pub page_number: u64,
+    pub text_summary: String,
+    pub field_refs: Option<Vec<String>>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionTable {
+    pub table_id: String,
+    pub page_number: u64,
+    pub caption: String,
+    pub columns: Vec<String>,
+    pub rows: Vec<Vec<String>>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionFigure {
+    pub figure_id: String,
+    pub page_number: u64,
+    pub description: String,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct PdfExtractedFields {
+    pub procedure_goal: String,
+    pub authority: String,
+    pub decision_points: Vec<String>,
+    pub required_documents: Vec<String>,
+    pub review_required: bool,
+    pub public_sector_context: String,
+    #[serde(flatten)]
+    pub additional_fields: BTreeMap<String, serde_json::Value>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionToolMetadata {
+    pub converter: String,
+    pub converter_version: String,
+    pub extraction_mode: String,
+    pub notes: Option<String>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PdfExtractionValidation {
+    pub canonical_json: String,
+    pub canonical_hash: String,
+    pub page_count: usize,
+    pub table_count: usize,
+    pub first_table_row_count: usize,
+}
+
 pub fn sort_json_value(value: &serde_json::Value) -> serde_json::Value {
     match value {
         serde_json::Value::Object(map) => {
@@ -140,6 +216,158 @@ pub fn validate_spark_evidence_packet_value(value: &serde_json::Value) -> anyhow
     validate_spark_evidence_packet_envelope(&envelope)
 }
 
+/// Validates a `PDF-EXTRACTION-V1` JSON value and returns its canonical JSON, hash, and counts.
+pub fn validate_pdf_extraction_contract_value(
+    value: &serde_json::Value,
+) -> anyhow::Result<PdfExtractionValidation> {
+    // `&Value` is a serde `Deserializer`, so decode it in place rather than deep-cloning it.
+    let document = PdfExtractionDocument::deserialize(value)?;
+    validate_pdf_extraction_document(&document)?;
+    let canonical = canonical_json(value);
+    let canonical_hash = sha256_hex(&canonical);
+    let first_table_row_count = document
+        .tables
+        .first()
+        .map(|table| table.rows.len())
+        .unwrap_or(0);
+    Ok(PdfExtractionValidation {
+        canonical_json: canonical,
+        canonical_hash,
+        page_count: document.pages.len(),
+        table_count: document.tables.len(),
+        first_table_row_count,
+    })
+}
+
+fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow::Result<()> {
+    require_exact(
+        "schema_version",
+        &document.schema_version,
+        "PDF-EXTRACTION-V1",
+    )?;
+    require_non_empty("source_file", &document.source_file)?;
+    require_non_empty("document_type", &document.document_type)?;
+    require_allowed(
+        "tool_metadata.converter",
+        &document.tool_metadata.converter,
+        &[
+            "manual",
+            "docling",
+            "mineru",
+            "marker",
+            "pdftotext",
+            "other",
+        ],
+    )?;
+    require_non_empty(
+        "tool_metadata.converter_version",
+        &document.tool_metadata.converter_version,
+    )?;
+    require_allowed(
+        "tool_metadata.extraction_mode",
+        &document.tool_metadata.extraction_mode,
+        &["synthetic_fixture", "manual_fixture", "external_tool"],
+    )?;
+
+    require_non_empty_pages(&document.pages)?;
+    require_non_empty_tables(&document.tables)?;
+    require_non_empty_list("warnings", &document.warnings)?;
+    validate_pdf_extracted_fields(&document.extracted_fields)?;
+
+    for figure in &document.figures {
+        require_non_empty("figures.figure_id", &figure.figure_id)?;
+        require_non_zero("figures.page_number", figure.page_number)?;
+        require_non_empty("figures.description", &figure.description)?;
+    }
+
+    if let Some(hash) = &document.source_sha256 {
+        validate_sha256_hex("source_sha256", hash)?;
+    }
+
+    Ok(())
+}
+
+fn validate_pdf_extracted_fields(fields: &PdfExtractedFields) -> anyhow::Result<()> {
+    require_non_empty("extracted_fields.procedure_goal", &fields.procedure_goal)?;
+    require_non_empty("extracted_fields.authority", &fields.authority)?;
+    require_non_empty_list("extracted_fields.decision_points", &fields.decision_points)?;
+    require_non_empty_list(
+        "extracted_fields.required_documents",
+        &fields.required_documents,
+    )?;
+    require_non_empty(
+        "extracted_fields.public_sector_context",
+        &fields.public_sector_context,
+    )?;
+
+    Ok(())
+}
+
+fn require_exact(label: &str, value: &str, expected: &str) -> anyhow::Result<()> {
+    if value != expected {
+        return Err(anyhow::anyhow!("{} mismatch", label));
+    }
+    Ok(())
+}
+
+fn require_allowed(label: &str, value: &str, allowed: &[&str]) -> anyhow::Result<()> {
+    if !allowed.contains(&value) {
+        return Err(anyhow::anyhow!("{} unsupported", label));
+    }
+    Ok(())
+}
+
+fn require_non_zero(label: &str, value: u64) -> anyhow::Result<()> {
+    if value == 0 {
+        return Err(anyhow::anyhow!("{} must be greater than zero", label));
+    }
+    Ok(())
+}
+
+/// Accepts only a 64-character lowercase hex digest; anything else is rejected.
+fn validate_sha256_hex(label: &str, value: &str) -> anyhow::Result<()> {
+    // A single byte-wise pass covers length, hex alphabet, and letter case at once.
+    let is_lowercase_hex = |byte: u8| matches!(byte, b'0'..=b'9' | b'a'..=b'f');
+    if value.len() == 64 && value.bytes().all(is_lowercase_hex) {
+        return Ok(());
+    }
+    Err(anyhow::anyhow!("{} must be lowercase SHA-256 hex", label))
+}
+
+fn require_non_empty_pages(pages: &[PdfExtractionPage]) -> anyhow::Result<()> {
+    if pages.is_empty() {
+        return Err(anyhow::anyhow!("missing pages"));
+    }
+
+    for page in pages {
+        require_non_zero("pages.page_number", page.page_number)?;
+        require_non_empty("pages.text_summary", &page.text_summary)?;
+        if let Some(field_refs) = &page.field_refs {
+            require_non_empty_list("pages.field_refs", field_refs)?;
+        }
+    }
+
+    Ok(())
+}
+
+fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> {
+    if tables.is_empty() {
+        return Err(anyhow::anyhow!("missing tables"));
+    }
+
+    for table in tables {
+        require_non_empty("tables.table_id", &table.table_id)?;
+        require_non_zero("tables.page_number", table.page_number)?;
+        require_non_empty("tables.caption", &table.caption)?;
+        require_non_empty_list("tables.columns", &table.columns)?;
+        if table.rows.is_empty() {
+            return Err(anyhow::anyhow!("missing tables.rows"));
+        }
+    }
+
+    Ok(())
+}
+
 fn validate_spark_evidence_preimage(preimage: &SparkEvidencePacketPreimage) -> anyhow::Result<()> {
     require_non_empty("schema_version", &preimage.schema_version)?;
     if preimage.schema_version != "SPARK-EVIDENCE-PACKET-V1" {
diff --git a/agy7rust/src/lib.rs b/agy7rust/src/lib.rs
index 5a6e975..04c3e24 100644
--- a/agy7rust/src/lib.rs
+++ b/agy7rust/src/lib.rs
@@ -9,8 +9,10 @@ pub use codec::hash::sha256_hex;
 pub use codec::package::{
     build_package_from_value, build_spark_evidence_packet_envelope, canonical_json,
     collect_field_paths, extract_commitment_tokens, get_value_by_path, replay_package_value,
-    sort_json_value, validate_schema, validate_spark_evidence_packet_envelope,
-    validate_spark_evidence_packet_value, verify_package_value, ArtifactManifestEntry,
-    ClaimHygiene, HumanReviewDecision, PolicyResult, ProviderBoundaryStatus,
-    SparkEvidencePacketEnvelope, SparkEvidencePacketPreimage,
+    sort_json_value, validate_pdf_extraction_contract_value, validate_schema,
+    validate_spark_evidence_packet_envelope, validate_spark_evidence_packet_value,
+    verify_package_value, ArtifactManifestEntry, ClaimHygiene, HumanReviewDecision,
+    PdfExtractedFields, PdfExtractionDocument, PdfExtractionFigure, PdfExtractionPage,
+    PdfExtractionTable, PdfExtractionToolMetadata, PdfExtractionValidation, PolicyResult,
+    ProviderBoundaryStatus, SparkEvidencePacketEnvelope, SparkEvidencePacketPreimage,
 };
diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
index aaf5d05..a43bfe4 100644
--- a/agy7rust/tests/spark_pdf_extraction_contract.rs
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -1,5 +1,4 @@
-use agy7rust::codec::package::canonical_json;
-use agy7rust::sha256_hex;
+use agy7rust::validate_pdf_extraction_contract_value;
 use serde_json::Value;
 use std::fs;
 
@@ -8,6 +7,8 @@ fn test_pdf_extraction_fixture_contract_shape() {
     let fixture = fs::read_to_string("../examples/spark/pdf_extraction_fixture.json")
         .expect("failed to read PDF extraction fixture");
     let value: Value = serde_json::from_str(&fixture).expect("fixture should parse as JSON");
+    let validation =
+        validate_pdf_extraction_contract_value(&value).expect("fixture contract should validate");
 
     assert_eq!(value["schema_version"], "PDF-EXTRACTION-V1");
     assert_non_empty_string(&value["source_file"], "source_file");
@@ -39,11 +40,47 @@ fn test_pdf_extraction_fixture_contract_shape() {
         .expect("first table rows should be an array");
     assert_eq!(first_table_rows.len(), 3);
 
-    let canonical = canonical_json(&value);
-    let hash_once = sha256_hex(&canonical);
-    let hash_twice = sha256_hex(canonical_json(&value));
-    assert_eq!(hash_once, hash_twice);
-    assert_eq!(hash_once.len(), 64);
+    assert_eq!(validation.page_count, 2);
+    assert_eq!(validation.table_count, 1);
+    assert_eq!(validation.first_table_row_count, 3);
+    assert_eq!(validation.canonical_hash.len(), 64);
+    assert!(!validation.canonical_json.is_empty());
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_wrong_schema_version() {
+    let mut value = load_fixture_value();
+    value["schema_version"] = Value::String("PDF-EXTRACTION-V0".to_string());
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "schema_version mismatch");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_missing_required_field() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]
+        .as_object_mut()
+        .expect("extracted_fields should be an object")
+        .remove("procedure_goal");
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert!(err.contains("missing field `procedure_goal`"));
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_unsupported_converter() {
+    let mut value = load_fixture_value();
+    value["tool_metadata"]["converter"] = Value::String("unsupported".to_string());
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "tool_metadata.converter unsupported");
 }
 
 fn assert_non_empty_string(value: &Value, label: &str) {
@@ -59,3 +96,9 @@ fn assert_non_empty_array(value: &Value, label: &str) {
         "{label} should be a non-empty array"
     );
 }
+
+fn load_fixture_value() -> Value {
+    let fixture = fs::read_to_string("../examples/spark/pdf_extraction_fixture.json")
+        .expect("failed to read PDF extraction fixture");
+    serde_json::from_str(&fixture).expect("fixture should parse as JSON")
+}
diff --git a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md
index aa34e7f..c5f02da 100644
--- a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md
+++ b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md
@@ -12,6 +12,12 @@ PDF or text fixture -> PDF-EXTRACTION-V1 JSON -> Context Pack / Evidence Packet
 
 The structured extraction can inform a Context Pack and be recorded in an Evidence Packet. It does not replace the Context Pack, the Policy Gate, or Human Review.
 
+## Runtime Contract Validation
+
+The Rust crate exposes local runtime validation for `PDF-EXTRACTION-V1` JSON values. The validator checks the declared fixture contract shape and computes a deterministic SHA-256 hash over canonical JSON with existing helpers.
+
+This validation is a local contract check. It does not perform OCR, parse PDFs, call providers, or verify that extracted text is true.
+
 ## Boundaries
 
 - This PR does not implement OCR.

From 81ffea740e6dd135835cb7b1c2aa063a0d809ff4 Mon Sep 17 00:00:00 2001
From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 19:34:04 +0200
Subject: [PATCH 03/10] fix(sparkctl): close PDF extraction validation gaps

---
 .../skills/pdf-extraction-contracts/SKILL.md  |  9 ++
 agy7rust/src/codec/package.rs                 |  5 +
 .../tests/spark_pdf_extraction_contract.rs    | 91 ++++++++++++++++++-
 3 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/.agents/skills/pdf-extraction-contracts/SKILL.md b/.agents/skills/pdf-extraction-contracts/SKILL.md
index dadb298..995e4b3 100644
--- a/.agents/skills/pdf-extraction-contracts/SKILL.md
+++ b/.agents/skills/pdf-extraction-contracts/SKILL.md
@@ -69,6 +69,15 @@ Do not commit source PDFs unless a future task explicitly approves that artifact
 
 Prefer local runtime validation with `validate_pdf_extraction_contract_value` and deterministic canonical hashing with the existing `canonical_json` and `sha256_hex` helpers.
 
+Report Agent Governor gate states using exactly one of:
+
+- `pass`
+- `fail`
+- `not_applicable`
+- `deferred`
+
+Use `not_applicable` only when a gate does not apply, and explain why. Use `deferred` when the gate is required but intentionally left for later human/tool review.
+
 For Rust changes, run:
 
 - `cargo fmt --all --check`
diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index 042feca..d9bff89 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -295,6 +295,11 @@ fn validate_pdf_extracted_fields(fields: &PdfExtractedFields) -> anyhow::Result<
         "extracted_fields.required_documents",
         &fields.required_documents,
     )?;
+    if !fields.review_required {
+        return Err(anyhow::anyhow!(
+            "PDF extraction extracted_fields.review_required must be true"
+        ));
+    }
     require_non_empty(
         "extracted_fields.public_sector_context",
         &fields.public_sector_context,
diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
index a43bfe4..be75aaa 100644
--- a/agy7rust/tests/spark_pdf_extraction_contract.rs
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -59,7 +59,38 @@ fn test_pdf_extraction_contract_rejects_wrong_schema_version() {
 }
 
 #[test]
-fn test_pdf_extraction_contract_rejects_missing_required_field() {
+fn test_pdf_extraction_contract_rejects_unknown_top_level_field() {
+    let mut value = load_fixture_value();
+    value
+        .as_object_mut()
+        .expect("fixture should be an object")
+        .insert(
+            "unexpected_field".to_string(),
+            Value::String("tamper".to_string()),
+        );
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert!(err.contains("unknown field `unexpected_field`"));
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_missing_required_top_level_field() {
+    let mut value = load_fixture_value();
+    value
+        .as_object_mut()
+        .expect("fixture should be an object")
+        .remove("source_file");
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert!(err.contains("missing field `source_file`"));
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_missing_required_extracted_field() {
     let mut value = load_fixture_value();
     value["extracted_fields"]
         .as_object_mut()
@@ -83,6 +114,64 @@ fn test_pdf_extraction_contract_rejects_unsupported_converter() {
     assert_eq!(err, "tool_metadata.converter unsupported");
 }
 
+#[test]
+fn test_pdf_extraction_contract_rejects_unsupported_extraction_mode() {
+    let mut value = load_fixture_value();
+    value["tool_metadata"]["extraction_mode"] = Value::String("unsupported".to_string());
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "tool_metadata.extraction_mode unsupported");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_blank_warning() {
+    let mut value = load_fixture_value();
+    value["warnings"] = serde_json::json!(["manual fixture", "   "]);
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "warnings");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_blank_procedure_goal() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]["procedure_goal"] = Value::String("  ".to_string());
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "missing extracted_fields.procedure_goal");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_empty_decision_points() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]["decision_points"] = serde_json::json!([]);
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "extracted_fields.decision_points");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_review_required_false() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]["review_required"] = Value::Bool(false);
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(
+        err,
+        "PDF extraction extracted_fields.review_required must be true"
+    );
+}
+
 fn assert_non_empty_string(value: &Value, label: &str) {
     assert!(
         value.as_str().is_some_and(|text| !text.trim().is_empty()),

From ac8d066356a44722cfb838578dd5103e94f2a5f4 Mon Sep 17 00:00:00 2001
From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 21:22:04 +0200
Subject: [PATCH 04/10] fix(sparkctl): align PDF extraction schema validation

---
 agy7rust/src/codec/package.rs                 | 17 ++++++++++
 .../tests/spark_pdf_extraction_contract.rs    | 33 +++++++++++++++++++
 schemas/spark/pdf_extraction_v1.schema.json   |  4 +--
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index d9bff89..adc3158 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -246,6 +246,13 @@ fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow:
         "PDF-EXTRACTION-V1",
     )?;
     require_non_empty("source_file", &document.source_file)?;
+    if let Some(risk) = &document.contains_personal_data_risk {
+        require_allowed(
+            "contains_personal_data_risk",
+            risk,
+            &["low", "medium", "high", "unknown"],
+        )?;
+    }
     require_non_empty("document_type", &document.document_type)?;
     require_allowed(
         "tool_metadata.converter",
@@ -368,6 +375,16 @@ fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()>
         if table.rows.is_empty() {
             return Err(anyhow::anyhow!("missing tables.rows"));
         }
+        for row in &table.rows {
+            if row.is_empty() {
+                return Err(anyhow::anyhow!("tables.rows row must not be empty"));
+            }
+            for cell in row {
+                if cell.trim().is_empty() {
+                    return Err(anyhow::anyhow!("tables.rows cell must not be empty"));
+                }
+            }
+        }
     }
 
     Ok(())
diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
index be75aaa..c12cdeb 100644
--- a/agy7rust/tests/spark_pdf_extraction_contract.rs
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -125,6 +125,39 @@ fn test_pdf_extraction_contract_rejects_unsupported_extraction_mode() {
     assert_eq!(err, "tool_metadata.extraction_mode unsupported");
 }
 
+#[test]
+fn test_pdf_extraction_contract_rejects_unsupported_personal_data_risk() {
+    let mut value = load_fixture_value();
+    value["contains_personal_data_risk"] = Value::String("review_required".to_string());
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "contains_personal_data_risk unsupported");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_empty_table_row() {
+    let mut value = load_fixture_value();
+    value["tables"][0]["rows"][0] = serde_json::json!([]);
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "tables.rows row must not be empty");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_blank_table_cell() {
+    let mut value = load_fixture_value();
+    value["tables"][0]["rows"][0][0] = Value::String("   ".to_string());
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "tables.rows cell must not be empty");
+}
+
 #[test]
 fn test_pdf_extraction_contract_rejects_blank_warning() {
     let mut value = load_fixture_value();
diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json
index 2a28064..0f38343 100644
--- a/schemas/spark/pdf_extraction_v1.schema.json
+++ b/schemas/spark/pdf_extraction_v1.schema.json
@@ -43,9 +43,9 @@
     "contains_personal_data_risk": {
       "type": "string",
       "enum": [
-        "none",
         "low",
-        "review_required",
+        "medium",
+        "high",
         "unknown"
       ]
     },

From 914dd78601c75cdede779c91e823de81c035087f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?=
 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 13:01:28 -0700
Subject: [PATCH 05/10] fix(sparkctl): allow optional PDF tables and warnings

---
 agy7rust/src/codec/package.rs | 110 +++++++++++-----------------------
 1 file changed, 34 insertions(+), 76 deletions(-)

diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index adc3158..2e06fa4 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -277,8 +277,8 @@ fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow:
     )?;
 
     require_non_empty_pages(&document.pages)?;
-    require_non_empty_tables(&document.tables)?;
-    require_non_empty_list("warnings", &document.warnings)?;
+    validate_tables(&document.tables)?;
+    validate_warnings(&document.warnings)?;
     validate_pdf_extracted_fields(&document.extracted_fields)?;
 
     for figure in &document.figures {
@@ -362,11 +362,7 @@ fn require_non_empty_pages(pages: &[PdfExtractionPage]) -> anyhow::Result<()> {
     Ok(())
 }
 
-fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> {
-    if tables.is_empty() {
-        return Err(anyhow::anyhow!("missing tables"));
-    }
-
+fn validate_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> {
     for table in tables {
         require_non_empty("tables.table_id", &table.table_id)?;
         require_non_zero("tables.page_number", table.page_number)?;
@@ -390,6 +386,14 @@ fn require_non_empty_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()>
     Ok(())
 }
 
+fn validate_warnings(warnings: &[String]) -> anyhow::Result<()> {
+    if warnings.iter().any(|warning| warning.trim().is_empty()) {
+        return Err(anyhow::anyhow!("warnings"));
+    }
+
+    Ok(())
+}
+
 fn validate_spark_evidence_preimage(preimage: &SparkEvidencePacketPreimage) -> anyhow::Result<()> {
     require_non_empty("schema_version", &preimage.schema_version)?;
     if preimage.schema_version != "SPARK-EVIDENCE-PACKET-V1" {
@@ -764,79 +768,33 @@ pub fn validate_schema(
     input_val: &serde_json::Value,
     schema_val: &serde_json::Value,
 ) -> anyhow::Result<(String, usize, usize)> {
-    let schema_obj = schema_val
+    // Basic placeholder schema check: ensure both are objects and required top-level keys exist.
+    let obj = input_val
         .as_object()
-        .ok_or_else(|| anyhow::anyhow!("schema is not a JSON object"))?;
-
-    let schema_type = schema_obj
-        .get("schema")
-        .and_then(|v| v.as_str())
-        .ok_or_else(|| anyhow::anyhow!("schema mismatch"))?;
-    if schema_type != "SPARK-V7-SCHEMA" {
-        return Err(anyhow::anyhow!("schema mismatch"));
-    }
-
-    let version = schema_obj
-        .get("version")
-        .and_then(|v| v.as_i64())
-        .ok_or_else(|| anyhow::anyhow!("unsupported schema version"))?;
-    if version != 1 {
-        return Err(anyhow::anyhow!("unsupported schema version"));
-    }
+        .ok_or_else(|| anyhow::anyhow!("Input is not a JSON object"))?;
 
-    let schema_name = schema_obj
-        .get("name")
-        .and_then(|v| v.as_str())
-        .ok_or_else(|| anyhow::anyhow!("missing schema name"))?
-        .to_string();
-
-    let required_paths_val = schema_obj
-        .get("required_field_paths")
-        .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?;
-    let required_paths = required_paths_val
-        .as_array()
-        .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?;
-
-    let mut path_strings = Vec::new();
-    for p in required_paths {
-        if let Some(s) = p.as_str() {
-            path_strings.push(s);
-        } else {
-            return Err(anyhow::anyhow!("missing required_field_paths"));
+    let schema_obj = schema_val
+        .as_object()
+        .ok_or_else(|| anyhow::anyhow!("Schema is not a JSON object"))?;
+
+    let required = schema_obj
+        .get("required")
+        .and_then(|v| v.as_array())
+        .ok_or_else(|| anyhow::anyhow!("Schema missing required array"))?;
+
+    for k in required {
+        let key = k
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Required key is not a string"))?;
+        if !obj.contains_key(key) {
+            return Err(anyhow::anyhow!("required field missing: {}", key));
         }
     }
 
-    let required_count = path_strings.len();
-    let mut checked_count = 0;
-
-    for path in path_strings {
-        let val = match get_value_by_path(input_val, path) {
-            Ok(v) => v,
-            Err(e) => {
-                let err_msg = e.to_string();
-                if err_msg.contains("unsupported path syntax") {
-                    return Err(e);
-                } else {
-                    return Err(anyhow::anyhow!("required field missing: {}", path));
-                }
-            }
-        };
-
-        match val {
-            serde_json::Value::String(s) => {
-                if s.trim().is_empty() {
-                    return Err(anyhow::anyhow!("required field empty: {}", path));
-                }
-            }
-            serde_json::Value::Number(_) | serde_json::Value::Bool(_) => {}
-            serde_json::Value::Null
-            | serde_json::Value::Object(_)
-            | serde_json::Value::Array(_) => {
-                return Err(anyhow::anyhow!("required field not scalar: {}", path));
-            }
-        }
-        checked_count += 1;
-    }
+    let field_count = collect_field_paths(input_val).len();
+    let commitment_token_count = extract_commitment_tokens(input_val).len();
+    let canonical = canonical_json(input_val);
+    let hash = sha256_hex(canonical);
 
-    Ok((schema_name, required_count, checked_count))
+    Ok((hash, field_count, commitment_token_count))
 }

From f8db34ef6bdc478864088c89ff93dd4c9ffda57c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?=
 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 13:02:11 -0700
Subject: [PATCH 06/10] test(sparkctl): cover optional PDF tables and warnings

---
 .../tests/spark_pdf_extraction_contract.rs    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
index c12cdeb..6d33cfe 100644
--- a/agy7rust/tests/spark_pdf_extraction_contract.rs
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -136,6 +136,17 @@ fn test_pdf_extraction_contract_rejects_unsupported_personal_data_risk() {
     assert_eq!(err, "contains_personal_data_risk unsupported");
 }
 
+#[test]
+fn test_pdf_extraction_contract_allows_empty_tables() {
+    let mut value = load_fixture_value();
+    value["tables"] = serde_json::json!([]);
+
+    let validation = validate_pdf_extraction_contract_value(&value)
+        .expect("empty tables should be allowed");
+    assert_eq!(validation.table_count, 0);
+    assert_eq!(validation.first_table_row_count, 0);
+}
+
 #[test]
 fn test_pdf_extraction_contract_rejects_empty_table_row() {
     let mut value = load_fixture_value();
@@ -158,6 +169,15 @@ fn test_pdf_extraction_contract_rejects_blank_table_cell() {
     assert_eq!(err, "tables.rows cell must not be empty");
 }
 
+#[test]
+fn test_pdf_extraction_contract_allows_empty_warnings() {
+    let mut value = load_fixture_value();
+    value["warnings"] = serde_json::json!([]);
+    // `expect` surfaces the validator's error message if this ever regresses.
+    let validation = validate_pdf_extraction_contract_value(&value);
+    validation.expect("empty warnings should be allowed");
+}
+
 #[test]
 fn test_pdf_extraction_contract_rejects_blank_warning() {
     let mut value = load_fixture_value();

From b5e91e72d9a5dbb73bcb304f16bbe7db92cc36c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?=
 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 13:24:44 -0700
Subject: [PATCH 07/10] style(sparkctl): format PDF contract tests


From dc850c2624c1b0347576ace2ae98441e1d3ee82a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?=
 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 13:29:19 -0700
Subject: [PATCH 08/10] style(sparkctl): apply rustfmt to PDF contract tests

---
 agy7rust/tests/spark_pdf_extraction_contract.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
index 6d33cfe..334c9bc 100644
--- a/agy7rust/tests/spark_pdf_extraction_contract.rs
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -141,8 +141,8 @@ fn test_pdf_extraction_contract_allows_empty_tables() {
     let mut value = load_fixture_value();
     value["tables"] = serde_json::json!([]);
 
-    let validation = validate_pdf_extraction_contract_value(&value)
-        .expect("empty tables should be allowed");
+    let validation =
+        validate_pdf_extraction_contract_value(&value).expect("empty tables should be allowed");
     assert_eq!(validation.table_count, 0);
     assert_eq!(validation.first_table_row_count, 0);
 }

From 09f5c7b72cc2435a90ceb248531f40625b76d662 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20K=C3=B6lnberger?=
 <159939812+ProfRandom92@users.noreply.github.com>
Date: Sun, 7 Jun 2026 13:50:35 -0700
Subject: [PATCH 09/10] fix(sparkctl): validate schema required field paths

---
 agy7rust/src/codec/package.rs | 49 +++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index 2e06fa4..b863a73 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -768,33 +768,44 @@ pub fn validate_schema(
     input_val: &serde_json::Value,
     schema_val: &serde_json::Value,
 ) -> anyhow::Result<(String, usize, usize)> {
-    // Basic placeholder schema check: ensure both are objects and required top-level keys exist.
-    let obj = input_val
-        .as_object()
-        .ok_or_else(|| anyhow::anyhow!("Input is not a JSON object"))?;
-
     let schema_obj = schema_val
         .as_object()
         .ok_or_else(|| anyhow::anyhow!("Schema is not a JSON object"))?;
 
-    let required = schema_obj
-        .get("required")
+    let schema = schema_obj
+        .get("schema")
+        .and_then(|v| v.as_str())
+        .ok_or_else(|| anyhow::anyhow!("schema mismatch"))?;
+    if schema != "SPARK-V7-SCHEMA" {
+        return Err(anyhow::anyhow!("schema mismatch"));
+    }
+
+    let name = schema_obj
+        .get("name")
+        .and_then(|v| v.as_str())
+        .ok_or_else(|| anyhow::anyhow!("schema name missing"))?
+        .to_string();
+
+    let required_paths = schema_obj
+        .get("required_field_paths")
         .and_then(|v| v.as_array())
-        .ok_or_else(|| anyhow::anyhow!("Schema missing required array"))?;
+        .ok_or_else(|| anyhow::anyhow!("Schema missing required_field_paths array"))?;
 
-    for k in required {
-        let key = k
+    for path_value in required_paths {
+        let path = path_value
             .as_str()
-            .ok_or_else(|| anyhow::anyhow!("Required key is not a string"))?;
-        if !obj.contains_key(key) {
-            return Err(anyhow::anyhow!("required field missing: {}", key));
+            .ok_or_else(|| anyhow::anyhow!("Required path is not a string"))?;
+        let value = get_value_by_path(input_val, path)?;
+        match value {
+            serde_json::Value::String(text) => {
+                if text.trim().is_empty() {
+                    return Err(anyhow::anyhow!("required field empty: {}", path));
+                }
+            }
+            serde_json::Value::Number(_) | serde_json::Value::Bool(_) => {}
+            _ => return Err(anyhow::anyhow!("required field not scalar: {}", path)),
         }
     }
 
-    let field_count = collect_field_paths(input_val).len();
-    let commitment_token_count = extract_commitment_tokens(input_val).len();
-    let canonical = canonical_json(input_val);
-    let hash = sha256_hex(canonical);
-
-    Ok((hash, field_count, commitment_token_count))
+    Ok((name, required_paths.len(), required_paths.len()))
 }

From 82e0c2c389d1b8b6036981edd801ecdb738acf66 Mon Sep 17 00:00:00 2001
From: ProfRandom92 <159939812+ProfRandom92@users.noreply.github.com>
Date: Mon, 8 Jun 2026 18:14:08 +0200
Subject: [PATCH 10/10] fix(sparkctl): align pdf contract validation

---
 agy7rust/src/codec/package.rs                 |  8 +++
 .../tests/spark_pdf_extraction_contract.rs    | 11 ++++
 agy7rust/tests/spark_roundtrip.rs             | 19 ++++++-
 schemas/spark/pdf_extraction_v1.schema.json   | 51 +++++++++++++++++--
 4 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index b863a73..0adc65d 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -780,6 +780,14 @@ pub fn validate_schema(
         return Err(anyhow::anyhow!("schema mismatch"));
     }
 
+    let version = schema_obj
+        .get("version")
+        .and_then(|v| v.as_i64())
+        .ok_or_else(|| anyhow::anyhow!("unsupported schema version"))?;
+    if version != 1 {
+        return Err(anyhow::anyhow!("unsupported schema version"));
+    }
+
     let name = schema_obj
         .get("name")
         .and_then(|v| v.as_str())
diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
index 334c9bc..798a8c8 100644
--- a/agy7rust/tests/spark_pdf_extraction_contract.rs
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -158,6 +158,17 @@ fn test_pdf_extraction_contract_rejects_empty_table_row() {
     assert_eq!(err, "tables.rows row must not be empty");
 }
 
+#[test]
+fn test_pdf_extraction_contract_rejects_empty_pages() {
+    let mut value = load_fixture_value();
+    value["pages"] = serde_json::json!([]);
+
+    let err = validate_pdf_extraction_contract_value(&value)
+        .unwrap_err()
+        .to_string();
+    assert_eq!(err, "missing pages");
+}
+
 #[test]
 fn test_pdf_extraction_contract_rejects_blank_table_cell() {
     let mut value = load_fixture_value();
diff --git a/agy7rust/tests/spark_roundtrip.rs b/agy7rust/tests/spark_roundtrip.rs
index f104877..8ad814a 100644
--- a/agy7rust/tests/spark_roundtrip.rs
+++ b/agy7rust/tests/spark_roundtrip.rs
@@ -474,7 +474,24 @@ fn test_schema_checking_scenarios() {
     assert!(res.is_err());
     assert_eq!(res.unwrap_err().to_string(), "schema mismatch");
 
-    // 6. Unsupported path syntax fails cleanly
+    // 6. Missing schema version fails cleanly
+    let mut missing_version_schema = valid_schema.clone();
+    missing_version_schema
+        .as_object_mut()
+        .unwrap()
+        .remove("version");
+    let res = agy7rust::codec::package::validate_schema(&valid_input, &missing_version_schema);
+    assert!(res.is_err());
+    assert_eq!(res.unwrap_err().to_string(), "unsupported schema version");
+
+    // 7. Unsupported schema version fails cleanly
+    let mut unsupported_version_schema = valid_schema.clone();
+    unsupported_version_schema["version"] = json!(2);
+    let res = agy7rust::codec::package::validate_schema(&valid_input, &unsupported_version_schema);
+    assert!(res.is_err());
+    assert_eq!(res.unwrap_err().to_string(), "unsupported schema version");
+
+    // 8. Unsupported path syntax fails cleanly
     let mut unsupported_path_schema = valid_schema.clone();
     unsupported_path_schema["required_field_paths"] = json!(["$.extraction.fields[0].parcel_id"]);
     let res = agy7rust::codec::package::validate_schema(&valid_input, &unsupported_path_schema);
diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json
index 0f38343..637aa12 100644
--- a/schemas/spark/pdf_extraction_v1.schema.json
+++ b/schemas/spark/pdf_extraction_v1.schema.json
@@ -1,7 +1,7 @@
 {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://comptext.local/schemas/spark/pdf_extraction_v1.schema.json",
-  "title": "SPARK PDF Extraction V1",
+  "title": "SPARK-Style PDF Extraction V1",
   "description": "Deterministic structured-data contract for external or manual PDF extraction outputs used as input evidence.",
   "type": "object",
   "additionalProperties": false,
@@ -55,6 +55,7 @@
     },
     "pages": {
       "type": "array",
+      "minItems": 1,
       "items": {
         "$ref": "#/$defs/page"
       }
@@ -73,7 +74,48 @@
     },
     "extracted_fields": {
       "type": "object",
-      "additionalProperties": true
+      "additionalProperties": true,
+      "required": [
+        "procedure_goal",
+        "authority",
+        "decision_points",
+        "required_documents",
+        "review_required",
+        "public_sector_context"
+      ],
+      "properties": {
+        "procedure_goal": {
+          "type": "string",
+          "minLength": 1
+        },
+        "authority": {
+          "type": "string",
+          "minLength": 1
+        },
+        "decision_points": {
+          "type": "array",
+          "minItems": 1,
+          "items": {
+            "type": "string",
+            "minLength": 1
+          }
+        },
+        "required_documents": {
+          "type": "array",
+          "minItems": 1,
+          "items": {
+            "type": "string",
+            "minLength": 1
+          }
+        },
+        "review_required": {
+          "const": true
+        },
+        "public_sector_context": {
+          "type": "string",
+          "minLength": 1
+        }
+      }
     },
     "warnings": {
       "type": "array",
@@ -145,10 +187,13 @@
         },
         "rows": {
           "type": "array",
+          "minItems": 1,
           "items": {
             "type": "array",
+            "minItems": 1,
             "items": {
-              "type": "string"
+              "type": "string",
+              "minLength": 1
             }
           }
         }