diff --git a/.agents/skills/pdf-extraction-contracts/SKILL.md b/.agents/skills/pdf-extraction-contracts/SKILL.md new file mode 100644 index 0000000..995e4b3 --- /dev/null +++ b/.agents/skills/pdf-extraction-contracts/SKILL.md @@ -0,0 +1,91 @@ +# Skill: PDF Extraction Contracts + +## Purpose + +Guide work on `PDF-EXTRACTION-V1` structured-data artifacts for SPARK-like administrative workflows. + +## Use This Skill When + +- Adding or reviewing PDF extraction fixtures. +- Updating `schemas/spark/pdf_extraction_v1.schema.json`. +- Validating external or manual extraction JSON before evidence packaging. +- Connecting PDF extraction artifacts to Context Pack or Evidence Packet workflows. + +## Contract Boundary + +`PDF-EXTRACTION-V1` is an adapter contract. It accepts structured JSON from manual or external extraction tools as input evidence. + +It does not: + +- implement OCR +- parse PDFs +- download source PDFs +- call providers +- create a Codex plugin bundle +- create an MCP server +- create hooks or commands +- claim official OpenAI plugin compatibility +- claim official SPARK compatibility +- replace human review + +## Required Fields + +Every artifact must include: + +- `schema_version` +- `source_file` +- `document_type` +- `pages` +- `tables` +- `figures` +- `extracted_fields` +- `warnings` +- `tool_metadata` + +`schema_version` must be `PDF-EXTRACTION-V1`. + +`tool_metadata.converter` must be one of: + +- `manual` +- `docling` +- `mineru` +- `marker` +- `pdftotext` +- `other` + +`tool_metadata.extraction_mode` must be one of: + +- `synthetic_fixture` +- `manual_fixture` +- `external_tool` + +## Fixture Rules + +Synthetic fixtures must not include protected personal data, real SPARK data, real Daimler data, real medical data, or real ePA data. + +Do not commit source PDFs unless a future task explicitly approves that artifact and license boundary. 
+ +## Validation + +Prefer local runtime validation with `validate_pdf_extraction_contract_value` and deterministic canonical hashing with the existing `canonical_json` and `sha256_hex` helpers. + +Report Agent Governor gate states using exactly one of: + +- `pass` +- `fail` +- `not_applicable` +- `deferred` + +Use `not_applicable` only when a gate does not apply, and explain why. Use `deferred` when the gate is required but intentionally left for later human/tool review. + +For Rust changes, run: + +- `cargo fmt --all --check` +- `cargo test` +- `cargo clippy --all-targets --all-features -- -D warnings` + +## Claim Boundaries + +Use bounded wording: adapter contract, structured input evidence, manual fixture, external-tool output, review input, artifact manifest. + +Do not claim production readiness, compliance or certification, legal evidentiary status, forensic certainty, official SPARK compatibility, official OpenAI plugin compatibility, autonomous approval, or guaranteed correctness. 
diff --git a/agy7rust/src/codec/package.rs b/agy7rust/src/codec/package.rs
index 584c4e8..0adc65d 100644
--- a/agy7rust/src/codec/package.rs
+++ b/agy7rust/src/codec/package.rs
@@ -1,6 +1,7 @@
 use crate::codec::hash::sha256_hex;
 use serde::{Deserialize, Serialize};
 use serde_json;
+use std::collections::BTreeMap;
 
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum PolicyResult {
@@ -76,6 +77,88 @@ pub struct SparkEvidencePacketEnvelope {
     pub canonical_hash: String,
 }
 
+/// Top-level `PDF-EXTRACTION-V1` artifact; unknown top-level keys fail deserialization.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionDocument {
+    pub schema_version: String,
+    pub source_file: String,
+    pub source_sha256: Option<String>,
+    pub source_url: Option<String>,
+    pub license_or_usage_note: Option<String>,
+    pub sanitization_status: Option<String>,
+    pub contains_personal_data_risk: Option<String>,
+    pub document_type: String,
+    pub pages: Vec<PdfExtractionPage>,
+    pub tables: Vec<PdfExtractionTable>,
+    pub figures: Vec<PdfExtractionFigure>,
+    pub extracted_fields: PdfExtractedFields,
+    pub warnings: Vec<String>,
+    pub tool_metadata: PdfExtractionToolMetadata,
+}
+
+/// Per-page entry: page number, text summary, and optional field references.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionPage {
+    pub page_number: u64,
+    pub text_summary: String,
+    pub field_refs: Option<Vec<String>>,
+}
+
+/// Table entry: column names plus rows of string cells.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionTable {
+    pub table_id: String,
+    pub page_number: u64,
+    pub caption: String,
+    pub columns: Vec<String>,
+    pub rows: Vec<Vec<String>>,
+}
+
+/// Figure descriptor: id, page number, and a textual description.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionFigure {
+    pub figure_id: String,
+    pub page_number: u64,
+    pub description: String,
+}
+
+/// Extracted fields; keys beyond the named ones are collected into `additional_fields`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct PdfExtractedFields {
+    pub procedure_goal: String,
+    pub authority: String,
+    pub decision_points: Vec<String>,
+    pub required_documents: Vec<String>,
+    pub review_required: bool,
+    pub public_sector_context: String,
+    #[serde(flatten)]
+    pub additional_fields: BTreeMap<String, serde_json::Value>,
+}
+
+/// Converter name, version, extraction mode, and optional notes for an artifact.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct PdfExtractionToolMetadata {
+    pub converter: String,
+    pub converter_version: String,
+    pub extraction_mode: String,
+    pub notes: Option<String>,
+}
+
+/// Summary returned by [`validate_pdf_extraction_contract_value`] for an accepted document.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PdfExtractionValidation {
+    pub canonical_json: String,
+    pub canonical_hash: String,
+    pub page_count: usize,
+    pub table_count: usize,
+    pub first_table_row_count: usize,
+}
+
 pub fn sort_json_value(value: &serde_json::Value) -> serde_json::Value {
     match value {
         serde_json::Value::Object(map) => {
@@ -140,6 +223,220 @@ pub fn validate_spark_evidence_packet_value(value: &serde_json::Value) -> anyhow
     validate_spark_evidence_packet_envelope(&envelope)
 }
 
+/// Validates a JSON value against the `PDF-EXTRACTION-V1` contract and summarizes it.
+///
+/// The value is deserialized into [`PdfExtractionDocument`], passed through the field-level
+/// checks, and hashed with `sha256_hex` over its `canonical_json` form.
+///
+/// # Errors
+///
+/// Fails when deserialization fails (for example missing fields, unknown top-level fields,
+/// or wrong types) or when a field-level check rejects the document.
+pub fn validate_pdf_extraction_contract_value(
+    value: &serde_json::Value,
+) -> anyhow::Result<PdfExtractionValidation> {
+    // `&Value` implements serde's `Deserializer`, so the input is read without cloning it.
+    let document = PdfExtractionDocument::deserialize(value)?;
+    validate_pdf_extraction_document(&document)?;
+
+    let canonical = canonical_json(value);
+    let canonical_hash = sha256_hex(&canonical);
+    let first_table_row_count = document.tables.first().map_or(0, |table| table.rows.len());
+
+    Ok(PdfExtractionValidation {
+        canonical_json: canonical,
+        canonical_hash,
+        page_count: document.pages.len(),
+        table_count: document.tables.len(),
+        first_table_row_count,
+    })
+}
+
+/// Applies the field-level `PDF-EXTRACTION-V1` checks to a deserialized document.
+///
+/// Checks stop at the first failure, so the error names only the first offending field.
+fn validate_pdf_extraction_document(document: &PdfExtractionDocument) -> anyhow::Result<()> {
+    require_exact(
+        "schema_version",
+        &document.schema_version,
+        "PDF-EXTRACTION-V1",
+    )?;
+    require_non_empty("source_file", &document.source_file)?;
+    if let Some(risk) = &document.contains_personal_data_risk {
+        require_allowed(
+            "contains_personal_data_risk",
+            risk,
+            &["low", "medium", "high", "unknown"],
+        )?;
+    }
+    require_non_empty("document_type", &document.document_type)?;
+    require_allowed(
+        "tool_metadata.converter",
+        &document.tool_metadata.converter,
+        &[
+            "manual",
+            "docling",
+            "mineru",
+            "marker",
+            "pdftotext",
+            "other",
+        ],
+    )?;
+    require_non_empty(
+        "tool_metadata.converter_version",
+        &document.tool_metadata.converter_version,
+    )?;
+    require_allowed(
+        "tool_metadata.extraction_mode",
+        &document.tool_metadata.extraction_mode,
+        &["synthetic_fixture", "manual_fixture", "external_tool"],
+    )?;
+
+    require_non_empty_pages(&document.pages)?;
+    validate_tables(&document.tables)?;
+    validate_warnings(&document.warnings)?;
+    validate_pdf_extracted_fields(&document.extracted_fields)?;
+
+    for figure in &document.figures {
+        require_non_empty("figures.figure_id", &figure.figure_id)?;
+        require_non_zero("figures.page_number", figure.page_number)?;
+        require_non_empty("figures.description", &figure.description)?;
+    }
+
+    if let Some(hash) = &document.source_sha256 {
+        validate_sha256_hex("source_sha256", hash)?;
+    }
+
+    // The JSON schema gives these optional strings `minLength: 1`, so a present-but-blank
+    // value is rejected here as well. Checked last so earlier failures keep their messages.
+    for (label, value) in [
+        ("source_url", &document.source_url),
+        ("license_or_usage_note", &document.license_or_usage_note),
+        ("sanitization_status", &document.sanitization_status),
+    ] {
+        if let Some(text) = value {
+            require_non_empty(label, text)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Checks the named `extracted_fields` entries; `additional_fields` is not inspected.
+///
+/// `review_required` must be `true`, matching the schema's `const: true`.
+fn validate_pdf_extracted_fields(fields: &PdfExtractedFields) -> anyhow::Result<()> {
+    require_non_empty("extracted_fields.procedure_goal", &fields.procedure_goal)?;
+    require_non_empty("extracted_fields.authority", &fields.authority)?;
+    require_non_empty_list("extracted_fields.decision_points", &fields.decision_points)?;
+    require_non_empty_list(
+        "extracted_fields.required_documents",
+        &fields.required_documents,
+    )?;
+    if !fields.review_required {
+        return Err(anyhow::anyhow!(
+            "PDF extraction extracted_fields.review_required must be true"
+        ));
+    }
+    require_non_empty(
+        "extracted_fields.public_sector_context",
+        &fields.public_sector_context,
+    )?;
+
+    Ok(())
+}
+
+/// Fails with `"{label} mismatch"` unless `value` equals `expected`.
+fn require_exact(label: &str, value: &str, expected: &str) -> anyhow::Result<()> {
+    if value != expected {
+        return Err(anyhow::anyhow!("{} mismatch", label));
+    }
+    Ok(())
+}
+
+/// Fails with `"{label} unsupported"` unless `value` is one of `allowed`.
+fn require_allowed(label: &str, value: &str, allowed: &[&str]) -> anyhow::Result<()> {
+    if !allowed.contains(&value) {
+        return Err(anyhow::anyhow!("{} unsupported", label));
+    }
+    Ok(())
+}
+
+/// Fails with `"{label} must be greater than zero"` when `value` is 0.
+fn require_non_zero(label: &str, value: u64) -> anyhow::Result<()> {
+    if value == 0 {
+        return Err(anyhow::anyhow!("{} must be greater than zero", label));
+    }
+    Ok(())
+}
+
+/// Accepts exactly 64 characters from `0-9` and `a-f`; uppercase hex digits are rejected.
+fn validate_sha256_hex(label: &str, value: &str) -> anyhow::Result<()> {
+    // Every accepted byte is ASCII, so the byte length checked here equals the character count.
+    let is_lowercase_hex =
+        value.len() == 64 && value.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'));
+    if !is_lowercase_hex {
+        return Err(anyhow::anyhow!("{} must be lowercase SHA-256 hex", label));
+    }
+    Ok(())
+}
+
+/// Requires at least one page and checks each page's number, summary and `field_refs`.
+fn require_non_empty_pages(pages: &[PdfExtractionPage]) -> anyhow::Result<()> {
+    if pages.is_empty() {
+        return Err(anyhow::anyhow!("missing pages"));
+    }
+
+    for page in pages {
+        require_non_zero("pages.page_number", page.page_number)?;
+        require_non_empty("pages.text_summary", &page.text_summary)?;
+        if let Some(field_refs) = &page.field_refs {
+            // NOTE(review): the JSON schema sets no `minItems` on `field_refs`, while this
+            // call appears to reject an empty list; confirm which side is intended.
+            require_non_empty_list("pages.field_refs", field_refs)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Checks every table: non-empty id, caption and columns, a positive page number, and at
+/// least one row whose cells are all non-blank. An empty `tables` slice passes.
+fn validate_tables(tables: &[PdfExtractionTable]) -> anyhow::Result<()> {
+    for table in tables {
+        require_non_empty("tables.table_id", &table.table_id)?;
+        require_non_zero("tables.page_number", table.page_number)?;
+        require_non_empty("tables.caption", &table.caption)?;
+        require_non_empty_list("tables.columns", &table.columns)?;
+        if table.rows.is_empty() {
+            return Err(anyhow::anyhow!("missing tables.rows"));
+        }
+        // NOTE(review): row width is never compared with `columns.len()`, so ragged rows
+        // pass; confirm whether that is acceptable for this contract.
+        for row in &table.rows {
+            if row.is_empty() {
+                return Err(anyhow::anyhow!("tables.rows row must not be empty"));
+            }
+            for cell in row {
+                if cell.trim().is_empty() {
+                    return Err(anyhow::anyhow!("tables.rows cell must not be empty"));
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Rejects blank warning entries (an empty list passes); the error text is just `warnings`.
+fn validate_warnings(warnings: &[String]) -> anyhow::Result<()> {
+    if warnings.iter().any(|warning| warning.trim().is_empty()) {
+        return Err(anyhow::anyhow!("warnings"));
+    }
+
+    Ok(())
+}
+
 fn validate_spark_evidence_preimage(preimage: &SparkEvidencePacketPreimage) -> anyhow::Result<()> {
require_non_empty("schema_version", &preimage.schema_version)?;
     if preimage.schema_version != "SPARK-EVIDENCE-PACKET-V1" {
@@ -516,13 +770,13 @@ pub fn validate_schema(
 ) -> anyhow::Result<(String, usize, usize)> {
     let schema_obj = schema_val
         .as_object()
         .ok_or_else(|| anyhow::anyhow!("schema is not a JSON object"))?;
 
-    let schema_type = schema_obj
+    let schema = schema_obj
         .get("schema")
         .and_then(|v| v.as_str())
         .ok_or_else(|| anyhow::anyhow!("schema mismatch"))?;
-    if schema_type != "SPARK-V7-SCHEMA" {
+    if schema != "SPARK-V7-SCHEMA" {
         return Err(anyhow::anyhow!("schema mismatch"));
     }
 
@@ -534,59 +788,46 @@ pub fn validate_schema(
         return Err(anyhow::anyhow!("unsupported schema version"));
     }
 
-    let schema_name = schema_obj
+    let name = schema_obj
         .get("name")
         .and_then(|v| v.as_str())
         .ok_or_else(|| anyhow::anyhow!("missing schema name"))?
         .to_string();
 
-    let required_paths_val = schema_obj
+    // Resolve every entry up front so a malformed schema is reported before any input field
+    // is read; a missing key, a non-array value and a non-string entry share one message.
+    let required_paths = schema_obj
         .get("required_field_paths")
-        .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?;
-    let required_paths = required_paths_val
-        .as_array()
-        .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?;
-
-    let mut path_strings = Vec::new();
-    for p in required_paths {
-        if let Some(s) = p.as_str() {
-            path_strings.push(s);
-        } else {
-            return Err(anyhow::anyhow!("missing required_field_paths"));
-        }
-    }
-
-    let required_count = path_strings.len();
-    let mut checked_count = 0;
-
-    for path in path_strings {
-        let val = match get_value_by_path(input_val, path) {
-            Ok(v) => v,
-            Err(e) => {
-                let err_msg = e.to_string();
-                if err_msg.contains("unsupported path syntax") {
-                    return Err(e);
-                } else {
-                    return Err(anyhow::anyhow!("required field missing: {}", path));
-                }
-            }
-        };
-
-        match val {
-            serde_json::Value::String(s) => {
-                if s.trim().is_empty() {
+        .and_then(|v| v.as_array())
+        .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))?
+        .iter()
+        .map(|entry| {
+            entry
+                .as_str()
+                .ok_or_else(|| anyhow::anyhow!("missing required_field_paths"))
+        })
+        .collect::<anyhow::Result<Vec<_>>>()?;
+
+    for &path in &required_paths {
+        // Path-syntax errors pass through unchanged; any other lookup failure is reported as
+        // a missing required field so the error text stays stable for callers.
+        let value = get_value_by_path(input_val, path).map_err(|err| {
+            if err.to_string().contains("unsupported path syntax") {
+                err
+            } else {
+                anyhow::anyhow!("required field missing: {}", path)
+            }
+        })?;
+        match value {
+            serde_json::Value::String(text) => {
+                if text.trim().is_empty() {
                     return Err(anyhow::anyhow!("required field empty: {}", path));
                 }
             }
             serde_json::Value::Number(_) | serde_json::Value::Bool(_) => {}
-            serde_json::Value::Null
-            | serde_json::Value::Object(_)
-            | serde_json::Value::Array(_) => {
-                return Err(anyhow::anyhow!("required field not scalar: {}", path));
-            }
+            _ => return Err(anyhow::anyhow!("required field not scalar: {}", path)),
         }
-        checked_count += 1;
     }
 
-    Ok((schema_name, required_count, checked_count))
+    Ok((name, required_paths.len(), required_paths.len()))
 }
diff --git a/agy7rust/src/lib.rs b/agy7rust/src/lib.rs
index 5a6e975..04c3e24 100644
--- a/agy7rust/src/lib.rs
+++ b/agy7rust/src/lib.rs
@@ -9,8 +9,10 @@ pub use codec::hash::sha256_hex;
 pub use codec::package::{
     build_package_from_value, build_spark_evidence_packet_envelope, canonical_json,
     collect_field_paths, extract_commitment_tokens, get_value_by_path, replay_package_value,
-    sort_json_value, validate_schema, validate_spark_evidence_packet_envelope,
-    validate_spark_evidence_packet_value, verify_package_value, ArtifactManifestEntry,
-    ClaimHygiene, HumanReviewDecision, PolicyResult, ProviderBoundaryStatus,
-    SparkEvidencePacketEnvelope, SparkEvidencePacketPreimage,
+    sort_json_value, validate_pdf_extraction_contract_value, validate_schema,
+    validate_spark_evidence_packet_envelope, validate_spark_evidence_packet_value,
+    verify_package_value, ArtifactManifestEntry, ClaimHygiene, HumanReviewDecision,
+    PdfExtractedFields, PdfExtractionDocument, PdfExtractionFigure, PdfExtractionPage,
+    PdfExtractionTable, PdfExtractionToolMetadata, PdfExtractionValidation, PolicyResult,
+    ProviderBoundaryStatus,
SparkEvidencePacketEnvelope, SparkEvidencePacketPreimage,
 };
diff --git a/agy7rust/tests/spark_pdf_extraction_contract.rs b/agy7rust/tests/spark_pdf_extraction_contract.rs
new file mode 100644
index 0000000..798a8c8
--- /dev/null
+++ b/agy7rust/tests/spark_pdf_extraction_contract.rs
@@ -0,0 +1,254 @@
+//! Contract tests for `validate_pdf_extraction_contract_value` using the synthetic fixture.
+
+use agy7rust::validate_pdf_extraction_contract_value;
+use serde_json::Value;
+use std::fs;
+
+/// Fixture path anchored at the crate root, so the tests do not depend on the working
+/// directory of the test process.
+const FIXTURE_PATH: &str = concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/../examples/spark/pdf_extraction_fixture.json"
+);
+
+#[test]
+fn test_pdf_extraction_fixture_contract_shape() {
+    let value = load_fixture_value();
+    let validation =
+        validate_pdf_extraction_contract_value(&value).expect("fixture contract should validate");
+
+    assert_eq!(value["schema_version"], "PDF-EXTRACTION-V1");
+    assert_non_empty_string(&value["source_file"], "source_file");
+    assert_eq!(value["tool_metadata"]["converter"], "manual");
+    assert_eq!(value["tool_metadata"]["extraction_mode"], "manual_fixture");
+
+    let extracted_fields = &value["extracted_fields"];
+    assert_non_empty_string(
+        &extracted_fields["procedure_goal"],
+        "extracted_fields.procedure_goal",
+    );
+    assert_non_empty_string(&extracted_fields["authority"], "extracted_fields.authority");
+    assert_non_empty_array(
+        &extracted_fields["decision_points"],
+        "extracted_fields.decision_points",
+    );
+    assert_non_empty_array(
+        &extracted_fields["required_documents"],
+        "extracted_fields.required_documents",
+    );
+    assert_eq!(extracted_fields["review_required"], true);
+
+    let tables = value["tables"]
+        .as_array()
+        .expect("tables should be an array");
+    let first_table = tables.first().expect("fixture should include a table");
+    let first_table_rows = first_table["rows"]
+        .as_array()
+        .expect("first table rows should be an array");
+    assert_eq!(first_table_rows.len(), 3);
+
+    assert_eq!(validation.page_count, 2);
+    assert_eq!(validation.table_count, 1);
+    assert_eq!(validation.first_table_row_count, 3);
+    assert_eq!(validation.canonical_hash.len(), 64);
+    assert!(!validation.canonical_json.is_empty());
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_wrong_schema_version() {
+    let mut value = load_fixture_value();
+    value["schema_version"] = Value::String("PDF-EXTRACTION-V0".to_string());
+
+    assert_eq!(validation_error(&value), "schema_version mismatch");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_unknown_top_level_field() {
+    let mut value = load_fixture_value();
+    value
+        .as_object_mut()
+        .expect("fixture should be an object")
+        .insert(
+            "unexpected_field".to_string(),
+            Value::String("tamper".to_string()),
+        );
+
+    assert!(validation_error(&value).contains("unknown field `unexpected_field`"));
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_missing_required_top_level_field() {
+    let mut value = load_fixture_value();
+    value
+        .as_object_mut()
+        .expect("fixture should be an object")
+        .remove("source_file");
+
+    assert!(validation_error(&value).contains("missing field `source_file`"));
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_missing_required_extracted_field() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]
+        .as_object_mut()
+        .expect("extracted_fields should be an object")
+        .remove("procedure_goal");
+
+    assert!(validation_error(&value).contains("missing field `procedure_goal`"));
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_unsupported_converter() {
+    let mut value = load_fixture_value();
+    value["tool_metadata"]["converter"] = Value::String("unsupported".to_string());
+
+    assert_eq!(validation_error(&value), "tool_metadata.converter unsupported");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_unsupported_extraction_mode() {
+    let mut value = load_fixture_value();
+    value["tool_metadata"]["extraction_mode"] = Value::String("unsupported".to_string());
+
+    assert_eq!(validation_error(&value), "tool_metadata.extraction_mode unsupported");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_unsupported_personal_data_risk() {
+    let mut value = load_fixture_value();
+    value["contains_personal_data_risk"] = Value::String("review_required".to_string());
+
+    assert_eq!(validation_error(&value), "contains_personal_data_risk unsupported");
+}
+
+#[test]
+fn test_pdf_extraction_contract_allows_empty_tables() {
+    let mut value = load_fixture_value();
+    value["tables"] = serde_json::json!([]);
+
+    let validation =
+        validate_pdf_extraction_contract_value(&value).expect("empty tables should be allowed");
+    assert_eq!(validation.table_count, 0);
+    assert_eq!(validation.first_table_row_count, 0);
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_empty_table_row() {
+    let mut value = load_fixture_value();
+    value["tables"][0]["rows"][0] = serde_json::json!([]);
+
+    assert_eq!(validation_error(&value), "tables.rows row must not be empty");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_empty_pages() {
+    let mut value = load_fixture_value();
+    value["pages"] = serde_json::json!([]);
+
+    assert_eq!(validation_error(&value), "missing pages");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_blank_table_cell() {
+    let mut value = load_fixture_value();
+    value["tables"][0]["rows"][0][0] = Value::String(" ".to_string());
+
+    assert_eq!(validation_error(&value), "tables.rows cell must not be empty");
+}
+
+#[test]
+fn test_pdf_extraction_contract_allows_empty_warnings() {
+    let mut value = load_fixture_value();
+    value["warnings"] = serde_json::json!([]);
+
+    let validation = validate_pdf_extraction_contract_value(&value);
+    assert!(validation.is_ok());
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_blank_warning() {
+    let mut value = load_fixture_value();
+    value["warnings"] = serde_json::json!(["manual fixture", " "]);
+
+    assert_eq!(validation_error(&value), "warnings");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_blank_procedure_goal() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]["procedure_goal"] = Value::String(" ".to_string());
+
+    assert_eq!(validation_error(&value), "missing extracted_fields.procedure_goal");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_empty_decision_points() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]["decision_points"] = serde_json::json!([]);
+
+    assert_eq!(validation_error(&value), "extracted_fields.decision_points");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_review_required_false() {
+    let mut value = load_fixture_value();
+    value["extracted_fields"]["review_required"] = Value::Bool(false);
+
+    assert_eq!(
+        validation_error(&value),
+        "PDF extraction extracted_fields.review_required must be true"
+    );
+}
+
+#[test]
+fn test_pdf_extraction_contract_accepts_lowercase_source_sha256() {
+    let mut value = load_fixture_value();
+    value["source_sha256"] = Value::String("0123456789abcdef".repeat(4));
+
+    assert!(validate_pdf_extraction_contract_value(&value).is_ok());
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_uppercase_source_sha256() {
+    let mut value = load_fixture_value();
+    value["source_sha256"] = Value::String("0123456789ABCDEF".repeat(4));
+
+    assert_eq!(validation_error(&value), "source_sha256 must be lowercase SHA-256 hex");
+}
+
+#[test]
+fn test_pdf_extraction_contract_rejects_short_source_sha256() {
+    let mut value = load_fixture_value();
+    value["source_sha256"] = Value::String("abc123".to_string());
+
+    assert_eq!(validation_error(&value), "source_sha256 must be lowercase SHA-256 hex");
+}
+
+/// Validates `value`, expecting rejection, and returns the error's display text.
+fn validation_error(value: &Value) -> String {
+    validate_pdf_extraction_contract_value(value)
+        .expect_err("mutated fixture should be rejected")
+        .to_string()
+}
+
+fn assert_non_empty_string(value: &Value, label: &str) {
+    assert!(
+        value.as_str().is_some_and(|text| !text.trim().is_empty()),
+        "{label} should be a non-empty string"
+    );
+}
+
+fn assert_non_empty_array(value: &Value, label: &str) {
+    assert!(
+        value.as_array().is_some_and(|items| !items.is_empty()),
+        "{label} should be a non-empty array"
+    );
+}
+
+/// Reads and parses the shared fixture; each test mutates its own copy.
+fn load_fixture_value() -> Value {
+    let fixture =
+        fs::read_to_string(FIXTURE_PATH).expect("failed to read PDF extraction fixture");
+    serde_json::from_str(&fixture).expect("fixture should parse as JSON")
+}
diff --git a/agy7rust/tests/spark_roundtrip.rs b/agy7rust/tests/spark_roundtrip.rs
index f104877..8ad814a 100644
--- a/agy7rust/tests/spark_roundtrip.rs
+++ b/agy7rust/tests/spark_roundtrip.rs
@@ -474,7 +474,24 @@ fn test_schema_checking_scenarios() {
     assert!(res.is_err());
     assert_eq!(res.unwrap_err().to_string(), "schema mismatch");
 
-    // 6. Unsupported path syntax fails cleanly
+    // 6. Missing schema version fails cleanly
+    let mut missing_version_schema = valid_schema.clone();
+    missing_version_schema
+        .as_object_mut()
+        .unwrap()
+        .remove("version");
+    let res = agy7rust::codec::package::validate_schema(&valid_input, &missing_version_schema);
+    assert!(res.is_err());
+    assert_eq!(res.unwrap_err().to_string(), "unsupported schema version");
+
+    // 7. Unsupported schema version fails cleanly
+    let mut unsupported_version_schema = valid_schema.clone();
+    unsupported_version_schema["version"] = json!(2);
+    let res = agy7rust::codec::package::validate_schema(&valid_input, &unsupported_version_schema);
+    assert!(res.is_err());
+    assert_eq!(res.unwrap_err().to_string(), "unsupported schema version");
+
+    // 8.
Unsupported path syntax fails cleanly let mut unsupported_path_schema = valid_schema.clone(); unsupported_path_schema["required_field_paths"] = json!(["$.extraction.fields[0].parcel_id"]); let res = agy7rust::codec::package::validate_schema(&valid_input, &unsupported_path_schema); diff --git a/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md new file mode 100644 index 0000000..c5f02da --- /dev/null +++ b/docs/use-cases/PDF_TO_EVIDENCE_PACKET.md @@ -0,0 +1,59 @@ +# PDF To Evidence Packet Adapter Contract + +## Purpose + +External or manual PDF extraction can produce a structured JSON artifact for `sparkctl` evidence workflows. `sparkctl` treats that JSON as input evidence, not as truth by itself. + +The `PDF-EXTRACTION-V1` contract gives future converters and manual fixtures a deterministic shape for extracted administrative fields, page summaries, tables, figures, warnings, and converter metadata. + +## Pipeline + +PDF or text fixture -> PDF-EXTRACTION-V1 JSON -> Context Pack / Evidence Packet -> Policy Gate -> Human Review -> Artifact Manifest + +The structured extraction can inform a Context Pack and be recorded in an Evidence Packet. It does not replace the Context Pack, the Policy Gate, or Human Review. + +## Runtime Contract Validation + +The Rust crate exposes local runtime validation for `PDF-EXTRACTION-V1` JSON values. The validator checks the declared fixture contract shape and computes a deterministic SHA-256 hash over canonical JSON with existing helpers. + +This validation is a local contract check. It does not perform OCR, parse PDFs, call providers, or verify that extracted text is true. + +## Boundaries + +- This PR does not implement OCR. +- This PR does not parse PDFs. +- This PR does not include protected SPARK data. +- This PR does not include real Daimler data. +- This PR does not include real medical or ePA data. +- This PR makes no production claim. +- This PR makes no compliance, legal, or forensic claim. 
+- Provider output remains untrusted until reviewed. +- Human review remains required. +- No real source PDF is committed. + +## Converter Strategy + +Docling, MinerU, Marker, pdftotext, and manual processes can be future producers of the same schema. This PR defines only the adapter contract. + +The contract records converter name, converter version, and extraction mode so reviewers can distinguish synthetic fixtures, manual fixtures, and future external-tool outputs. + +## Future Codex-Style Plugin Bundle Readiness + +Future Codex-style plugin bundles could expose skills, commands, hooks, and declared artifacts around this contract. + +This repository does not claim official OpenAI plugin compatibility or plugin-directory availability. A future bundle would still need human review, explicit connector boundaries, declared artifacts, and repository-specific approval before any write-capable action. + +## Method Precedent + +This follows prior CompText privacy-preserving synthetic fixture and replay-contract patterns. The fixture is synthetic and bounded so reviewers can inspect the contract without relying on protected source material. 
+ +## Review Use + +Reviewers should check: + +- the fixture uses `schema_version: PDF-EXTRACTION-V1` +- the source file path is descriptive but no source PDF is committed +- extracted fields are administrative and synthetic +- warnings state the fixture limits +- converter metadata identifies manual fixture preparation +- downstream evidence packets preserve the source and warning context diff --git a/examples/spark/pdf_extraction_fixture.json b/examples/spark/pdf_extraction_fixture.json new file mode 100644 index 0000000..b355775 --- /dev/null +++ b/examples/spark/pdf_extraction_fixture.json @@ -0,0 +1,92 @@ +{ + "schema_version": "PDF-EXTRACTION-V1", + "source_file": "examples/spark/synthetic_bauantrag_fixture.pdf", + "document_type": "synthetic_public_admin_planning_fixture", + "license_or_usage_note": "Synthetic fixture for local contract testing only.", + "sanitization_status": "synthetic_no_real_personal_data", + "contains_personal_data_risk": "low", + "pages": [ + { + "page_number": 1, + "text_summary": "Synthetic building and planning application overview with authority, procedure goal, zoning placeholder, and deadline note.", + "field_refs": [ + "procedure_goal", + "authority", + "public_sector_context" + ] + }, + { + "page_number": 2, + "text_summary": "Synthetic review appendix with environmental note, protected-area placeholder, flood-zone placeholder, required attachments, and decision points.", + "field_refs": [ + "decision_points", + "required_documents", + "review_required" + ] + } + ], + "tables": [ + { + "table_id": "table_required_documents_v1", + "page_number": 2, + "caption": "Synthetic required attachments for planning review", + "columns": [ + "document", + "status", + "review_note" + ], + "rows": [ + [ + "site_plan", + "required", + "Check zoning placeholder reference ZONE-A" + ], + [ + "environmental_note", + "required", + "Review protected-area placeholder reference ENV-P" + ], + [ + "flood_zone_statement", + "conditional", + "Review 
flood-zone placeholder reference FLOOD-X" + ] + ] + } + ], + "figures": [ + { + "figure_id": "figure_site_context_v1", + "page_number": 1, + "description": "Synthetic site-context sketch descriptor; no image or source PDF is committed." + } + ], + "extracted_fields": { + "procedure_goal": "Assess a synthetic building and planning application for review routing and evidence packaging.", + "authority": "Synthetic Municipal Planning Office", + "decision_points": [ + "Confirm zoning placeholder reference ZONE-A is addressed.", + "Check environmental note for protected-area placeholder reference ENV-P.", + "Decide whether flood-zone placeholder reference FLOOD-X requires additional review.", + "Record deadline note before moving to human review." + ], + "required_documents": [ + "site_plan", + "environmental_note", + "flood_zone_statement", + "deadline_note" + ], + "review_required": true, + "public_sector_context": "Synthetic administrative planning workflow for local evidence-packet contract testing." + }, + "warnings": [ + "Manual fixture; not extracted from a real PDF.", + "Synthetic administrative content only; no protected personal data is included." + ], + "tool_metadata": { + "converter": "manual", + "converter_version": "fixture-v1", + "extraction_mode": "manual_fixture", + "notes": "Prepared by hand to exercise the PDF-EXTRACTION-V1 adapter contract." 
+ } +} diff --git a/schemas/spark/pdf_extraction_v1.schema.json b/schemas/spark/pdf_extraction_v1.schema.json new file mode 100644 index 0000000..637aa12 --- /dev/null +++ b/schemas/spark/pdf_extraction_v1.schema.json @@ -0,0 +1,263 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://comptext.local/schemas/spark/pdf_extraction_v1.schema.json", + "title": "SPARK-Style PDF Extraction V1", + "description": "Deterministic structured-data contract for external or manual PDF extraction outputs used as input evidence.", + "type": "object", + "additionalProperties": false, + "required": [ + "schema_version", + "source_file", + "document_type", + "pages", + "tables", + "figures", + "extracted_fields", + "warnings", + "tool_metadata" + ], + "properties": { + "schema_version": { + "const": "PDF-EXTRACTION-V1" + }, + "source_file": { + "type": "string", + "minLength": 1 + }, + "source_sha256": { + "type": "string", + "pattern": "^[a-f0-9]{64}$" + }, + "source_url": { + "type": "string", + "minLength": 1 + }, + "license_or_usage_note": { + "type": "string", + "minLength": 1 + }, + "sanitization_status": { + "type": "string", + "minLength": 1 + }, + "contains_personal_data_risk": { + "type": "string", + "enum": [ + "low", + "medium", + "high", + "unknown" + ] + }, + "document_type": { + "type": "string", + "minLength": 1 + }, + "pages": { + "type": "array", + "minItems": 1, + "items": { + "$ref": "#/$defs/page" + } + }, + "tables": { + "type": "array", + "items": { + "$ref": "#/$defs/table" + } + }, + "figures": { + "type": "array", + "items": { + "$ref": "#/$defs/figure" + } + }, + "extracted_fields": { + "type": "object", + "additionalProperties": true, + "required": [ + "procedure_goal", + "authority", + "decision_points", + "required_documents", + "review_required", + "public_sector_context" + ], + "properties": { + "procedure_goal": { + "type": "string", + "minLength": 1 + }, + "authority": { + "type": "string", + "minLength": 1 + }, + 
"decision_points": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "required_documents": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "review_required": { + "const": true + }, + "public_sector_context": { + "type": "string", + "minLength": 1 + } + } + }, + "warnings": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + }, + "tool_metadata": { + "$ref": "#/$defs/tool_metadata" + } + }, + "$defs": { + "page": { + "type": "object", + "additionalProperties": false, + "required": [ + "page_number", + "text_summary" + ], + "properties": { + "page_number": { + "type": "integer", + "minimum": 1 + }, + "text_summary": { + "type": "string", + "minLength": 1 + }, + "field_refs": { + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + } + } + }, + "table": { + "type": "object", + "additionalProperties": false, + "required": [ + "table_id", + "page_number", + "caption", + "columns", + "rows" + ], + "properties": { + "table_id": { + "type": "string", + "minLength": 1 + }, + "page_number": { + "type": "integer", + "minimum": 1 + }, + "caption": { + "type": "string", + "minLength": 1 + }, + "columns": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + }, + "rows": { + "type": "array", + "minItems": 1, + "items": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "minLength": 1 + } + } + } + } + }, + "figure": { + "type": "object", + "additionalProperties": false, + "required": [ + "figure_id", + "page_number", + "description" + ], + "properties": { + "figure_id": { + "type": "string", + "minLength": 1 + }, + "page_number": { + "type": "integer", + "minimum": 1 + }, + "description": { + "type": "string", + "minLength": 1 + } + } + }, + "tool_metadata": { + "type": "object", + "additionalProperties": false, + "required": [ + "converter", + "converter_version", + 
"extraction_mode" + ], + "properties": { + "converter": { + "type": "string", + "enum": [ + "manual", + "docling", + "mineru", + "marker", + "pdftotext", + "other" + ] + }, + "converter_version": { + "type": "string", + "minLength": 1 + }, + "extraction_mode": { + "type": "string", + "enum": [ + "synthetic_fixture", + "manual_fixture", + "external_tool" + ] + }, + "notes": { + "type": "string" + } + } + } + } +}