diff --git a/AGENTS.md b/AGENTS.md index b5e655b..c20001e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,7 +14,7 @@ pathrex/ ├── Cargo.toml # Crate manifest (edition 2024) ├── build.rs # Links LAGraph + LAGraphX; optionally regenerates FFI bindings ├── src/ -│ ├── lib.rs # Public modules: graph, formats, lagraph_sys; utils is pub(crate) +│ ├── lib.rs # Public modules: formats, graph, sparql, lagraph_sys; utils is pub(crate) │ ├── main.rs # Binary entry point (placeholder) │ ├── lagraph_sys.rs # FFI module — includes generated bindings │ ├── lagraph_sys_generated.rs# Bindgen output (checked in, regenerated in CI) @@ -24,6 +24,8 @@ pathrex/ │ │ ├── mod.rs # Core traits (GraphBuilder, GraphDecomposition, GraphSource, │ │ │ # Backend, Graph), error types, RAII wrappers, GrB init │ │ └── inmemory.rs # InMemory marker, InMemoryBuilder, InMemoryGraph +│ ├── sparql/ +│ │ └── mod.rs # SPARQL parsing (spargebra), PathTriple extraction, parse_rpq │ └── formats/ │ ├── mod.rs # FormatError enum, re-exports │ ├── csv.rs # Csv — CSV → Edge iterator (CsvConfig, ColumnSpec) @@ -216,7 +218,7 @@ Name-based lookup requires `has_header: true`. #### MatrixMarket directory format -[`MatrixMarket`](src/formats/mm.rs:160) loads an edge-labeled graph from a directory with: +[`MatrixMarket`](src/formats/mm.rs:159) loads an edge-labeled graph from a directory with: - `vertices.txt` — one line per node: `<node_name> <1-based-index>` on disk; [`get_node_id`](src/graph/mod.rs:199) returns the matching **0-based** matrix index - `edges.txt` — one line per label: `<label_name> <1-based-index>` (selects `n.txt`) @@ -228,11 +230,47 @@ converted to 0-based and installed via [`InMemoryBuilder::set_node_map()`](src/g Helper functions: -- [`load_mm_file(path)`](src/formats/mm.rs:64) — reads a single MatrixMarket file into a +- [`load_mm_file(path)`](src/formats/mm.rs:39) — reads a single MatrixMarket file into a `GrB_Matrix`. 
-- [`parse_index_map(path)`](src/formats/mm.rs) — parses `<name> <index>` lines; indices must be **>= 1** and **unique** within the file. +- [`parse_index_map(path)`](src/formats/mm.rs:81) — parses `<name> <index>` lines; indices must be **>= 1** and **unique** within the file. -`MatrixMarket` implements `GraphSource` in [`src/graph/inmemory.rs`](src/graph/inmemory.rs): `vertices.txt` maps are converted from 1-based file indices to 0-based matrix ids before [`set_node_map`](src/graph/inmemory.rs:67); `edges.txt` indices are unchanged for `n.txt` lookup. +`MatrixMarket` implements `GraphSource` in [`src/graph/inmemory.rs`](src/graph/inmemory.rs) (see the `impl` at line 215): `vertices.txt` maps are converted from 1-based file indices to 0-based matrix ids before [`set_node_map`](src/graph/inmemory.rs:67); `edges.txt` indices are unchanged for `n.txt` lookup. + +### SPARQL parsing (`src/sparql/mod.rs`) + +The [`sparql`](src/sparql/mod.rs) module uses the [`spargebra`](https://crates.io/crates/spargebra) +crate to parse SPARQL 1.1 query strings and extract the single property-path +triple pattern that pathrex's RPQ evaluators operate on. + +**Supported query form:** `SELECT` queries with exactly one triple or property +path pattern in the `WHERE` clause, e.g.: + +```sparql +SELECT ?x ?y WHERE { ?x <knows>/<likes>* ?y . } +``` + +Key public items: + +- [`parse_query(sparql)`](src/sparql/mod.rs:45) — parses a SPARQL string into a + [`spargebra::Query`]. +- [`extract_path(query)`](src/sparql/mod.rs:67) — validates that a parsed `Query` is a + `SELECT` with a single path pattern and returns a [`PathTriple`](src/sparql/mod.rs:56). +- [`parse_rpq(sparql)`](src/sparql/mod.rs:190) — convenience function combining + `parse_query` + `extract_path` in one call. +- [`PathTriple`](src/sparql/mod.rs:56) — holds the extracted `subject` + ([`TermPattern`]), `path` ([`PropertyPathExpression`]), and `object` + ([`TermPattern`]). 
+- [`ExtractError`](src/sparql/mod.rs:25) — error enum for extraction failures + (`NotSelect`, `NotSinglePath`, `UnsupportedSubject`, `UnsupportedObject`, + `VariablePredicate`). +- [`RpqParseError`](src/sparql/mod.rs:198) — combined error for [`parse_rpq`] + wrapping both `spargebra::SparqlSyntaxError` and [`ExtractError`]. +- [`DEFAULT_BASE_IRI`](src/sparql/mod.rs:38) — `"http://example.org/"`, the + default base IRI constant (exported for callers; `parse_query` does not apply it). + +The module also handles spargebra's desugaring of sequence paths +(`?x <a>/<b>/<c> ?y`) from a chain of BGP triples back into a single +[`PropertyPathExpression::Sequence`]. ### FFI layer @@ -286,6 +324,8 @@ native libraries. Tests in `src/formats/csv.rs` are pure Rust and need no native dependencies. +Tests in `src/sparql/mod.rs` are pure Rust and need no native dependencies. + Tests in `src/graph/inmemory.rs` and [`tests/inmemory_tests.rs`](tests/inmemory_tests.rs) call real GraphBLAS/LAGraph and require the native libraries to be present. diff --git a/Cargo.toml b/Cargo.toml index 4b2d184..2420d28 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ csv = "1.4.0" libc = "0.2" oxrdf = "0.3.3" oxttl = "0.2.3" +spargebra = "0.4.6" thiserror = "1.0" [features] diff --git a/src/lib.rs b/src/lib.rs index ef319e0..0d11b1f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ pub mod formats; pub mod graph; +pub mod sparql; #[allow(unused_unsafe, dead_code)] pub(crate) mod utils; diff --git a/src/sparql/mod.rs b/src/sparql/mod.rs new file mode 100644 index 0000000..5b5c5fa --- /dev/null +++ b/src/sparql/mod.rs @@ -0,0 +1,348 @@ +//! SPARQL parsing and validation utilities. +//! +//! This module provides helpers for parsing SPARQL query strings using the +//! [`spargebra`] crate and extracting the property path triple pattern that +//! pathrex's RPQ evaluators operate on. +//! +//! # Supported query form +//! +//! SELECT queries with a single triple pattern in the +//! WHERE clause are supported: +//! +//! ```sparql +//! 
SELECT ?x ?y WHERE { ?x <knows> ?y . }
+//! SELECT ?x ?y WHERE { ?x <knows>/<likes>* ?y . }
+//! SELECT ?x WHERE { <alice> <knows>+ ?x . }
+//! ```
+
+use spargebra::algebra::{GraphPattern, PropertyPathExpression};
+use spargebra::term::{NamedNodePattern, TermPattern, TriplePattern};
+use spargebra::{Query, SparqlParser, SparqlSyntaxError};
+use thiserror::Error;
+
+/// Error returned when extracting a property path triple from a parsed query.
+#[derive(Debug, Clone, PartialEq, Eq, Error)]
+pub enum ExtractError {
+    /// The parsed query is not a `SELECT` query.
+    #[error("expected SELECT query, got a different query form")]
+    NotSelect,
+    /// The WHERE clause is not reducible to one triple / property path.
+    #[error("WHERE clause must contain exactly one triple or property path pattern")]
+    NotSinglePath,
+    /// The subject is neither a variable nor a named node; carries its
+    /// display form.
+    #[error("unsupported subject term: {0}")]
+    UnsupportedSubject(String),
+    /// The object is neither a variable nor a named node; carries its
+    /// display form.
+    #[error("unsupported object term: {0}")]
+    UnsupportedObject(String),
+    /// A plain (single) triple used a variable in predicate position.
+    #[error("predicate in plain triple must be a named node, not a variable")]
+    VariablePredicate,
+}
+
+/// Default base IRI exported for callers.
+///
+/// [`parse_query`] does **not** apply this constant; it is only a
+/// conventional value a caller may pass to the parser or put in a `BASE`
+/// declaration.
+pub const DEFAULT_BASE_IRI: &str = "http://example.org/";
+
+/// Parse a SPARQL query string into a [`spargebra::Query`].
+///
+/// No base IRI is configured on the parser, so queries that use relative
+/// IRIs should declare their own `BASE` (as this module's tests do).
+///
+/// # Errors
+///
+/// Returns [`SparqlSyntaxError`] if the input is not valid SPARQL 1.1.
+pub fn parse_query(sparql: &str) -> Result<Query, SparqlSyntaxError> {
+    SparqlParser::new().parse_query(sparql)
+}
+
+/// Extracted triple components from a parsed SPARQL query.
+///
+/// Holds owned data so callers do not need to keep the [`Query`] alive.
+#[derive(Debug, Clone)]
+pub struct PathTriple {
+    /// Start vertex of the path (variable or named node after validation).
+    pub subject: TermPattern,
+    /// The property path expression connecting subject and object.
+    pub path: PropertyPathExpression,
+    /// End vertex of the path (variable or named node after validation).
+    pub object: TermPattern,
+}
+
+/// Extract the property path triple from a parsed SPARQL [`Query`].
+///
+/// Validates that the query is a `SELECT` with a single triple or property
+/// path pattern in the WHERE clause and returns a [`PathTriple`] with the
+/// three components.
+///
+/// # Errors
+///
+/// * [`ExtractError::NotSelect`] — the query is not a `SELECT`.
+/// * [`ExtractError::NotSinglePath`] / [`ExtractError::VariablePredicate`] —
+///   the WHERE clause is not a single supported pattern.
+/// * [`ExtractError::UnsupportedSubject`] / [`ExtractError::UnsupportedObject`]
+///   — an endpoint is neither a variable nor a named node.
+pub fn extract_path(query: &Query) -> Result<PathTriple, ExtractError> {
+    let Query::Select { pattern, .. } = query else {
+        return Err(ExtractError::NotSelect);
+    };
+
+    let triple = extract_path_from_pattern(pattern)?;
+
+    // Subject is validated first so its error wins when both ends are bad.
+    validate_term(&triple.subject, true)?;
+    validate_term(&triple.object, false)?;
+
+    Ok(triple)
+}
+
+/// Recursively unwrap `GraphPattern` wrappers (Project, Distinct, etc.) to
+/// find the single triple or path pattern inside.
+///
+/// The wrappers themselves (projection list, DISTINCT/REDUCED, slice bounds)
+/// are discarded; only the inner pattern is inspected. Any other pattern
+/// kind yields [`ExtractError::NotSinglePath`].
+fn extract_path_from_pattern(pattern: &GraphPattern) -> Result<PathTriple, ExtractError> {
+    match pattern {
+        GraphPattern::Path {
+            subject,
+            path,
+            object,
+        } => Ok(PathTriple {
+            subject: subject.clone(),
+            path: path.clone(),
+            object: object.clone(),
+        }),
+
+        GraphPattern::Bgp { patterns } => extract_from_bgp(patterns),
+
+        // Solution modifiers: look through them at the wrapped pattern.
+        GraphPattern::Project { inner, .. }
+        | GraphPattern::Distinct { inner }
+        | GraphPattern::Reduced { inner }
+        | GraphPattern::Slice { inner, .. } => extract_path_from_pattern(inner),
+
+        _ => Err(ExtractError::NotSinglePath),
+    }
+}
+
+/// Extract a [`PathTriple`] from a BGP's triple patterns.
+///
+/// Handles two cases:
+/// 1. **Single triple** — `?x <p> ?y` → wraps predicate as
+///    [`PropertyPathExpression::NamedNode`].
+/// 2. **Desugared sequence path** — spargebra rewrites `?x <a>/<b>/<c> ?y`
+///    into a chain of triples linked by blank-node intermediates:
+///    `?x <a> _:b0 . _:b0 <b> _:b1 . _:b1 <c> ?y`.
+///    We detect this pattern and reconstruct a
+///    [`PropertyPathExpression::Sequence`].
+fn extract_from_bgp(patterns: &[TriplePattern]) -> Result<PathTriple, ExtractError> {
+    // Slice patterns give us the endpoints without index arithmetic or
+    // `unwrap()`, and dispatch the empty / single-triple cases up front.
+    let (first, last) = match patterns {
+        [] => return Err(ExtractError::NotSinglePath),
+        [single] => return bgp_triple_to_path_triple(single),
+        [first, .., last] => (first, last),
+    };
+
+    // Every step of a desugared sequence has a concrete (named) predicate.
+    let steps = patterns
+        .iter()
+        .map(|triple| match &triple.predicate {
+            NamedNodePattern::NamedNode(nn) => Ok(PropertyPathExpression::NamedNode(nn.clone())),
+            NamedNodePattern::Variable(_) => Err(ExtractError::NotSinglePath),
+        })
+        .collect::<Result<Vec<PropertyPathExpression>, ExtractError>>()?;
+
+    // Adjacent triples must be joined object→subject by the same blank node,
+    // and each joining blank node may be used for one link only: reusing it
+    // would express a cycle constraint rather than a plain sequence.
+    let mut links = Vec::with_capacity(patterns.len() - 1);
+    for pair in patterns.windows(2) {
+        match (&pair[0].object, &pair[1].subject) {
+            (TermPattern::BlankNode(out), TermPattern::BlankNode(inp))
+                if out == inp && !links.contains(&out) =>
+            {
+                links.push(out);
+            }
+            _ => return Err(ExtractError::NotSinglePath),
+        }
+    }
+
+    // Left-fold so `a/b/c` becomes `Sequence(Sequence(a, b), c)`.
+    let path = steps
+        .into_iter()
+        .reduce(|acc, step| PropertyPathExpression::Sequence(Box::new(acc), Box::new(step)))
+        .expect("a multi-triple BGP yields at least two steps");
+
+    Ok(PathTriple {
+        subject: first.subject.clone(),
+        path,
+        object: last.object.clone(),
+    })
+}
+
+/// Convert a plain BGP [`TriplePattern`] into a [`PathTriple`] by wrapping
+/// the predicate as a [`PropertyPathExpression::NamedNode`].
+///
+/// # Errors
+///
+/// Returns [`ExtractError::VariablePredicate`] when the predicate is a
+/// variable.
+fn bgp_triple_to_path_triple(triple: &TriplePattern) -> Result<PathTriple, ExtractError> {
+    let NamedNodePattern::NamedNode(predicate) = &triple.predicate else {
+        return Err(ExtractError::VariablePredicate);
+    };
+    Ok(PathTriple {
+        subject: triple.subject.clone(),
+        path: PropertyPathExpression::NamedNode(predicate.clone()),
+        object: triple.object.clone(),
+    })
+}
+
+/// Validate that a [`TermPattern`] is a supported vertex form.
+///
+/// Variables and named nodes are accepted. Anything else is rejected with
+/// [`ExtractError::UnsupportedSubject`] or [`ExtractError::UnsupportedObject`]
+/// (chosen by `is_subject`), carrying the term's display form.
+fn validate_term(term: &TermPattern, is_subject: bool) -> Result<(), ExtractError> {
+    match term {
+        TermPattern::Variable(_) | TermPattern::NamedNode(_) => Ok(()),
+        other if is_subject => Err(ExtractError::UnsupportedSubject(other.to_string())),
+        other => Err(ExtractError::UnsupportedObject(other.to_string())),
+    }
+}
+
+/// Parse a SPARQL string and extract its single property path triple.
+///
+/// Convenience wrapper running [`parse_query`] followed by [`extract_path`].
+///
+/// # Errors
+///
+/// * [`RpqParseError::Syntax`] — the input failed to parse.
+/// * [`RpqParseError::Extract`] — the query parsed but is not a supported
+///   single-path `SELECT`.
+pub fn parse_rpq(sparql: &str) -> Result<PathTriple, RpqParseError> {
+    let query = parse_query(sparql)?;
+    Ok(extract_path(&query)?)
+}
+
+/// Combined error for [`parse_rpq`].
+#[derive(Debug, Error)]
+pub enum RpqParseError {
+    /// The query string failed to parse.
+    #[error("SPARQL syntax error: {0}")]
+    Syntax(#[from] SparqlSyntaxError),
+    /// The query parsed but no single path triple could be extracted.
+    #[error("query extraction error: {0}")]
+    Extract(#[from] ExtractError),
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use spargebra::algebra::PropertyPathExpression;
+    use spargebra::term::TermPattern;
+
+    /// `BASE` prologue prepended to every test query so relative IRIs such
+    /// as `<knows>` resolve. Named differently from the crate-level
+    /// `DEFAULT_BASE_IRI`, which is a bare IRI rather than a declaration.
+    const BASE_PROLOGUE: &str = "BASE <http://example.org/>";
+
+    /// Prepend [`BASE_PROLOGUE`] and run [`parse_rpq`], panicking on failure.
+    fn parse_and_extract(sparql: &str) -> PathTriple {
+        let q = format!("{BASE_PROLOGUE} {sparql}");
+        parse_rpq(&q).expect("parse_rpq failed")
+    }
+
+    #[test]
+    fn test_plain_triple_bgp() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows> ?y . }");
+        assert!(matches!(triple.subject, TermPattern::Variable(_)));
+        assert!(matches!(triple.object, TermPattern::Variable(_)));
+        assert!(matches!(triple.path, PropertyPathExpression::NamedNode(_)));
+    }
+
+    #[test]
+    fn test_variable_variable_zero_or_more() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows>* ?y . }");
+        assert!(matches!(triple.subject, TermPattern::Variable(_)));
+        assert!(matches!(triple.object, TermPattern::Variable(_)));
+        assert!(matches!(triple.path, PropertyPathExpression::ZeroOrMore(_)));
+    }
+
+    #[test]
+    fn test_variable_variable_sequence() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows>/<likes> ?y . }");
+        assert!(matches!(triple.subject, TermPattern::Variable(_)));
+        assert!(matches!(triple.object, TermPattern::Variable(_)));
+        assert!(matches!(
+            triple.path,
+            PropertyPathExpression::Sequence(_, _)
+        ));
+    }
+
+    #[test]
+    fn test_named_variable_sequence() {
+        let triple = parse_and_extract("SELECT ?y WHERE { <alice> <knows>/<likes> ?y . }");
+        assert!(matches!(triple.subject, TermPattern::NamedNode(_)));
+        assert!(matches!(triple.object, TermPattern::Variable(_)));
+        assert!(matches!(
+            triple.path,
+            PropertyPathExpression::Sequence(_, _)
+        ));
+    }
+
+    #[test]
+    fn test_three_step_sequence() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <a>/<b>/<c> ?y . }");
+        assert!(matches!(triple.subject, TermPattern::Variable(_)));
+        assert!(matches!(triple.object, TermPattern::Variable(_)));
+        // The chain is rebuilt left-nested: Sequence(Sequence(a, b), c).
+        match &triple.path {
+            PropertyPathExpression::Sequence(lhs, _rhs) => {
+                assert!(matches!(
+                    lhs.as_ref(),
+                    PropertyPathExpression::Sequence(_, _)
+                ));
+            }
+            other => panic!("expected Sequence, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_variable_named_star() {
+        let triple = parse_and_extract("SELECT ?x WHERE { ?x <knows>* <bob> . }");
+        assert!(matches!(triple.subject, TermPattern::Variable(_)));
+        assert!(matches!(triple.object, TermPattern::NamedNode(_)));
+        assert!(matches!(triple.path, PropertyPathExpression::ZeroOrMore(_)));
+    }
+
+    #[test]
+    fn test_alternative_path() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows>|<likes> ?y . }");
+        assert!(matches!(
+            triple.path,
+            PropertyPathExpression::Alternative(_, _)
+        ));
+    }
+
+    #[test]
+    fn test_one_or_more() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows>+ ?y . }");
+        assert!(matches!(triple.path, PropertyPathExpression::OneOrMore(_)));
+    }
+
+    #[test]
+    fn test_zero_or_one() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows>? ?y . }");
+        assert!(matches!(triple.path, PropertyPathExpression::ZeroOrOne(_)));
+    }
+
+    #[test]
+    fn test_complex_path() {
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x (<knows>/<likes>)* ?y . }");
+        assert!(matches!(triple.path, PropertyPathExpression::ZeroOrMore(_)));
+    }
+
+    #[test]
+    fn test_not_select_returns_error() {
+        let sparql = format!("{BASE_PROLOGUE} ASK {{ ?x <knows> ?y }}");
+        let query = parse_query(&sparql).expect("parse failed");
+        let result = extract_path(&query);
+        assert!(matches!(result, Err(ExtractError::NotSelect)));
+    }
+
+    #[test]
+    fn test_multiple_triples_returns_error() {
+        // Joined through a variable, not a blank node, so this is a genuine
+        // two-triple BGP rather than a desugared sequence path.
+        let sparql =
+            format!("{BASE_PROLOGUE} SELECT ?x ?y WHERE {{ ?x <knows> ?z . ?z <likes> ?y . }}");
+        let result = parse_rpq(&sparql);
+        assert!(matches!(
+            result,
+            Err(RpqParseError::Extract(ExtractError::NotSinglePath))
+        ));
+    }
+
+    #[test]
+    fn test_default_base_iri_resolves_relative_iris() {
+        // Resolution here comes from the explicit BASE prologue added by
+        // `parse_and_extract`, not from any parser default.
+        let triple = parse_and_extract("SELECT ?x ?y WHERE { ?x <knows> ?y . }");
+        if let PropertyPathExpression::NamedNode(nn) = &triple.path {
+            assert_eq!(nn.as_str(), "http://example.org/knows");
+        } else {
+            panic!("expected NamedNode path");
+        }
+    }
+
+    #[test]
+    fn test_with_prefix_resolves_prefixed_iris() {
+        let query = SparqlParser::new()
+            .with_prefix("ex", "http://example.org/")
+            .unwrap()
+            .parse_query("SELECT ?x ?y WHERE { ?x ex:knows/ex:likes ?y . }")
+            .expect("parse with prefix failed");
+        let triple = extract_path(&query).expect("extract failed");
+        assert!(matches!(
+            triple.path,
+            PropertyPathExpression::Sequence(_, _)
+        ));
+    }
+}