diff --git a/AGENTS.md b/AGENTS.md index b5e655b..6d94eff 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,7 +27,8 @@ pathrex/ │ └── formats/ │ ├── mod.rs # FormatError enum, re-exports │ ├── csv.rs # Csv — CSV → Edge iterator (CsvConfig, ColumnSpec) -│ └── mm.rs # MatrixMarket directory loader (vertices.txt, edges.txt, *.txt) +│ ├── mm.rs # MatrixMarket directory loader (vertices.txt, edges.txt, *.txt) +│ └── nt.rs # NTriples — N-Triples → Edge iterator (full predicate IRI labels) ├── tests/ │ ├── inmemory_tests.rs # Integration tests for InMemoryBuilder / InMemoryGraph │ └── mm_tests.rs # Integration tests for MatrixMarket format @@ -119,7 +120,7 @@ regenerates it with `--features regenerate-bindings`. **Do not hand-edit this fi ### Edge -[`Edge`](src/graph/mod.rs:154) is the universal currency between format parsers and graph +[`Edge`](src/graph/mod.rs:158) is the universal currency between format parsers and graph builders: `{ source: String, target: String, label: String }`. ### GraphSource trait @@ -130,8 +131,9 @@ feed itself into a specific [`GraphBuilder`]: - [`apply_to(self, builder: B) -> Result`](src/graph/mod.rs:165) — consumes the source and returns the populated builder. -[`Csv`](src/formats/csv.rs:52) implements `GraphSource` directly, so it -can be passed to [`GraphBuilder::load`]. +[`Csv`](src/formats/csv.rs), [`MatrixMarket`](src/formats/mm.rs), and [`NTriples`](src/formats/nt.rs) +implement `GraphSource` (see [`src/graph/inmemory.rs`](src/graph/inmemory.rs)), so they +can be passed to [`GraphBuilder::load`] and [`Graph::try_from`]. ### GraphBuilder trait @@ -194,12 +196,13 @@ which is used by the MatrixMarket loader. ### Format parsers -Two built-in parsers are available: +Three built-in parsers are available, all pluggable into +`GraphBuilder::load()` via `GraphSource` (see [`src/graph/inmemory.rs`](src/graph/inmemory.rs)). +`Csv` and `NTriples` yield `Iterator<Item = Result<Edge, FormatError>>`; `MatrixMarket` loads a directory of matrices instead. 
-#### CSV format +#### `Csv` -[`Csv`](src/formats/csv.rs:52) yields `Iterator<Item = Result<Edge, FormatError>>` and is -directly pluggable into `GraphBuilder::load()` via its `GraphSource` impl. +[`Csv`](src/formats/csv.rs) parses delimiter-separated edge files. Configuration is via [`CsvConfig`](src/formats/csv.rs:17): @@ -216,7 +219,7 @@ Name-based lookup requires `has_header: true`. #### MatrixMarket directory format -[`MatrixMarket`](src/formats/mm.rs:160) loads an edge-labeled graph from a directory with: +[`MatrixMarket`](src/formats/mm.rs:159) loads an edge-labeled graph from a directory with: - `vertices.txt` — one line per node: `<node_name> <1-based-index>` on disk; [`get_node_id`](src/graph/mod.rs:199) returns the matching **0-based** matrix index - `edges.txt` — one line per label: `<label_name> <1-based-index>` (selects `n.txt`) @@ -228,12 +231,27 @@ converted to 0-based and installed via [`InMemoryBuilder::set_node_map()`](src/g Helper functions: -- [`load_mm_file(path)`](src/formats/mm.rs:64) — reads a single MatrixMarket file into a +- [`load_mm_file(path)`](src/formats/mm.rs:39) — reads a single MatrixMarket file into a `GrB_Matrix`. -- [`parse_index_map(path)`](src/formats/mm.rs) — parses `<name> <index>` lines; indices must be **>= 1** and **unique** within the file. +- [`parse_index_map(path)`](src/formats/mm.rs:81) — parses `<name> <index>` lines; indices must be **>= 1** and **unique** within the file. `MatrixMarket` implements `GraphSource` in [`src/graph/inmemory.rs`](src/graph/inmemory.rs): `vertices.txt` maps are converted from 1-based file indices to 0-based matrix ids before [`set_node_map`](src/graph/inmemory.rs:67); `edges.txt` indices are unchanged for `n.txt` lookup. +#### `NTriples` + +[`NTriples`](src/formats/nt.rs:51) parses [W3C N-Triples](https://www.w3.org/TR/n-triples/) +RDF files using `oxttl` and `oxrdf`. Each triple `(subject, predicate, object)` becomes an +[`Edge`](src/graph/mod.rs:158) where: + +- `source` — subject IRI or blank-node ID (`_:label`). 
+- `target` — object IRI or blank-node ID; triples whose object is an RDF + literal yield `Err(FormatError::LiteralAsNode)` (callers may filter these out). +- `label` — full predicate IRI string (including fragment `#…` when present). + +Constructor: + +- [`NTriples::new(reader)`](src/formats/nt.rs:56) — parses the stream; each predicate IRI is copied verbatim to the edge label. + ### FFI layer [`lagraph_sys`](src/lagraph_sys.rs) exposes raw C bindings for GraphBLAS and @@ -284,7 +302,7 @@ Tests in `src/graph/mod.rs` use `CountingBuilder` / `CountOutput` / `VecSource` [`src/utils.rs`](src/utils.rs) — these do **not** call into GraphBLAS and run without native libraries. -Tests in `src/formats/csv.rs` are pure Rust and need no native dependencies. +Tests in `src/formats/csv.rs` and `src/formats/nt.rs` are pure Rust and need no native dependencies. Tests in `src/graph/inmemory.rs` and [`tests/inmemory_tests.rs`](tests/inmemory_tests.rs) call real GraphBLAS/LAGraph and require the native libraries to be present. diff --git a/src/formats/csv.rs b/src/formats/csv.rs index e106951..49788b2 100644 --- a/src/formats/csv.rs +++ b/src/formats/csv.rs @@ -230,6 +230,25 @@ mod tests { assert!(edges.is_empty()); } + #[test] + fn test_non_ascii() { + let csv = "source,target,label\n\ +人甲,人乙,认识\n\ +Алиса,Боб,знает\n"; + let edges: Vec<_> = make_csv(csv).collect(); + assert_eq!(edges.len(), 2); + + let e0 = edges[0].as_ref().unwrap(); + assert_eq!(e0.source, "人甲"); + assert_eq!(e0.target, "人乙"); + assert_eq!(e0.label, "认识"); + + let e1 = edges[1].as_ref().unwrap(); + assert_eq!(e1.source, "Алиса"); + assert_eq!(e1.target, "Боб"); + assert_eq!(e1.label, "знает"); + } + #[test] fn test_graph_source_impl() { use crate::graph::{GraphBuilder, GraphDecomposition, InMemoryBuilder}; diff --git a/src/formats/mod.rs b/src/formats/mod.rs index 5ca0cea..480d720 100644 --- a/src/formats/mod.rs +++ b/src/formats/mod.rs @@ -4,20 +4,27 @@ //! //! ```no_run //! 
use pathrex::graph::{Graph, InMemory, GraphDecomposition}; -//! use pathrex::formats::Csv; +//! use pathrex::formats::{Csv, NTriples}; //! use std::fs::File; //! //! // Build from CSV in one line //! let g = Graph::<InMemory>::try_from( //! Csv::from_reader(File::open("edges.csv").unwrap()).unwrap() //! ).unwrap(); +//! +//! // Build from N-Triples in one line +//! let g2 = Graph::<InMemory>::try_from( +//! NTriples::new(File::open("data.nt").unwrap()) +//! ).unwrap(); //! ``` pub mod csv; pub mod mm; +pub mod nt; pub use csv::Csv; pub use mm::MatrixMarket; +pub use nt::NTriples; use thiserror::Error; @@ -49,4 +56,13 @@ pub enum FormatError { line: usize, reason: String, }, + + /// An error produced by the N-Triples parser. + #[error("N-Triples parse error: {0}")] + NTriples(String), + + /// An RDF literal appeared as a subject or object where a node IRI or + /// blank node was expected. + #[error("RDF literal cannot be used as a graph node")] + LiteralAsNode, } diff --git a/src/formats/nt.rs b/src/formats/nt.rs new file mode 100644 index 0000000..aa08880 --- /dev/null +++ b/src/formats/nt.rs @@ -0,0 +1,210 @@ +//! N-Triples edge iterator for the formats layer. +//! +//! ```no_run +//! use pathrex::formats::NTriples; +//! use pathrex::formats::FormatError; +//! +//! # let reader = std::io::empty(); +//! let iter = NTriples::new(reader) +//! .filter_map(|r| match r { +//! Err(FormatError::LiteralAsNode) => None, // skip +//! other => Some(other), +//! }); +//! ``` +//! +//! To load into a graph: +//! +//! ```no_run +//! use pathrex::graph::{Graph, InMemory, GraphDecomposition}; +//! use pathrex::formats::NTriples; +//! use std::fs::File; +//! +//! let graph = Graph::<InMemory>::try_from( +//! NTriples::new(File::open("data.nt").unwrap()) +//! ).unwrap(); +//! 
``` + +use std::io::Read; + +use oxrdf::{NamedOrBlankNode, Term}; +use oxttl::NTriplesParser; +use oxttl::ntriples::ReaderNTriplesParser; + +use crate::formats::FormatError; +use crate::graph::Edge; + +/// An iterator that reads N-Triples and yields `Result`. +/// +/// # Example +/// +/// ```no_run +/// use pathrex::formats::nt::NTriples; +/// use std::fs::File; +/// +/// let file = File::open("data.nt").unwrap(); +/// let iter = NTriples::new(file); +/// for result in iter { +/// let edge = result.unwrap(); +/// println!("{} --{}--> {}", edge.source, edge.label, edge.target); +/// } +/// ``` +pub struct NTriples { + inner: ReaderNTriplesParser, +} + +impl NTriples { + pub fn new(reader: R) -> Self { + Self { + inner: NTriplesParser::new().for_reader(reader), + } + } + + fn subject_to_node_id(subject: NamedOrBlankNode) -> String { + match subject { + NamedOrBlankNode::NamedNode(n) => n.into_string(), + NamedOrBlankNode::BlankNode(b) => format!("_:{}", b.as_str()), + } + } + + fn object_to_node_id(object: Term) -> Result { + match object { + Term::NamedNode(n) => Ok(n.into_string()), + Term::BlankNode(b) => Ok(format!("_:{}", b.as_str())), + Term::Literal(_) => Err(FormatError::LiteralAsNode), + } + } +} + +impl Iterator for NTriples { + type Item = Result; + + fn next(&mut self) -> Option { + let triple = match self.inner.next()? 
{ + Ok(t) => t, + Err(e) => return Some(Err(FormatError::NTriples(e.to_string()))), + }; + + let source = Self::subject_to_node_id(triple.subject.into()); + let label = triple.predicate.as_str().to_owned(); + let target = match Self::object_to_node_id(triple.object) { + Ok(t) => t, + Err(e) => return Some(Err(e)), + }; + + Some(Ok(Edge { + source, + target, + label, + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(nt: &str) -> Vec> { + NTriples::new(nt.as_bytes()).collect() + } + + #[test] + fn test_basic_ntriples() { + let nt = " .\n\ + .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 2); + + let e0 = edges[0].as_ref().unwrap(); + assert_eq!(e0.source, "http://example.org/Alice"); + assert_eq!(e0.target, "http://example.org/Bob"); + assert_eq!(e0.label, "http://example.org/knows"); + + let e1 = edges[1].as_ref().unwrap(); + assert_eq!(e1.source, "http://example.org/Bob"); + assert_eq!(e1.target, "http://example.org/Charlie"); + assert_eq!(e1.label, "http://example.org/likes"); + } + + #[test] + fn test_blank_node_subject_and_object() { + let nt = "_:b1 _:b2 .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 1); + + let e = edges[0].as_ref().unwrap(); + assert_eq!(e.source, "_:b1"); + assert_eq!(e.target, "_:b2"); + } + + #[test] + fn test_literal_object_yields_error() { + let nt = " \"Alice\" .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 1); + assert!( + matches!(edges[0], Err(FormatError::LiteralAsNode)), + "literal object should yield LiteralAsNode error" + ); + } + + #[test] + fn test_caller_can_skip_literal_triples() { + let nt = " .\n\ + \"Alice\" .\n\ + .\n"; + let edges: Vec<_> = NTriples::new(nt.as_bytes()) + .filter_map(|r| match r { + Err(FormatError::LiteralAsNode) => None, + other => Some(other), + }) + .collect(); + + assert_eq!(edges.len(), 2, "literal triple should be skipped"); + assert!(edges.iter().all(|r| r.is_ok())); + } + + #[test] + fn test_predicate_with_fragment_is_full_iri_string() { + let nt 
= + " .\n"; + let edges = parse(nt); + assert_eq!( + edges[0].as_ref().unwrap().label, + "http://example.org/ns#knows" + ); + } + + #[test] + fn test_non_ascii_in_iris() { + let nt = " .\n\ + .\n"; + let edges = parse(nt); + assert_eq!(edges.len(), 2); + + let e0 = edges[0].as_ref().unwrap(); + assert_eq!(e0.source, "http://example.org/人甲"); + assert_eq!(e0.target, "http://example.org/人乙"); + assert_eq!(e0.label, "http://example.org/关系/认识"); + + let e1 = edges[1].as_ref().unwrap(); + assert_eq!(e1.source, "http://example.org/Алиса"); + assert_eq!(e1.target, "http://example.org/Боб"); + assert_eq!(e1.label, "http://example.org/знает"); + } + + #[test] + fn test_ntriples_graph_source() { + use crate::graph::{GraphBuilder, GraphDecomposition, InMemoryBuilder}; + + let nt = " .\n\ + .\n"; + let iter = NTriples::new(nt.as_bytes()); + + let graph = InMemoryBuilder::default() + .load(iter) + .expect("load should succeed") + .build() + .expect("build should succeed"); + assert_eq!(graph.num_nodes(), 3); + } +} diff --git a/src/graph/inmemory.rs b/src/graph/inmemory.rs index 103e5fe..602978a 100644 --- a/src/graph/inmemory.rs +++ b/src/graph/inmemory.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use std::{collections::HashMap, io::Read}; use crate::formats::mm::{load_mm_file, parse_index_map}; -use crate::formats::{Csv, MatrixMarket}; +use crate::formats::{Csv, MatrixMarket, NTriples}; use crate::{ graph::GraphSource, lagraph_sys::{GrB_Index, GrB_Matrix, GrB_Matrix_free, LAGraph_Kind}, @@ -216,10 +216,8 @@ impl GraphSource for MatrixMarket { fn apply_to(self, mut builder: InMemoryBuilder) -> Result { let vertices_path = self.dir.join("vertices.txt"); let (vert_by_idx, vert_by_name) = parse_index_map(&vertices_path)?; - let vert_by_idx = - vert_by_idx.into_iter().map(|(i, n)| (i - 1, n)).collect(); - let vert_by_name = - vert_by_name.into_iter().map(|(n, i)| (n, i - 1)).collect(); + let vert_by_idx = vert_by_idx.into_iter().map(|(i, n)| (i - 1, n)).collect(); + let vert_by_name = 
vert_by_name.into_iter().map(|(n, i)| (n, i - 1)).collect(); let (edge_by_idx, _) = parse_index_map(&self.dir.join("edges.txt"))?; @@ -235,6 +233,15 @@ impl GraphSource for MatrixMarket { } } +impl GraphSource for NTriples { + fn apply_to(self, mut builder: InMemoryBuilder) -> Result { + for item in self { + builder.push_edge(item?)?; + } + Ok(builder) + } +} + #[cfg(test)] mod tests { use super::*; @@ -322,4 +329,23 @@ mod tests { assert!(graph.get_graph("knows").is_ok()); assert!(graph.get_graph("likes").is_ok()); } + + #[test] + fn test_with_stream_from_ntriples() { + use crate::formats::nt::NTriples; + + let nt = " .\n\ + .\n\ + .\n"; + + let graph = InMemoryBuilder::new() + .load(NTriples::new(nt.as_bytes())) + .expect("load should succeed") + .build() + .expect("build should succeed"); + + assert_eq!(graph.num_nodes(), 3); + assert!(graph.get_graph("http://example.org/knows").is_ok()); + assert!(graph.get_graph("http://example.org/likes").is_ok()); + } }