From c823d19b8869a7ee2c775a21fe7a854fbd4585c6 Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:22:31 +0200 Subject: [PATCH 01/12] fix(owlgen): warn on covering axiom edge cases for abstract classes Emit log messages for abstract class covering axiom edge cases: - Zero children: log an info message that no covering axiom will be generated - One child: warn that the covering axiom degenerates to an equivalence (Parent = Child), recommending --skip-abstract-class-as-unionof-subclasses Both axioms are still emitted when applicable (semantically correct per OWL 2), but the messages alert users who extend the ontology downstream. Tests verify the messages are logged, flag suppression works, the single-child covering axiom triple is correctly asserted, plus negative tests for multi-child and concrete class cases, and the mixin-only children edge case. Refs: linkml/linkml#3309, linkml/linkml#3219 Signed-off-by: jdsika --- .../linkml/src/linkml/generators/owlgen.py | 30 +++- tests/linkml/test_generators/test_owlgen.py | 170 ++++++++++++++++++ 2 files changed, 198 insertions(+), 2 deletions(-) diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 88cd3fa10f..6c818a2323 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -209,7 +209,11 @@ class OwlSchemaGenerator(Generator): one direct ``is_a`` child, the generator adds ``AbstractClass rdfs:subClassOf (Child1 or Child2 or …)``, expressing the open-world covering constraint that every instance of the abstract class must also be an instance of one of its - direct subclasses.""" + direct subclasses. + + .. note:: An info message is emitted when an abstract class has no children (no axiom generated). + A warning is emitted when there is only one child (covering axiom degenerates to equivalence + Parent ≡ Child). 
Use this flag to suppress covering axioms entirely if equivalence is undesired.""" @staticmethod def _present(values: Iterable[_T | None]) -> list[_T]: @@ -505,6 +509,26 @@ def condition_to_bnode(expr: AnonymousClassExpression) -> OWL_EXPRESSION | None: # must be an instance of at least one of its direct subclasses. if cls.abstract and not self.skip_abstract_class_as_unionof_subclasses: children = sorted(sv.class_children(cls.name, imports=self.mergeimports, mixins=False, is_a=True)) + if not children: + logger.info( + "Abstract class '%s' has no children. No covering axiom will be generated.", + cls.name, + ) + elif len(children) == 1: + # Warn: with one child C, the covering axiom degenerates to + # Parent ⊑ C which, combined with C ⊑ Parent (from is_a), + # creates Parent ≡ C (equivalence). This is semantically + # correct per OWL 2 but may be surprising for extensible + # ontologies where more children are added later. + logger.warning( + "Abstract class '%s' has only 1 direct child ('%s'). " + "The covering axiom makes them equivalent (%s ≡ %s). " + "Use --skip-abstract-class-as-unionof-subclasses to suppress.", + cls.name, + children[0], + cls.name, + children[0], + ) if children: child_uris = [self._class_uri(child) for child in children] union_node = self._union_of(child_uris) @@ -1654,7 +1678,9 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: show_default=True, help=( "If true, suppress rdfs:subClassOf owl:unionOf(subclasses) covering axioms for abstract classes. " - "By default such axioms are emitted for every abstract class that has direct is_a children." + "By default such axioms are emitted for every abstract class that has direct is_a children. " + "Note: an info message is logged for abstract classes with zero children (no axiom); " + "a warning is emitted for one child (equivalence)." 
), ) @click.option( diff --git a/tests/linkml/test_generators/test_owlgen.py b/tests/linkml/test_generators/test_owlgen.py index ead3359ee2..062d4c31ac 100644 --- a/tests/linkml/test_generators/test_owlgen.py +++ b/tests/linkml/test_generators/test_owlgen.py @@ -1,3 +1,4 @@ +import logging from enum import Enum import pytest @@ -526,6 +527,175 @@ def test_abstract_class_without_subclasses_gets_no_union_of_axiom(): assert _union_members(g, EX.Orphan) is None +def test_abstract_class_with_no_children_emits_info(caplog): + """An abstract class with no children emits an info message about missing coverage. + + When an abstract class has zero subclasses, no covering axiom can be + generated. An info message alerts users that the class hierarchy is + incomplete — this is not a warning because abstract leaf classes are + a normal pattern in base schemas designed for downstream extension. + + See: mgskjaeveland's review on linkml/linkml#3309. + See: matentzn's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.INFO, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom emitted + assert _union_members(g, EX.Orphan) is None + + # An info message must be logged (not a warning) + assert any("has no children" in msg for msg in caplog.messages), ( + "Expected an info message about abstract class with no children" + ) + assert any("No covering axiom" in msg for msg in caplog.messages), ( + "Info message should mention that no covering axiom will be generated" + ) + + +def test_no_children_info_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no info for zero children.""" + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.INFO, logger="linkml.generators.owlgen"): + _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + assert not 
any("has no children" in msg for msg in caplog.messages) + + +def test_abstract_class_with_single_child_emits_warning(caplog): + """An abstract class with one child still gets a covering axiom but emits a warning. + + Per OWL 2 semantics, the covering axiom with a single child creates an + equivalence (Parent ≡ Child). This is logically correct but may surprise + users who plan to extend the ontology later. The generator should warn + and recommend ``--skip-abstract-class-as-unionof-subclasses``. + + See: W3C OWL 2 Primer §4.2 — bidirectional rdfs:subClassOf = equivalence. + See: mgskjaeveland's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("GrandParent") + sb.add_class("Parent", is_a="GrandParent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom IS still emitted (single child → equivalence is OWL-correct). + # With one child, _union_of returns the child URI directly (no owl:unionOf wrapper), + # so the covering axiom materialises as Parent rdfs:subClassOf Child. + # Combined with Child rdfs:subClassOf Parent (from is_a), this is the equivalence. + assert (EX.Parent, RDFS.subClassOf, EX.Child) in g, ( + "Covering axiom should produce Parent rdfs:subClassOf Child for single-child case" + ) + assert (EX.Child, RDFS.subClassOf, EX.Parent) in g + assert (EX.Parent, RDFS.subClassOf, EX.GrandParent) in g + + # But a warning must be logged + assert any("only 1 direct child" in msg for msg in caplog.messages), ( + "Expected a warning about single-child covering axiom creating equivalence" + ) + assert any("--skip-abstract-class-as-unionof-subclasses" in msg for msg in caplog.messages), ( + "Warning should recommend the skip flag" + ) + + +def test_single_child_warning_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no warning is emitted. 
+ + The skip flag suppresses covering axioms entirely, so the single-child + equivalence case never arises. + """ + sb = SchemaBuilder() + sb.add_class("Parent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + # No covering axiom emitted + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + # No warning either + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_multiple_children_no_warning(caplog): + """An abstract class with 2+ children must NOT emit a warning. + + The covering axiom is a proper union (not a degenerate equivalence), + so no warning is needed. + """ + sb = SchemaBuilder() + sb.add_class("Animal", abstract=True) + sb.add_class("Dog", is_a="Animal") + sb.add_class("Cat", is_a="Animal") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom emitted (proper union) + members = _union_members(g, EX.Animal) + assert members == {EX.Dog, EX.Cat} + + # No warning about children count + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_non_abstract_class_no_warning(caplog): + """A non-abstract class must NOT emit covering axiom warnings. + + Covering axioms only apply to abstract classes. Concrete classes + should be silently skipped regardless of child count. 
+ """ + sb = SchemaBuilder() + sb.add_class("Parent") # not abstract + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom for non-abstract class + assert _union_members(g, EX.Parent) is None + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + + # No warning either + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_abstract_class_with_only_mixin_children_emits_info(caplog): + """An abstract class whose only children are via mixins (not is_a) gets the no-children info. + + The covering axiom only considers direct is_a children (not mixins). + If an abstract class has mixin children but no is_a children, it should + log an info message about having no children for covering axiom purposes. + """ + sb = SchemaBuilder() + sb.add_class("Base", abstract=True) + sb.add_class("MixinChild", mixins=["Base"]) + sb.add_defaults() + + with caplog.at_level(logging.INFO, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + assert _union_members(g, EX.Base) is None + assert any("has no children" in msg for msg in caplog.messages), ( + "Abstract class with only mixin children should log info about no is_a children" + ) + + @pytest.mark.parametrize("skip", [False, True]) def test_union_of_axiom_only_covers_direct_children(skip: bool): """Union-of axiom lists only direct is_a children, not grandchildren. 
From 146c707748bd025738b223e7cf411b821ec30b0e Mon Sep 17 00:00:00 2001 From: jdsika Date: Thu, 2 Apr 2026 17:21:36 +0200 Subject: [PATCH 02/12] feat(generators): add --normalize-prefixes flag for well-known prefix names Add an opt-in --normalize-prefixes flag to OWL, SHACL, and JSON-LD Context generators that normalises non-standard prefix aliases to well-known names from a prefix map built from the prefixmaps package (the curated linked_data context layered over the merged context, which draws on prefix.cc and other registries, plus a small override map for rdflib/W3C canonical forms). Key design decisions: - Prefix map built once from prefixmaps and cached, with a frozen override map (MappingProxyType), instead of a runtime Graph().namespaces() lookup; this eliminates the rdflib version dependency - Both http://schema.org/ and https://schema.org/ map to 'schema' - Shared normalize_graph_prefixes() helper used by OWL and SHACL - Two-phase graph normalisation: Phase 1 normalises schema-declared prefixes, Phase 2 cleans up runtime-injected bindings - Collision detection: skip with warning when standard prefix name is already user-declared for a different namespace - Phase 2 guard prevents overwriting HTTPS bindings with HTTP variants The flag defaults to off, preserving existing behaviour. Tests cover OWL, SHACL, and context generators with sdo->schema, dce->dc, http/https edge case, custom prefix preservation, flag-off backward compatibility, cross-generator consistency, prefix collision detection, schema1 regression prevention, Phase 2 HTTPS guard, empty schema edge case, and static map integrity. 
Signed-off-by: jdsika Signed-off-by: Carlo van Driesten --- packages/linkml/pyproject.toml | 9 +- .../src/linkml/generators/jsonldcontextgen.py | 82 ++- .../linkml/src/linkml/generators/jsonldgen.py | 2 + .../linkml/src/linkml/generators/owlgen.py | 6 +- .../linkml/src/linkml/generators/shaclgen.py | 6 +- packages/linkml/src/linkml/utils/generator.py | 170 +++++- .../test_generators/test_jsonldcontextgen.py | 115 ++++ .../test_normalize_prefixes.py | 545 ++++++++++++++++++ uv.lock | 10 +- 9 files changed, 932 insertions(+), 13 deletions(-) create mode 100644 tests/linkml/test_generators/test_normalize_prefixes.py diff --git a/packages/linkml/pyproject.toml b/packages/linkml/pyproject.toml index abce3b4510..5f2689848d 100644 --- a/packages/linkml/pyproject.toml +++ b/packages/linkml/pyproject.toml @@ -50,7 +50,10 @@ dependencies = [ # Specifier syntax: https://peps.python.org/pep-0631/ "openpyxl", "parse", "prefixcommons >= 0.1.7", - "prefixmaps >= 0.2.2", + # TODO(prefixmaps-0.2.8): Replace git pin with "prefixmaps >= 0.2.8" once released, + # then remove [tool.hatch.metadata] allow-direct-references and regenerate uv.lock. 
+ # Tracked in: https://github.com/linkml/prefixmaps/issues/82 + "prefixmaps @ git+https://github.com/linkml/prefixmaps@75435150a1b31760b9780af2b64a265943a9b263", "pydantic >= 2.0.0, < 3.0.0", "pyjsg >= 0.12.3", "pyshex >= 0.9.0", @@ -202,6 +205,10 @@ vcs = "git" style = "pep440" fallback-version = "0.0.0" +[tool.hatch.metadata] +# TODO(prefixmaps-0.2.8): Remove this section once the git pin is replaced with >= 0.2.8 +allow-direct-references = true + [tool.hatch.version] source = "uv-dynamic-versioning" diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index c30afc72a5..38dd938860 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -15,7 +15,7 @@ from linkml._version import __version__ from linkml.utils.deprecation import deprecated_fields -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, shared_arguments, well_known_prefix_map from linkml_runtime.linkml_model.meta import ClassDefinition, EnumDefinition, SlotDefinition from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, underscore @@ -90,6 +90,9 @@ class ContextGenerator(Generator): frame_root: str | None = None def __post_init__(self) -> None: + # Must be set before super().__post_init__() because the parent triggers + # the visitor pattern (visit_schema), which accesses _prefix_remap. + self._prefix_remap: dict[str, str] = {} super().__post_init__() if self.namespaces is None: raise TypeError("Schema text must be supplied to context generator. 
Preparsed schema will not work") @@ -127,8 +130,14 @@ def _collect_external_elements(sv: SchemaView) -> tuple[set[str], set[str]]: external_slots.update(schema_def.slots.keys()) return external_classes, external_slots + def add_prefix(self, ncname: str) -> None: + """Add a prefix, applying well-known prefix normalisation when enabled.""" + super().add_prefix(self._prefix_remap.get(ncname, ncname)) + def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_): - # Add any explicitly declared prefixes + # Add any explicitly declared prefixes. + # Direct .add() is safe here: the normalisation block below explicitly + # rewrites emit_prefixes entries for any renamed prefixes (Cases 1-3). for prefix in self.schema.prefixes.values(): self.emit_prefixes.add(prefix.prefix_prefix) @@ -136,6 +145,68 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = for pfx in self.schema.emit_prefixes: self.add_prefix(pfx) + # Normalise well-known prefix names when --normalize-prefixes is set. + # If the schema declares a non-standard alias for a namespace that has + # a well-known standard name (e.g. ``sdo`` for + # ``https://schema.org/``), replace the alias with the standard name + # so that generated JSON-LD contexts use the conventional prefix. + # + # Three cases are handled: + # 1. Standard prefix is not yet bound → just rebind from old to new. + # 2. Standard prefix is bound to a *different* URI: + # a. User-declared (in schema.prefixes) → collision, skip with warning. + # b. Runtime default (e.g. linkml-runtime's ``schema: http://…``) + # → remove stale binding, then rebind. + # 3. Standard prefix is already bound to the *same* URI (duplicate) + # → just drop the non-standard alias. + # + # A remap dict is stored for ``_build_element_id`` because + # ``prefix_suffix()`` splits CURIEs on ``:`` without looking up the + # namespace dict. 
+ self._prefix_remap.clear() + if self.normalize_prefixes: + wk = well_known_prefix_map() + for old_pfx in list(self.namespaces): + url = str(self.namespaces[old_pfx]) + std_pfx = wk.get(url) + if not std_pfx or std_pfx == old_pfx: + continue + if std_pfx in self.namespaces: + if str(self.namespaces[std_pfx]) != url: + # Case 2: std_pfx is bound to a different URI. + # If the user explicitly declared std_pfx in the schema, + # it is intentional — skip to avoid data loss. + if std_pfx in self.schema.prefixes: + self.logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is " + "already declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + str(self.namespaces[std_pfx]), + url, + ) + continue + # Not user-declared (e.g. linkml-runtime default) — safe to remove + self.emit_prefixes.discard(std_pfx) + del self.namespaces[std_pfx] + else: + # Case 3: standard prefix already bound to same URI + # — just drop the non-standard alias + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + continue + # Case 1 (or Case 2 after stale removal): bind standard name + self.namespaces[std_pfx] = self.namespaces[old_pfx] + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + # Add the default prefix if self.schema.default_prefix: dflt = self.namespaces.prefix_for(self.schema.default_prefix) @@ -143,6 +214,8 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = self.default_ns = dflt if self.default_ns: default_uri = self.namespaces[self.default_ns] + # Direct .add() is safe: default_ns is already resolved from + # the (possibly normalised) namespace bindings above. 
self.emit_prefixes.add(self.default_ns) else: default_uri = self.schema.default_prefix @@ -486,6 +559,11 @@ def _build_element_id(self, definition: Any, uri: str) -> None: @return: None """ uri_prefix, uri_suffix = self.namespaces.prefix_suffix(uri) + # Apply well-known prefix normalisation (e.g. sdo → schema). + # prefix_suffix() splits CURIEs on ':' without checking the + # namespace dict, so it may return a stale alias. + if uri_prefix and uri_prefix in self._prefix_remap: + uri_prefix = self._prefix_remap[uri_prefix] is_default_namespace = uri_prefix == self.context_body["@vocab"] or uri_prefix == self.namespaces.prefix_for( self.context_body["@vocab"] ) diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index 75d2068e16..ee2fd0cf4e 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -179,6 +179,8 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: # TODO: The _visit function above alters the schema in situ # force some context_kwargs context_kwargs["metadata"] = False + # Forward prefix normalisation into the inline @context. 
+ context_kwargs.setdefault("normalize_prefixes", self.normalize_prefixes) add_prefixes = ContextGenerator(self.original_schema, **context_kwargs).serialize() add_prefixes_json = loads(add_prefixes) metamodel_ctx = self.metamodel_context or METAMODEL_CONTEXT_URI diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 6c818a2323..da70ea6957 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -21,7 +21,7 @@ from linkml._version import __version__ from linkml.generators.common.subproperty import is_xsd_anyuri_range from linkml.utils.deprecation import deprecation_warning -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -269,6 +269,10 @@ def as_graph(self) -> Graph: self.graph.bind(prefix, self.metamodel.namespaces[prefix]) for pfx in schema.prefixes.values(): self.graph.namespace_manager.bind(pfx.prefix_prefix, URIRef(pfx.prefix_reference)) + if self.normalize_prefixes: + normalize_graph_prefixes( + graph, {str(v.prefix_prefix): str(v.prefix_reference) for v in schema.prefixes.values()} + ) graph.add((base, RDF.type, OWL.Ontology)) # Add main schema elements diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 874e47b3a6..f747aae572 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -13,7 +13,7 @@ from linkml.generators.common.subproperty import get_subproperty_values, is_uri_range from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor -from linkml.utils.generator import Generator, shared_arguments 
+from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName from linkml_runtime.utils.formatutils import underscore from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph @@ -106,6 +106,10 @@ def as_graph(self) -> Graph: for pfx in self.schema.prefixes.values(): g.bind(str(pfx.prefix_prefix), pfx.prefix_reference) + if self.normalize_prefixes: + normalize_graph_prefixes( + g, {str(v.prefix_prefix): str(v.prefix_reference) for v in self.schema.prefixes.values()} + ) for c in sv.all_classes(imports=not self.exclude_imports).values(): diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 88fc485851..72b977eaa7 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -20,11 +20,12 @@ import os import re import sys +import types from collections.abc import Callable, Mapping from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import ClassVar, TextIO, Union, cast +from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast import click from click import Argument, Command, Option @@ -58,6 +59,9 @@ from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.namespaces import Namespaces +if TYPE_CHECKING: + from rdflib import Graph + logger = logging.getLogger(__name__) @@ -78,6 +82,154 @@ def _resolved_metamodel(mergeimports): return metamodel +def well_known_prefix_map() -> dict[str, str]: + """Return a mapping from namespace URI to standard prefix name. + + Primary source: the ``linked_data`` context from `prefixmaps + `_ — the canonical curated + registry maintained by the LinkML team. This context provides + correct, community-consensus prefix names (e.g. ``sh`` not ``shacl``, + ``schema`` not ``sdo``). 
+ + Secondary source: the ``merged`` context from prefixmaps, which + combines prefix.cc, bioregistry, and other sources for broad coverage. + + A small ``_PREFIX_OVERRIDES`` map corrects the few cases where the + merged context disagrees with rdflib/W3C canonical names. + + Both ``http`` and ``https`` variants of schema.org and wgs84 are + included because the linkml-runtime historically binds the HTTP form + while rdflib (and the W3C) prefer HTTPS. + + .. note:: + Requires ``prefixmaps >= 0.2.7``. For entries added in + linkml/prefixmaps#81 (W3C/OGC standard prefixes), pin to + ``prefixmaps @ git+https://github.com/linkml/prefixmaps@75435150`` + until v0.2.8 is released. + """ + return dict(_cached_well_known_prefix_map()) + + +@lru_cache(maxsize=1) +def _cached_well_known_prefix_map() -> dict[str, str]: + """Internal cached builder for well_known_prefix_map().""" + from prefixmaps import load_context + + # Layer 1: merged context (broad coverage, first-seen-wins for duplicates). + merged = load_context("merged") + ns_to_prefix: dict[str, str] = {} + for rec in merged.prefix_expansions: + if rec.namespace not in ns_to_prefix: + ns_to_prefix[rec.namespace] = rec.prefix + + # Layer 2: linked_data context (curated, correct names) overrides merged. + ld = load_context("linked_data") + for rec in ld.prefix_expansions: + ns_to_prefix[rec.namespace] = rec.prefix + + # Layer 3: overrides for the few cases where merged/linked_data disagrees + # with the rdflib/W3C canonical forms used by the RDF community. + for ns, pfx in _PREFIX_OVERRIDES.items(): + ns_to_prefix[ns] = pfx + + # Ensure both HTTP/HTTPS schema.org variants resolve to 'schema'. + ns_to_prefix.setdefault("https://schema.org/", "schema") + ns_to_prefix["http://schema.org/"] = "schema" + + # Ensure both HTTP/HTTPS wgs84 variants resolve to 'wgs'. 
+ ns_to_prefix.setdefault("https://www.w3.org/2003/01/geo/wgs84_pos#", "wgs") + + return ns_to_prefix + + +# Overrides: corrections where prefixmaps merged context uses non-standard names +# that differ from rdflib 7.x / W3C canonical forms. +_PREFIX_OVERRIDES: types.MappingProxyType[str, str] = types.MappingProxyType( + { + # merged gives 'geosparql', rdflib/W3C uses 'geo' + "http://www.opengis.net/ont/geosparql#": "geo", + # merged gives 'sc', rdflib/W3C uses 'schema' + "https://schema.org/": "schema", + # merged gives 'WGS84', rdflib uses 'wgs' + "https://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", + "http://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", + } +) + + +def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> None: + """Normalise non-standard prefix aliases in an rdflib Graph. + + For each prefix bound in *schema_prefixes* (mapping prefix name → + namespace URI), check whether ``well_known_prefix_map()`` knows a + standard name for that URI. If the standard name differs from the + schema-declared name, rebind the namespace to the standard name. + + This is the **shared implementation** used by OWL, SHACL, and (via a + different code-path) JSON-LD context generators so that all serialisation + formats agree on prefix names when ``--normalize-prefixes`` is active. + + :param graph: rdflib Graph whose namespace bindings should be adjusted. + :param schema_prefixes: mapping of prefix name → namespace URI string, + typically from ``schema.prefixes``. + """ + from rdflib import Namespace + + wk = well_known_prefix_map() + + # Phase 1: normalise schema-declared prefixes. + for old_pfx, ns_uri in schema_prefixes.items(): + ns_str = str(ns_uri) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == old_pfx: + continue + # Collision: the user explicitly declared std_pfx for a different + # namespace — do not clobber their binding. 
+ if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is already " + "declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + schema_prefixes[std_pfx], + ns_str, + ) + continue + # Rebind: remove old prefix, add standard prefix. + # ``replace=True`` forces the new prefix even if the prefix name + # is already bound to a different namespace. + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + # Phase 2: normalise runtime-injected bindings (e.g. metamodel defaults). + # The linkml-runtime / rdflib may inject well-known namespaces under + # non-standard prefix names. After Phase 1 rebinds schema-declared + # prefixes, orphaned runtime bindings can appear as ``schema1``, ``dc0``, + # etc. Scan the graph's current bindings and fix any that map to a + # well-known namespace under a non-standard name, provided the standard + # name isn't already claimed by the user for a different namespace. + # + # Guard: if Phase 1 already bound std_pfx to a different URI (e.g. + # ``schema`` → ``https://schema.org/``), do not clobber it with the + # HTTP variant (``http://schema.org/``). Build a snapshot of the + # current bindings after Phase 1 to detect this. + current_bindings = {str(p): str(n) for p, n in graph.namespaces()} + for pfx, ns in list(graph.namespaces()): + pfx_str, ns_str = str(pfx), str(ns) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == pfx_str: + continue + # Same collision check as Phase 1: respect user-declared prefixes. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + continue + # Guard: if std_pfx is already bound to a different (correct) URI + # by Phase 1, do not overwrite it. This prevents the HTTP variant + # of schema.org from clobbering the HTTPS binding. 
+ if std_pfx in current_bindings and current_bindings[std_pfx] != ns_str: + continue + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + @dataclass class Generator(metaclass=abc.ABCMeta): """ @@ -180,6 +332,12 @@ class Generator(metaclass=abc.ABCMeta): stacktrace: bool = False """True means print stack trace, false just error message""" + normalize_prefixes: bool = False + """True means normalise non-standard prefix aliases to well-known names + from the ``prefixmaps`` package (linked_data + merged contexts, with + overrides for rdflib/W3C canonical forms). E.g. ``sdo`` → ``schema`` + for ``https://schema.org/``.""" + include: str | Path | SchemaDefinition | None = None """If set, include extra schema outside of the imports mechanism""" @@ -986,6 +1144,16 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--normalize-prefixes/--no-normalize-prefixes",), + default=False, + show_default=True, + help="Normalise non-standard prefix aliases to rdflib's curated default names " + "(e.g. sdo → schema for https://schema.org/). " + "Supported by OWL, SHACL, and JSON-LD Context generators.", + ) + ) return f diff --git a/tests/linkml/test_generators/test_jsonldcontextgen.py b/tests/linkml/test_generators/test_jsonldcontextgen.py index 6e3170d5ac..3a1081ceeb 100644 --- a/tests/linkml/test_generators/test_jsonldcontextgen.py +++ b/tests/linkml/test_generators/test_jsonldcontextgen.py @@ -1637,3 +1637,118 @@ def test_kitchen_sink_employment_event_type_falls_back(kitchen_sink_path): slot_def = ctx["employed_at"] if isinstance(slot_def, dict) and "@context" in slot_def: assert "@vocab" not in slot_def.get("@context", {}) + + +def test_normalize_prefixes_renames_nonstandard_alias(tmp_path): + """When --normalize-prefixes is set, non-standard aliases are replaced by rdflib defaults. + + rdflib binds ``dc`` to ``http://purl.org/dc/elements/1.1/`` by default. 
+ A schema that declares ``dce`` for the same URI should have it normalised + to ``dc`` when the flag is enabled. + + See: rdflib default namespace bindings. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_normalize +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ +imports: + - linkml:types +classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""", + encoding="utf-8", + ) + + # Flag OFF (default): non-standard alias preserved + ctx_off = json.loads(ContextGenerator(str(schema), normalize_prefixes=False).serialize())["@context"] + assert "dce" in ctx_off, "With flag off, original prefix 'dce' must be preserved" + + # Flag ON: rdflib default name used + ctx_on = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + assert "dc" in ctx_on, "With flag on, 'dce' should be normalised to 'dc'" + assert "dce" not in ctx_on, "With flag on, original alias 'dce' should be removed" + assert ctx_on["dc"] == "http://purl.org/dc/elements/1.1/" + + +def test_normalize_prefixes_default_is_off(tmp_path): + """The --normalize-prefixes flag defaults to False — no prefix renaming. + + Ensures backward compatibility: existing schemas produce identical output. 
+ """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_default +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Thing: + class_uri: sdo:Thing + attributes: + name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema)).serialize())["@context"] + # Without the flag, the schema's own prefix name must be preserved + assert "sdo" in ctx, "Default behavior must preserve schema-declared prefix 'sdo'" + + +def test_normalize_prefixes_curie_remapping(tmp_path): + """CURIEs in element @id values use the normalised prefix name. + + When ``sdo`` is normalised to ``schema``, slot URIs like ``sdo:name`` + must appear as ``schema:name`` in the generated context. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_curie +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + # The prefix declaration must use the standard name + assert "schema" in ctx, "Normalised prefix 'schema' must appear" + # Element @id must use the normalised prefix + person = ctx.get("Person", {}) + assert person.get("@id", "").startswith("schema:"), ( + f"Person @id should use normalised prefix 'schema:', got {person}" + ) diff --git a/tests/linkml/test_generators/test_normalize_prefixes.py b/tests/linkml/test_generators/test_normalize_prefixes.py new file mode 100644 index 0000000000..0a832a5791 --- /dev/null +++ b/tests/linkml/test_generators/test_normalize_prefixes.py @@ -0,0 +1,545 @@ 
+"""Tests for the --normalize-prefixes flag across all generators. + +Verifies that non-standard prefix aliases (e.g. ``sdo`` for ``https://schema.org/``) +are normalised to well-known names (e.g. ``schema``) consistently in OWL, SHACL, +and JSON-LD context output. + +References: +- prefix.cc — community consensus RDF prefix registry +- rdflib 7.x curated default namespace bindings +- W3C Turtle §2.4 — prefix declarations are syntactic sugar +""" + +import json +import logging +import re +import textwrap + +import pytest + +# ── Shared test schema ────────────────────────────────────────────── + +SCHEMA_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ + imports: + - linkml:types + classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""") + +SCHEMA_DCE = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize_dce + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ + imports: + - linkml:types + classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""") + +# HTTP variant — linkml-runtime historically binds schema: http://schema.org/ +# while rdflib (and the W3C) prefer https://schema.org/. The normalize flag +# must handle both. +SCHEMA_HTTP_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_http_schema + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: http://schema.org/ + imports: + - linkml:types + classes: + Place: + class_uri: sdo:Place + attributes: + geo: + range: string + slot_uri: sdo:geo +""") + +# Collision scenario: user declares 'foaf' for a custom namespace AND 'myfoaf' +# for http://xmlns.com/foaf/0.1/. 
Normalisation must NOT clobber the user's 'foaf'. +# Uses 'foaf' instead of 'schema' because 'schema' is declared in linkml:types, +# which causes a SchemaLoader merge conflict before normalisation even runs. +SCHEMA_COLLISION = textwrap.dedent("""\ + id: https://example.org/test + name: test_collision + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + foaf: https://something-else.org/ + myfoaf: http://xmlns.com/foaf/0.1/ + imports: + - linkml:types + classes: + Agent: + class_uri: myfoaf:Agent + attributes: + label: + range: string + slot_uri: myfoaf:name +""") + + +def _write_schema(tmp_path, content: str, name: str = "schema.yaml") -> str: + """Write schema content to a temporary file and return its path as string.""" + p = tmp_path / name + p.write_text(content, encoding="utf-8") + return str(p) + + +def _turtle_prefixes(ttl: str) -> dict[str, str]: + """Extract @prefix declarations from Turtle output → {prefix: namespace}.""" + result = {} + for m in re.finditer(r"@prefix\s+(\w+):\s+<([^>]+)>", ttl): + result[m.group(1)] = m.group(2) + return result + + +# ── OWL Generator Tests ───────────────────────────────────────────── + + +def test_owl_sdo_normalised_to_schema(tmp_path): + """sdo → schema when --normalize-prefixes is active.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix in OWL output, got: {sorted(pfx)}" + assert pfx["schema"] == "https://schema.org/" + assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed" + + +def test_owl_flag_off_preserves_original(tmp_path): + """Without the flag, schema-declared prefix names are preserved.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = 
OwlSchemaGenerator(schema_path, normalize_prefixes=False).serialize() + pfx = _turtle_prefixes(ttl) + assert "sdo" in pfx, "With flag off, original prefix 'sdo' must be preserved" + + +def test_owl_dce_normalised_to_dc(tmp_path): + """dce → dc for http://purl.org/dc/elements/1.1/ in graph bindings. + + Note: rdflib's Turtle serializer only emits @prefix declarations for + namespaces actually used in triples. Since the OWL generator may not + produce triples using dc:elements URIs for simple attribute schemas, + we verify the graph's namespace bindings directly. + """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_DCE) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "dc" in bound, f"Expected 'dc' in graph bindings, got: {sorted(bound)}" + assert bound["dc"] == "http://purl.org/dc/elements/1.1/" + + +def test_owl_custom_prefix_not_affected(tmp_path): + """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "ex" in pfx, "Custom prefix 'ex' must survive normalisation" + assert pfx["ex"] == "https://example.org/" + + +def test_owl_http_schema_org_normalised(tmp_path): + """http://schema.org/ (HTTP variant) also normalises to 'schema'. + + The linkml-runtime historically binds ``schema: http://schema.org/`` + while the W3C and rdflib prefer ``https://schema.org/``. Both + variants must be recognised by the static well-known prefix map. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}" + assert "sdo" not in pfx + + +def test_owl_no_schema1_from_runtime_http_binding(tmp_path): + """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``. + + The linkml metamodel (types.yaml) declares ``schema: http://schema.org/`` + (HTTP). When a user schema declares ``sdo: https://schema.org/`` (HTTPS), + normalisation must clean up *both* variants so the output never contains + auto-generated suffixed prefixes like ``schema1``. + """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + suffixed = [p for p in pfx if re.match(r"schema\d+", p)] + assert not suffixed, ( + f"Auto-generated suffixed prefix(es) {suffixed} found — runtime http://schema.org/ binding was not cleaned up" + ) + + +# ── SHACL Generator Tests ─────────────────────────────────────────── + + +def test_shacl_sdo_normalised_to_schema(tmp_path): + """sdo → schema when --normalize-prefixes is active.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix in SHACL output, got: {sorted(pfx)}" + assert pfx["schema"] == "https://schema.org/" + assert "sdo" not in pfx, "Non-standard 'sdo' prefix should be removed" + + +def test_shacl_flag_off_preserves_original(tmp_path): + """Without the flag, schema-declared prefix names are preserved.""" + from linkml.generators.shaclgen import 
ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=False).serialize() + pfx = _turtle_prefixes(ttl) + assert "sdo" in pfx, "With flag off, original prefix 'sdo' must be preserved" + + +def test_shacl_dce_normalised_to_dc(tmp_path): + """dce → dc for http://purl.org/dc/elements/1.1/.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_DCE) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "dc" in pfx, f"Expected 'dc' prefix in SHACL output, got: {sorted(pfx)}" + assert pfx["dc"] == "http://purl.org/dc/elements/1.1/" + assert "dce" not in pfx, "Non-standard 'dce' prefix should be removed" + + +def test_shacl_custom_prefix_not_affected(tmp_path): + """Domain-specific prefixes (e.g. 'ex') are not touched by normalisation. + + Note: rdflib only emits @prefix for namespaces used in triples. + We verify graph bindings directly. 
+ """ + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = ShaclGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "ex" in bound, f"Custom prefix 'ex' must survive in graph bindings, got: {sorted(bound)}" + assert bound["ex"] == "https://example.org/" + + +def test_shacl_http_schema_org_normalised(tmp_path): + """http://schema.org/ (HTTP variant) also normalises to 'schema'.""" + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + assert "schema" in pfx, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(pfx)}" + assert "sdo" not in pfx + + +def test_shacl_no_schema1_from_runtime_http_binding(tmp_path): + """Runtime-injected ``schema: http://schema.org/`` must not create ``schema1``. + + Same scenario as the OWL test: linkml:types imports bring in + ``schema: http://schema.org/`` while the user schema has + ``sdo: https://schema.org/``. Phase 2 of normalisation must + clean up the orphaned HTTP binding. + """ + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + pfx = _turtle_prefixes(ttl) + suffixed = [p for p in pfx if re.match(r"schema\d+", p)] + assert not suffixed, ( + f"Auto-generated suffixed prefix(es) {suffixed} found — runtime http://schema.org/ binding was not cleaned up" + ) + + +# ── JSON-LD Context Generator Tests ───────────────────────────────── + + +def test_context_http_schema_org_normalised(tmp_path): + """http://schema.org/ (HTTP variant) normalises to 'schema' in JSON-LD context. 
+ + This covers the edge case where linkml-runtime's ``schema: http://schema.org/`` + conflicts with rdflib's ``schema: https://schema.org/``. The stale binding + must be removed and replaced with the correct one. + """ + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_HTTP_SDO) + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + assert "schema" in ctx, "HTTP schema.org should normalise to 'schema'" + assert "sdo" not in ctx, "Non-standard 'sdo' should be removed" + # The namespace URI must match the schema-declared one (http, not https) + schema_val = ctx["schema"] + if isinstance(schema_val, dict): + schema_val = schema_val.get("@id", "") + assert schema_val == "http://schema.org/", f"Namespace URI must be preserved: got {schema_val}" + + +# ── Static Prefix Map Tests ───────────────────────────────────────── + + +def test_well_known_prefix_map_returns_dict(): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert isinstance(wk, dict) + assert len(wk) >= 29, f"Expected ≥29 entries, got {len(wk)}" + + +def test_well_known_prefix_map_schema_https(): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["https://schema.org/"] == "schema" + + +def test_well_known_prefix_map_schema_http_variant(): + """Both http and https schema.org must map to 'schema'.""" + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["http://schema.org/"] == "schema" + + +def test_well_known_prefix_map_dc_elements(): + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + assert wk["http://purl.org/dc/elements/1.1/"] == "dc" + + +def test_well_known_prefix_map_returns_copy(): + """Callers should not be able to mutate the internal map.""" + from linkml.utils.generator import well_known_prefix_map + + 
wk1 = well_known_prefix_map() + wk1["http://never-in-any-real-prefix-map.test/"] = "test" + wk2 = well_known_prefix_map() + assert "http://never-in-any-real-prefix-map.test/" not in wk2 + + +def test_well_known_prefix_map_fully_resolved_from_prefixmaps(): + """All rdflib defaults must be resolved from prefixmaps (no residual map). + + This is the proof that pinning prefixmaps to the commit containing + linkml/prefixmaps#81 resolves all well-known prefixes without any + hardcoded fallback. If this test fails after a prefixmaps update, + add the missing prefix to the upstream linked_data.curated.yaml. + """ + from rdflib import Graph as RdfGraph + + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + rdflib_map = {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)} + missing = {ns: pfx for ns, pfx in rdflib_map.items() if ns not in wk} + assert not missing, f"Prefix map missing rdflib defaults (add to prefixmaps upstream): {missing}" + + +# ── Cross-Generator Consistency Tests ──────────────────────────────── + + +def test_all_generators_normalise_sdo_to_schema(tmp_path): + """OWL, SHACL, and JSON-LD context must all use 'schema' for schema.org.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + from linkml.generators.owlgen import OwlSchemaGenerator + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + + owl_ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + shacl_ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + + owl_pfx = _turtle_prefixes(owl_ttl) + shacl_pfx = _turtle_prefixes(shacl_ttl) + + assert "schema" in owl_pfx, "OWL must use 'schema'" + assert "schema" in shacl_pfx, "SHACL must use 'schema'" + assert "schema" in ctx, "JSON-LD context must use 'schema'" + + assert "sdo" not in 
owl_pfx, "OWL must not have 'sdo'" + assert "sdo" not in shacl_pfx, "SHACL must not have 'sdo'" + assert "sdo" not in ctx, "JSON-LD context must not have 'sdo'" + + +# ── Prefix Collision Tests ──────────────────────────────────────────── + + +@pytest.mark.parametrize( + "generator_cls,generator_module", + [ + ("OwlSchemaGenerator", "linkml.generators.owlgen"), + ("ShaclGenerator", "linkml.generators.shaclgen"), + ], + ids=["owl", "shacl"], +) +def test_graph_generator_collision_skips_rename(tmp_path, caplog, generator_cls, generator_module): + """Graph generators: myfoaf must NOT be renamed to 'foaf' when user claims that name.""" + import importlib + + mod = importlib.import_module(generator_module) + cls = getattr(mod, generator_cls) + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + gen = cls(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming" + assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/" + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + +def test_context_collision_preserves_user_prefix(tmp_path, caplog): + """JSON-LD: user's 'foaf: https://something-else.org/' must survive.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + # User's 'foaf' binding preserved + foaf_val = ctx.get("foaf") + if isinstance(foaf_val, dict): + foaf_val = foaf_val.get("@id", "") + assert foaf_val == "https://something-else.org/", f"User's 'foaf' binding must be preserved, got: {foaf_val}" + # myfoaf must remain (not renamed to foaf) + assert "myfoaf" in ctx, "Non-standard 'myfoaf' must remain 
when collision prevents renaming" + # Warning emitted + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + +# ── JSONLDGenerator Flag Forwarding Tests ───────────────────────────── + + +def test_jsonld_generator_forwards_normalize_prefixes(tmp_path): + """JSONLDGenerator must pass normalize_prefixes to embedded ContextGenerator. + + Without forwarding, the inline @context in JSON-LD output would keep + non-standard prefix aliases even when --normalize-prefixes is set. + """ + from linkml.generators.jsonldgen import JSONLDGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + out = JSONLDGenerator(schema_path, normalize_prefixes=True).serialize() + parsed = json.loads(out) + # The @context may be a list; find the dict entry + ctx = parsed.get("@context", {}) + if isinstance(ctx, list): + for item in ctx: + if isinstance(item, dict): + ctx = item + break + assert "sdo" not in ctx, "normalize_prefixes not forwarded: 'sdo' still in embedded @context" + + +# ── Phase 2 HTTP/HTTPS Overwrite Bug Tests ──────────────────────────── + + +def test_phase2_does_not_overwrite_https_with_http(tmp_path): + """When Phase 1 binds schema → https://schema.org/, Phase 2 must not + overwrite it with http://schema.org/ from the runtime metamodel. + + Reproduction: linkml:types imports bring schema: http://schema.org/ + (HTTP) while the user schema has sdo: https://schema.org/ (HTTPS). + Phase 1 normalises sdo → schema (HTTPS). Phase 2 must not then + rebind schema → http://schema.org/ when it encounters the runtime + HTTP binding. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "schema" in bound, f"Expected 'schema' in bindings, got: {sorted(bound)}" + # MUST be HTTPS (from the user's schema), not HTTP (from runtime) + assert bound["schema"] == "https://schema.org/", ( + f"Phase 2 overwrote HTTPS with HTTP: schema bound to {bound['schema']}" + ) + + +def test_normalize_graph_prefixes_phase2_guard(): + """Direct unit test for the Phase 2 guard in normalize_graph_prefixes. + + Simulates the exact scenario: Phase 1 binds schema → https://schema.org/, + then Phase 2 encounters schema1 → http://schema.org/ and must NOT rebind. + """ + from rdflib import Graph, Namespace, URIRef + + from linkml.utils.generator import normalize_graph_prefixes + + g = Graph(bind_namespaces="none") + # Simulate Phase 1 result + g.bind("schema", Namespace("https://schema.org/")) + # Simulate runtime-injected HTTP variant (would appear as schema1) + g.bind("schema1", Namespace("http://schema.org/")) + # Add a triple so the graph isn't empty + g.add((URIRef("https://example.org/s"), URIRef("https://schema.org/name"), URIRef("https://example.org/o"))) + + normalize_graph_prefixes(g, {"sdo": "https://schema.org/"}) + + bound = {str(p): str(n) for p, n in g.namespaces()} + assert bound.get("schema") == "https://schema.org/", f"Phase 2 guard failed: schema bound to {bound.get('schema')}" + + +def test_empty_schema_no_crash(tmp_path): + """A schema with no custom prefixes must not crash normalize_graph_prefixes.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + (tmp_path / "empty.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/empty + name: empty + default_prefix: ex + prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/ + imports: + - linkml:types 
+ """), + encoding="utf-8", + ) + # Should not raise + gen = OwlSchemaGenerator(str(tmp_path / "empty.yaml"), normalize_prefixes=True) + ttl = gen.serialize() + assert len(ttl) > 0 diff --git a/uv.lock b/uv.lock index c23e3dffd7..aeec80f737 100644 --- a/uv.lock +++ b/uv.lock @@ -2342,7 +2342,7 @@ requires-dist = [ { name = "openpyxl" }, { name = "parse" }, { name = "prefixcommons", specifier = ">=0.1.7" }, - { name = "prefixmaps", specifier = ">=0.2.2" }, + { name = "prefixmaps", git = "https://github.com/linkml/prefixmaps?rev=75435150a1b31760b9780af2b64a265943a9b263" }, { name = "pydantic", specifier = ">=2.0.0,<3.0.0" }, { name = "pyjsg", specifier = ">=0.12.3" }, { name = "pyshex", specifier = ">=0.9.0" }, @@ -3548,16 +3548,12 @@ wheels = [ [[package]] name = "prefixmaps" -version = "0.2.6" -source = { registry = "https://pypi.org/simple" } +version = "0.2.7.post2.dev0+7543515" +source = { git = "https://github.com/linkml/prefixmaps?rev=75435150a1b31760b9780af2b64a265943a9b263#75435150a1b31760b9780af2b64a265943a9b263" } dependencies = [ { name = "curies" }, { name = "pyyaml" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4d/cf/f588bcdfd2c841839b9d59ce219a46695da56aa2805faff937bbafb9ee2b/prefixmaps-0.2.6.tar.gz", hash = "sha256:7421e1244eea610217fa1ba96c9aebd64e8162a930dc0626207cd8bf62ecf4b9", size = 709899, upload-time = "2024-10-17T16:30:57.738Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/b2/2b2153173f2819e3d7d1949918612981bc6bd895b75ffa392d63d115f327/prefixmaps-0.2.6-py3-none-any.whl", hash = "sha256:f6cef28a7320fc6337cf411be212948ce570333a0ce958940ef684c7fb192a62", size = 754732, upload-time = "2024-10-17T16:30:55.731Z" }, -] [[package]] name = "prettytable" From 7a4227055c1aa150cc09f20bcef74c0ada8ae49d Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Tue, 12 May 2026 11:38:58 +0200 Subject: [PATCH 03/12] fix(shaclgen): emit sh:minCount/maxCount 0 for zero cardinality values Python truthiness check `if 
s.maximum_cardinality:` evaluates to False when the value is 0 (an integer), silently skipping sh:maxCount 0 emission. The same bug affected minimum_cardinality and exact_cardinality. Replace all four truthiness checks (covering these three attributes) with explicit `is not None` guards: - `if s.minimum_cardinality is not None:` - `if s.maximum_cardinality is not None:` - `elif s.exact_cardinality is not None:` (two occurrences) Add regression tests: - test_zero_maximum_cardinality_emits_maxcount - test_zero_exact_cardinality_emits_both_counts This is the primary mechanism for suppressing inherited slots on subclasses via slot_usage (OWL maxCardinality 0 pattern). Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/shaclgen.py | 8 +- .../input/shaclgen/cardinality.yaml | 25 +++++++ tests/linkml/test_generators/test_shaclgen.py | 75 +++++++++++++++++++ 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index f747aae572..20eb5535ea 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -174,9 +174,9 @@ def prop_pv_literal(p, v): prop_pv_literal(SH.name, s.title) prop_pv_literal(SH.description, s.description) # minCount - if s.minimum_cardinality: + if s.minimum_cardinality is not None: prop_pv_literal(SH.minCount, s.minimum_cardinality) - elif s.exact_cardinality: + elif s.exact_cardinality is not None: prop_pv_literal(SH.minCount, s.exact_cardinality) # Identifiers map to the node's IRI rather than a property triple, # so there's no arc to constrain with sh:minCount 1 — emitting it @@ -184,9 +184,9 @@ def prop_pv_literal(p, v): elif s.required and not s.identifier: prop_pv_literal(SH.minCount, 1) # maxCount - if s.maximum_cardinality: + if s.maximum_cardinality is not None: prop_pv_literal(SH.maxCount, s.maximum_cardinality) - elif s.exact_cardinality: + elif s.exact_cardinality is not None: 
prop_pv_literal(SH.maxCount, s.exact_cardinality) elif not s.multivalued: prop_pv_literal(SH.maxCount, 1) diff --git a/tests/linkml/test_generators/input/shaclgen/cardinality.yaml b/tests/linkml/test_generators/input/shaclgen/cardinality.yaml index 6bacffa680..86f88c4f60 100644 --- a/tests/linkml/test_generators/input/shaclgen/cardinality.yaml +++ b/tests/linkml/test_generators/input/shaclgen/cardinality.yaml @@ -17,6 +17,23 @@ classes: slots: - list_exact_size + ParentClass: + slots: + - inherited_slot + - restricted_slot + + ChildWithZeroMaxCard: + is_a: ParentClass + slot_usage: + restricted_slot: + maximum_cardinality: 0 + + ChildWithZeroExactCard: + is_a: ParentClass + slot_usage: + restricted_slot: + exact_cardinality: 0 + slots: list_min_max_size: range: integer @@ -28,3 +45,11 @@ slots: range: integer multivalued: true exact_cardinality: 3 + + inherited_slot: + range: string + multivalued: true + + restricted_slot: + range: string + multivalued: true diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 84bac6b4ec..441b569667 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -577,6 +577,81 @@ def test_multivalued_slot_exact_cardinality(input_path): ) in g +def test_zero_maximum_cardinality_emits_maxcount(input_path): + """Test that maximum_cardinality: 0 correctly emits sh:maxCount 0. + + Regression test for the bug where Python truthiness check + `if s.maximum_cardinality:` would skip the value 0 (falsy), + failing to emit sh:maxCount 0 in the generated SHACL shape. + The fix uses `if s.maximum_cardinality is not None:` instead. + + This is the primary mechanism for suppressing inherited slots on + subclasses via slot_usage (e.g., OWL maxCardinality 0 pattern). 
+ """ + shacl = ShaclGenerator(input_path("shaclgen/cardinality.yaml"), mergeimports=True).serialize() + + g = rdflib.Graph() + g.parse(data=shacl) + + # Find the ChildWithZeroMaxCard shape + child_uri = URIRef("https://w3id.org/linkml/examples/cardinality/ChildWithZeroMaxCard") + restricted_slot_uri = URIRef("https://w3id.org/linkml/examples/cardinality/restricted_slot") + + # Get all property shapes for the child class + prop_nodes = list(g.objects(child_uri, SH.property)) + assert prop_nodes, "ChildWithZeroMaxCard should have property shapes" + + # Find the property shape for restricted_slot + restricted_prop_node = None + for pn in prop_nodes: + if (pn, SH.path, restricted_slot_uri) in g: + restricted_prop_node = pn + break + assert restricted_prop_node is not None, "Should have a property shape for restricted_slot" + + # The critical assertion: sh:maxCount 0 must be emitted + max_count_values = list(g.objects(restricted_prop_node, SH.maxCount)) + assert len(max_count_values) == 1, f"Expected exactly one sh:maxCount, got {max_count_values}" + assert max_count_values[0] == rdflib.term.Literal( + 0, datatype=rdflib.term.URIRef("http://www.w3.org/2001/XMLSchema#integer") + ), f"sh:maxCount should be 0, got {max_count_values[0]}" + + +def test_zero_exact_cardinality_emits_both_counts(input_path): + """Test that exact_cardinality: 0 emits both sh:minCount 0 and sh:maxCount 0. + + Same truthiness bug as maximum_cardinality: `if s.exact_cardinality:` + skips value 0 (falsy). The fix uses `is not None` instead. 
+ """ + shacl = ShaclGenerator(input_path("shaclgen/cardinality.yaml"), mergeimports=True).serialize() + + g = rdflib.Graph() + g.parse(data=shacl) + + child_uri = URIRef("https://w3id.org/linkml/examples/cardinality/ChildWithZeroExactCard") + restricted_slot_uri = URIRef("https://w3id.org/linkml/examples/cardinality/restricted_slot") + + prop_nodes = list(g.objects(child_uri, SH.property)) + assert prop_nodes, "ChildWithZeroExactCard should have property shapes" + + restricted_prop_node = None + for pn in prop_nodes: + if (pn, SH.path, restricted_slot_uri) in g: + restricted_prop_node = pn + break + assert restricted_prop_node is not None, "Should have a property shape for restricted_slot" + + XSD_INT = rdflib.term.URIRef("http://www.w3.org/2001/XMLSchema#integer") + + min_count_values = list(g.objects(restricted_prop_node, SH.minCount)) + assert len(min_count_values) == 1, f"Expected exactly one sh:minCount, got {min_count_values}" + assert min_count_values[0] == rdflib.term.Literal(0, datatype=XSD_INT) + + max_count_values = list(g.objects(restricted_prop_node, SH.maxCount)) + assert len(max_count_values) == 1, f"Expected exactly one sh:maxCount, got {max_count_values}" + assert max_count_values[0] == rdflib.term.Literal(0, datatype=XSD_INT) + + def test_exclude_imports(input_path): shacl = ShaclGenerator( input_path("shaclgen/exclude_imports.yaml"), mergeimports=True, exclude_imports=True From e665564ae6961ca91ea2fb48fa78662c25d7b5df Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Thu, 7 May 2026 13:58:58 +0200 Subject: [PATCH 04/12] fix(shaclgen): emit sh:pattern for pattern constraints inside any_of MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SHACL generator translated any_of branches by dispatching solely on `any.range` (class, type, enum, or simple datatype). 
If a branch specified `pattern:` — either alone or combined with a range — the constraint was silently dropped, producing an empty blank node `[ ]` (trivially satisfied) instead of the intended `[ sh:pattern "..." ]`. This is a problem for schemas that use pattern alternatives in `any_of`, such as the SPDX license field where valid values are either members of a fixed enum (SPDX identifiers), IRIs, or custom identifiers matching the LicenseRef- pattern defined in SPDX Specification v2.3 Annex D (ABNF: license-ref = ["DocumentRef-"(idstring)":"]"LicenseRef-"(idstring)). The fix adds a single check after the range dispatch: if any.pattern: g.add((range_list[-1], SH.pattern, Literal(any.pattern))) This correctly handles: - Pattern-only branches (no range): node gets only sh:pattern - Range + pattern branches: node gets both sh:datatype and sh:pattern - Range-only branches (no pattern): unchanged behaviour The test suite now includes a dedicated schema exercising all three cases, with assertions on both the generated RDF triples and pyshacl validation of conforming/non-conforming data. Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/shaclgen.py | 5 ++ .../input/shaclgen/any_of_pattern.yaml | 59 +++++++++++++++++ tests/linkml/test_generators/test_shaclgen.py | 65 +++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 20eb5535ea..30031dae7c 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -242,6 +242,11 @@ def st_node_pv(p, v): add_simple_data_type(st_node_pv, r) range_list.append(st_node) + # Propagate pattern constraint to the branch node. + # A branch may combine range + pattern (e.g. range: string + # with pattern: "^...") or specify pattern alone (no range). 
+ if any.pattern: + g.add((range_list[-1], SH.pattern, Literal(any.pattern))) Collection(g, or_node, range_list) else: prop_pv_literal(SH.hasValue, s.equals_number) diff --git a/tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml b/tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml new file mode 100644 index 0000000000..5b247bb2a1 --- /dev/null +++ b/tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml @@ -0,0 +1,59 @@ +id: https://w3id.org/linkml/examples/any_of_pattern +name: test_any_of_pattern +description: >- + Test schema for pattern constraints inside any_of branches. + Exercises three cases: (1) pattern-only branch (no range), + (2) range + pattern on the same branch, (3) mixed branches + where some have pattern and some do not. +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://w3id.org/linkml/examples/any_of_pattern/ +imports: + - linkml:types +default_range: string +default_prefix: ex + +enums: + LicenseEnum: + permissible_values: + MIT: + Apache-2.0: + GPL-3.0-only: + +classes: + PatternOnlyBranch: + description: >- + A class where one any_of branch specifies only a pattern + (no range). The generated SHACL sh:or should contain a + node with sh:pattern but no sh:datatype or sh:class. + attributes: + license: + any_of: + - range: LicenseEnum + - range: uri + - pattern: "^LicenseRef-[a-zA-Z0-9\\-\\.]+$" + + RangeWithPattern: + description: >- + A class where an any_of branch combines range + pattern. + The generated SHACL sh:or node should have both sh:datatype + and sh:pattern. + attributes: + identifier: + any_of: + - range: string + pattern: "^[A-Z]{2}-[0-9]{4}$" + - range: integer + + MixedBranches: + description: >- + A class with three any_of branches: one with range only, + one with pattern only, one with range + pattern. Ensures + pattern is emitted only on branches that declare it. 
+ attributes: + code: + any_of: + - range: integer + - pattern: "^CUSTOM-.*$" + - range: string + pattern: "^STD-[0-9]+$" diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 441b569667..41f3f723ca 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -1244,3 +1244,68 @@ def test_nodeidentifier_range_produces_blank_node_or_iri(): uri_ref = props["https://example.org/uriRef"] uri_kinds = list(g.objects(uri_ref, SH.nodeKind)) assert SH.IRI in uri_kinds, f"Expected sh:IRI for uri, got {uri_kinds}" + + +def test_any_of_with_pattern(input_path): + """Test that pattern constraints inside any_of branches emit sh:pattern. + + Exercises three cases: + 1. PatternOnlyBranch: any_of with a pattern-only branch (no range) + 2. RangeWithPattern: any_of with range + pattern on the same branch + 3. MixedBranches: combination of range-only, pattern-only, and range+pattern + """ + shacl = ShaclGenerator(input_path("shaclgen/any_of_pattern.yaml"), mergeimports=True).serialize() + g = rdflib.Graph() + g.parse(data=shacl) + + def get_or_branch_nodes(class_uri: str, slot_local: str) -> list[rdflib.BNode]: + """Return the list of BNodes inside sh:or for a given class property.""" + class_ref = URIRef(class_uri) + for prop_node in g.objects(class_ref, SH.property): + paths = list(g.objects(prop_node, SH.path)) + if any(slot_local in str(p) for p in paths): + for or_head in g.objects(prop_node, SH["or"]): + return list(Collection(g, or_head)) + return [] + + prefix = "https://w3id.org/linkml/examples/any_of_pattern/" + + # Case 1: PatternOnlyBranch — license slot has 3 branches: + # [enum sh:in], [sh:nodeKind sh:IRI], [sh:pattern "^LicenseRef-..."] + branches = get_or_branch_nodes(f"{prefix}PatternOnlyBranch", "license") + assert len(branches) == 3, f"Expected 3 branches, got {len(branches)}" + # Find the branch with sh:pattern + pattern_branches = [b for b in branches if 
list(g.objects(b, SH.pattern))] + assert len(pattern_branches) == 1, f"Expected 1 pattern branch, got {len(pattern_branches)}" + pattern_val = str(list(g.objects(pattern_branches[0], SH.pattern))[0]) + assert pattern_val == "^LicenseRef-[a-zA-Z0-9\\-\\.]+$" + # The pattern-only branch should NOT have sh:datatype or sh:class + assert list(g.objects(pattern_branches[0], SH.datatype)) == [] + assert list(g.objects(pattern_branches[0], SH["class"])) == [] + + # Case 2: RangeWithPattern — identifier slot has 2 branches: + # [sh:datatype xsd:string + sh:pattern "^[A-Z]{2}-[0-9]{4}$"], [sh:datatype xsd:integer] + branches = get_or_branch_nodes(f"{prefix}RangeWithPattern", "identifier") + assert len(branches) == 2, f"Expected 2 branches, got {len(branches)}" + # Find branch with both datatype and pattern + combo_branches = [b for b in branches if list(g.objects(b, SH.datatype)) and list(g.objects(b, SH.pattern))] + assert len(combo_branches) == 1, f"Expected 1 combo branch, got {len(combo_branches)}" + assert str(list(g.objects(combo_branches[0], SH.pattern))[0]) == "^[A-Z]{2}-[0-9]{4}$" + # The other branch (integer) should NOT have sh:pattern + int_branches = [b for b in branches if b not in combo_branches] + assert list(g.objects(int_branches[0], SH.pattern)) == [] + + # Case 3: MixedBranches — code slot has 3 branches: + # [sh:datatype xsd:integer], [sh:pattern "^CUSTOM-.*$"], [sh:datatype xsd:string + sh:pattern "^STD-[0-9]+$"] + branches = get_or_branch_nodes(f"{prefix}MixedBranches", "code") + assert len(branches) == 3, f"Expected 3 branches, got {len(branches)}" + # Exactly 2 branches should have sh:pattern + pattern_branches = [b for b in branches if list(g.objects(b, SH.pattern))] + assert len(pattern_branches) == 2, f"Expected 2 pattern branches, got {len(pattern_branches)}" + # Collect the patterns + patterns = sorted(str(list(g.objects(b, SH.pattern))[0]) for b in pattern_branches) + assert patterns == ["^CUSTOM-.*$", "^STD-[0-9]+$"] + # The integer-only 
branch should have no pattern + no_pattern = [b for b in branches if not list(g.objects(b, SH.pattern))] + assert len(no_pattern) == 1 + assert list(g.objects(no_pattern[0], SH.datatype)) == [URIRef("http://www.w3.org/2001/XMLSchema#integer")] From f302682963684d66e716403562dc1f693a78263a Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Sat, 25 Apr 2026 18:12:28 +0200 Subject: [PATCH 05/12] feat(generators): add --default-language flag for language-tagged literals Add a `--default-language` CLI option to both gen-owl and gen-shacl that emits BCP 47 language-tagged string literals for human-readable annotations. gen-owl changes: - New `default_language` field on OwlSchemaGenerator - `_LANGUAGE_TAGGABLE_RANGES` frozenset (string, ncname) guards tagging - `_resolve_language()` checks element-level in_language first, then default - `_literal()` helper creates properly tagged Literal objects - `add_metadata()` tags string-range and fallback-range literals - `add_enum()` PV labels respect language tags - New `--default-language` Click option gen-shacl changes: - New `default_language` field on ShaclGenerator - NodeShape rdfs:label / rdfs:comment get language tags - PropertyShape sh:name / sh:description get language tags via prop_pv_text() - Numeric literals (sh:order, sh:minCount, etc.) 
are never tagged - New `--default-language` Click option Tests: - 3 new OWL tests: tagged labels, backward-compat plain literals, URI ranges - 4 new SHACL tests: NodeShape, PropertyShape, plain literals, numeric guard Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/owlgen.py | 96 ++++++- .../linkml/src/linkml/generators/shaclgen.py | 83 +++++- tests/linkml/test_generators/test_owlgen.py | 268 ++++++++++++++++++ tests/linkml/test_generators/test_shaclgen.py | 220 ++++++++++++++ 4 files changed, 658 insertions(+), 9 deletions(-) diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index da70ea6957..51c2c941a4 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -2,12 +2,13 @@ import logging import os +import re from collections import defaultdict from collections.abc import Iterable, Sequence from copy import copy from dataclasses import dataclass, field from enum import Enum, unique -from typing import Any, TypeAlias, TypeVar +from typing import Any, ClassVar, TypeAlias, TypeVar import click import rdflib @@ -239,6 +240,73 @@ def _present(values: Iterable[_T | None]) -> list[_T]: - have no ``rdfs:range`` restriction (any IRI is valid) """ + default_language: str | None = None + """Default BCP 47 language tag for human-readable string literals. + + When set, ``rdfs:label``, ``rdfs:comment``, ``skos:definition``, + ``dcterms:title``, and other annotation literals are emitted with the + specified language tag (e.g. ``"Person"@en``). An element-level + ``in_language`` value overrides this default for that element. + + Technical literals (URIs, numeric constraints, XSD facets) are never + language-tagged. Conforms to :rfc:`5646` (BCP 47). + """ + + # Metaslot ranges that represent human-readable text (eligible for language tags). + # Everything else (uri, uriorcurie, datetime, boolean, integer, classes, …) is technical. 
+ _LANGUAGE_TAGGABLE_RANGES: ClassVar[frozenset[str]] = frozenset({"string", "ncname"}) + + # Syntactic validator for BCP 47 language tags (RFC 5646 §2.1 ABNF). + # Each group maps 1:1 to an ABNF production: language, script, region, + # variant, extension, privateuse, and grandfathered (irregular + regular). + _BCP47_RE: ClassVar[re.Pattern[str]] = re.compile( + r"^(?:" + r"(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3})|[A-Za-z]{4}|[A-Za-z]{5,8})" + r"(?:-[A-Za-z]{4})?" + r"(?:-(?:[A-Za-z]{2}|\d{3}))?" + r"(?:-(?:[A-Za-z\d]{5,8}|\d[A-Za-z\d]{3}))*" + r"(?:-[0-9A-WY-Za-wy-z](?:-[A-Za-z\d]{2,8})+)*" + r"(?:-x(?:-[A-Za-z\d]{1,8})+)?" + r"|x(?:-[A-Za-z\d]{1,8})+" + r"|en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon" + r"|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu" + r"|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE" + r"|art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu" + r"|zh-hakka|zh-min|zh-min-nan|zh-xiang" + r")$", + re.ASCII, + ) + + def _resolve_language(self, element: "Definition | PermissibleValue | None" = None) -> str | None: + """Return the BCP 47 language tag for *element*, or ``None``. + + Resolution order: + 1. ``element.in_language`` (element-level override) + 2. ``self.default_language`` (generator-level default) + + Empty or whitespace-only strings are normalised to ``None``. + Tags that do not conform to RFC 5646 §2.1 syntax produce a warning. 
+ """ + if element is not None: + element_lang = getattr(element, "in_language", None) + if element_lang and element_lang.strip(): + tag = element_lang.strip() + if not self._BCP47_RE.match(tag): + logger.warning("in_language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) + return tag + tag = (self.default_language or "").strip() or None + if tag is not None and not self._BCP47_RE.match(tag): + logger.warning("--default-language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) + return tag + + def _literal(self, value: str, element: "Definition | PermissibleValue | None" = None) -> Literal: + """Create a language-tagged ``Literal`` for a human-readable string. + + If no language tag is resolved, falls back to a plain literal. + """ + lang = self._resolve_language(element) + return Literal(value, lang=lang) if lang else Literal(value) + def as_graph(self) -> Graph: """ Generate an rdflib Graph from the LinkML schema. @@ -315,6 +383,8 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: Add annotation properties. Set the profile attribute to the appropriate OWL profile. + Human-readable string literals are language-tagged when + ``default_language`` is set or the element has ``in_language``. 
:param e: schema element :param uri: URI representation of schema element @@ -324,6 +394,7 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: msv = self.metamodel_schemaview this_sv = self.schemaview sn_mappings = msv.slot_name_mappings() + lang = self._resolve_language(e) # iterate through all the assigned metamodel slots for metaslot_name, metaslot_value in vars(e).items(): @@ -348,6 +419,8 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: obj = URIRef(v) elif metaslot_range == "uriorcurie": obj = URIRef(this_sv.expand_curie(v)) + elif metaslot_range in self._LANGUAGE_TAGGABLE_RANGES and lang: + obj = Literal(v, lang=lang) else: obj = Literal(v) elif metaslot_range in msv.all_subsets(): @@ -359,7 +432,7 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: # else: # logger.debug(f"Skipping {uri} {metaslot_uri} => {v}") else: - obj = Literal(v) + obj = Literal(v, lang=lang) if lang else Literal(v) self.graph.add((uri, metaslot_uri, obj)) for k, v in e.annotations.items(): @@ -376,7 +449,11 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: if k_uri == k: k_uri = None if k_uri: - self.graph.add((uri, URIRef(k_uri), Literal(v.value))) + if isinstance(v.value, str): + obj = self._literal(v.value, e) + else: + obj = Literal(v.value) + self.graph.add((uri, URIRef(k_uri), obj)) def add_class(self, cls: ClassDefinition) -> None: """ @@ -1107,7 +1184,7 @@ def add_enum(self, e: EnumDefinition) -> None: if not isinstance(pv_node, Literal): self.add_metadata(pv, pv_node) g.add((pv_node, RDF.type, pv_owl_type)) - g.add((pv_node, RDFS.label, Literal(pv.text))) + g.add((pv_node, RDFS.label, self._literal(pv.text, pv))) # TODO: make this configurable # self._add_element_properties(pv_uri, pv) if self.metaclasses: @@ -1698,6 +1775,17 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: "the JSON-LD context generator (--xsd-anyuri-as-iri → @type: @id)." 
+ ), ) +@click.option( + "--default-language", + default=None, + show_default=True, + help=( + "Default BCP 47 language tag for human-readable string literals " + "(e.g. en, de, zh-Hans). When set, rdfs:label, rdfs:comment, " + "skos:definition and other text annotations are emitted with the " + "specified language tag. Element-level in_language overrides this." + ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile: str, metadata_profile: str, **kwargs: Any) -> None: """Generate an OWL representation of a LinkML model diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 30031dae7c..d5dbe1dd61 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -1,5 +1,6 @@ import logging import os +import re from collections.abc import Callable from dataclasses import dataclass @@ -75,6 +76,15 @@ class ShaclGenerator(Generator): """ expand_subproperty_of: bool = True """If True, expand subproperty_of to sh:in constraints with slot descendants""" + + default_language: str | None = None + """Default BCP 47 language tag for human-readable string literals. + + When set, ``sh:name``, ``sh:description``, ``rdfs:label``, and + ``rdfs:comment`` literals are emitted with the specified language tag. + Conforms to :rfc:`5646` (BCP 47). + """ + generatorname = os.path.basename(__file__) generatorversion = "0.0.1" valid_formats = ["ttl"] @@ -82,6 +92,49 @@ class ShaclGenerator(Generator): visit_all_class_slots = False uses_schemaloader = False + # Syntactic validator for BCP 47 language tags (RFC 5646 §2.1 ABNF). + # Each group maps 1:1 to an ABNF production: language, script, region, + # variant, extension, privateuse, and grandfathered (irregular + regular). + _BCP47_RE = re.compile( + r"^(?:" + r"(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3})|[A-Za-z]{4}|[A-Za-z]{5,8})" + r"(?:-[A-Za-z]{4})?"
+ r"(?:-(?:[A-Za-z]{2}|\d{3}))?" + r"(?:-(?:[A-Za-z\d]{5,8}|\d[A-Za-z\d]{3}))*" + r"(?:-[0-9A-WY-Za-wy-z](?:-[A-Za-z\d]{2,8})+)*" + r"(?:-x(?:-[A-Za-z\d]{1,8})+)?" + r"|x(?:-[A-Za-z\d]{1,8})+" + r"|en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon" + r"|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu" + r"|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE" + r"|art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu" + r"|zh-hakka|zh-min|zh-min-nan|zh-xiang" + r")$", + re.ASCII, + ) + + def _resolve_language(self, element=None) -> str | None: + """Return the BCP 47 language tag for *element*, or ``None``. + + Resolution order: + 1. ``element.in_language`` (element-level override) + 2. ``self.default_language`` (generator-level default) + + Empty or whitespace-only strings are normalised to ``None``. + Tags that do not conform to RFC 5646 §2.1 syntax produce a warning. + """ + if element is not None: + element_lang = getattr(element, "in_language", None) + if element_lang and element_lang.strip(): + tag = element_lang.strip() + if not self._BCP47_RE.match(tag): + logger.warning("in_language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) + return tag + tag = (self.default_language or "").strip() or None + if tag is not None and not self._BCP47_RE.match(tag): + logger.warning("--default-language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) + return tag + def __post_init__(self) -> None: super().__post_init__() self.generate_header() @@ -137,13 +190,13 @@ def shape_pv(p, v): if c.title is not None: # Use rdfs:label for NodeShape titles per SHACL spec. # sh:name has rdfs:domain of sh:PropertyShape. See issue #3059. - shape_pv(RDFS.label, Literal(c.title)) + shape_pv(RDFS.label, Literal(c.title, lang=self._resolve_language(c))) if c.description is not None: # Use rdfs:comment for NodeShape descriptions per SHACL spec. 
# sh:description has rdfs:domain of sh:PropertyShape, so using it # on NodeShapes causes RDFS-aware validators to incorrectly infer # the NodeShape is also a PropertyShape. See issue #3059. - shape_pv(RDFS.comment, Literal(c.description)) + shape_pv(RDFS.comment, Literal(c.description, lang=self._resolve_language(c))) shape_pv(SH.ignoredProperties, self._build_ignored_properties(g, c)) @@ -168,11 +221,15 @@ def prop_pv_literal(p, v): if v is not None: g.add((pnode, p, Literal(v))) + def prop_pv_text(p, v): + if v is not None: + g.add((pnode, p, Literal(v, lang=self._resolve_language(s)))) + prop_pv(SH.path, slot_uri) prop_pv_literal(SH.order, order) order += 1 - prop_pv_literal(SH.name, s.title) - prop_pv_literal(SH.description, s.description) + prop_pv_text(SH.name, s.title) + prop_pv_text(SH.description, s.description) # minCount if s.minimum_cardinality is not None: prop_pv_literal(SH.minCount, s.minimum_cardinality) @@ -439,9 +496,14 @@ def _add_annotations(self, func: Callable, item) -> None: else: N_predicate = Literal(a["tag"], datatype=XSD.string) # If the value is a string and ':' is in the value, treat it as a CURIE, - # otherwise treat as Literal with derived XSD datatype + # otherwise treat as Literal with derived XSD datatype. + # String annotations are language-tagged when default_language is set; + # non-string types (bool, int, float) keep their XSD datatype. + lang = self._resolve_language(item) if type(a["value"]) is extended_str and ":" in a["value"]: N_object = URIRef(sv.expand_curie(a["value"])) + elif isinstance(a["value"], str) and lang: + N_object = Literal(a["value"], lang=lang) else: N_object = Literal(a["value"], datatype=self._getXSDtype(a["value"])) @@ -536,6 +598,17 @@ def add_simple_data_type(func: Callable, r: ElementName) -> None: help="If --expand-subproperty-of (default), slots with subproperty_of will generate sh:in constraints " "containing all slot descendants. 
Use --no-expand-subproperty-of to disable this behavior.", ) +@click.option( + "--default-language", + default=None, + show_default=True, + help=( + "Default BCP 47 language tag for human-readable string literals " + "(e.g. en, de, zh-Hans). When set, sh:name, sh:description, " + "rdfs:label and rdfs:comment are emitted with the specified " + "language tag." + ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, **args): """Generate SHACL turtle from a LinkML model""" diff --git a/tests/linkml/test_generators/test_owlgen.py b/tests/linkml/test_generators/test_owlgen.py index 062d4c31ac..9d4c714f53 100644 --- a/tests/linkml/test_generators/test_owlgen.py +++ b/tests/linkml/test_generators/test_owlgen.py @@ -994,3 +994,271 @@ def test_children_are_mutually_disjoint( members_node = list(g.objects(disjoint_nodes[0], OWL.members))[0] members = set(Collection(g, members_node)) assert members == {EX[name] for name in child_names} + + +# --------------------------------------------------------------------------- +# --default-language tests +# --------------------------------------------------------------------------- + + +def _build_lang_test_schema(): + """Build a small schema with classes, slots, and an enum for language-tag testing.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "vehicle_name", + range="string", + description="The vehicle name.", + title="Name", + ) + ) + sb.add_slot( + SlotDefinition( + "color", + range="ColorEnum", + description="Paint color.", + ) + ) + sb.add_class( + "Vehicle", + slots=["vehicle_name", "color"], + description="A road vehicle.", + title="Vehicle", + ) + sb.add_enum( + "ColorEnum", + permissible_values=[ + PermissibleValue(text="Red", description="A warm color."), + PermissibleValue(text="Blue", description="A cool color."), + ], + ) + sb.add_defaults() + return sb.schema + + +def test_default_language_tags_owl_labels(): + """With --default-language en, rdfs:label and skos:definition get @en.""" 
+ schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Class label + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle", lang="en") in labels + + # Class description + defs = list(g.objects(EX.Vehicle, SKOS.definition)) + assert Literal("A road vehicle.", lang="en") in defs + + # Enum PV label — PVs are emitted as <{enum_uri}#{pv_text}> + pv_red = URIRef(str(EX.ColorEnum) + "#Red") + pv_labels = list(g.objects(pv_red, RDFS.label)) + assert Literal("Red", lang="en") in pv_labels + + # No plain (untagged) literals should be present for these predicates + for lit in labels + defs + pv_labels: + assert lit.language == "en", f"Expected @en, got lang={lit.language!r} on {lit!r}" + + +def test_no_default_language_produces_plain_literals(): + """Without --default-language, literals have no language tag (backward-compat).""" + schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no language tag, got {lit.language!r}" + + +def test_default_language_does_not_tag_uri_range_metaslots(): + """Metaslots with range 'uri' or 'uriorcurie' must produce URIRef, never tagged literals.""" + schema = _build_lang_test_schema() + # id_prefixes has range uriorcurie — set it to verify no language tag + schema.id_prefixes = ["http://example.org/"] + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="de", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Verify labels do get the tag + labels = 
list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle", lang="de") in labels + + # Verify integer/boolean metaslots (if any) don't get tags + # The schema title should be tagged (string range) + assert any(isinstance(o, Literal) and o.language == "de" for o in g.objects(None, RDFS.label)), ( + "At least one label should be @de" + ) + + +def test_default_language_in_language_override(): + """Element-level in_language overrides the generator default_language.""" + schema = _build_lang_test_schema() + schema.classes["Vehicle"].in_language = "de" + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Vehicle class should use element-level "de", not default "en" + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle", lang="de") in labels + assert Literal("Vehicle", lang="en") not in labels + + # ColorEnum should still use the default "en" (no override) + enum_labels = list(g.objects(EX.ColorEnum, RDFS.label)) + assert Literal("ColorEnum", lang="en") in enum_labels + + +def test_default_language_annotations_tagged(): + """OWL annotations with string values are language-tagged.""" + from linkml_runtime.linkml_model.meta import Annotation, Prefix + + sb = SchemaBuilder() + sb.add_class("Widget", description="A widget.") + sb.add_defaults() + sb.schema.prefixes["skos"] = Prefix( + prefix_prefix="skos", + prefix_reference="http://www.w3.org/2004/02/skos/core#", + ) + sb.schema.classes["Widget"].annotations["skos:altLabel"] = Annotation(tag="skos:altLabel", value="Gadget") + + owl = OwlSchemaGenerator( + sb.schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + alt_labels = list(g.objects(EX.Widget, SKOS.altLabel)) + assert Literal("Gadget", lang="en") in alt_labels + + +def 
test_default_language_empty_string_treated_as_none(): + """An empty string default_language is normalised to None (no tags).""" + schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_default_language_whitespace_only_treated_as_none(): + """A whitespace-only default_language is normalised to None (no tags).""" + schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language=" ", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_default_language_bcp47_warning(caplog): + """A malformed BCP 47 tag logs a warning but still produces output.""" + import logging + + schema = _build_lang_test_schema() + # "toolongtag" passes rdflib's lax regex but fails strict BCP 47 (max 8 chars for subtag). 
+ with caplog.at_level(logging.WARNING): + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="toolongtag", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Tag is still applied (warning, not error) + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + # Warning was emitted + assert any("not a well-formed BCP 47 tag" in rec.message for rec in caplog.records) + + +def test_default_language_bcp47_valid_no_warning(caplog): + """A well-formed BCP 47 tag does not log any warning.""" + import logging + + schema = _build_lang_test_schema() + with caplog.at_level(logging.WARNING): + OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + assert not any("BCP 47" in rec.message for rec in caplog.records) + + +def test_default_language_in_language_override_bcp47_warning(caplog): + """A malformed in_language value logs a warning.""" + import logging + + schema = _build_lang_test_schema() + # "toolongtag" passes rdflib but fails strict BCP 47. 
+ schema.classes["Vehicle"].in_language = "toolongtag" + with caplog.at_level(logging.WARNING): + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Vehicle uses the (malformed) in_language, not the default + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + assert any("in_language" in rec.message and "toolongtag" in rec.message for rec in caplog.records) diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 41f3f723ca..84b2fe98f0 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -7,6 +7,8 @@ from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shaclgen import ShaclGenerator +from linkml_runtime.linkml_model import SlotDefinition +from linkml_runtime.utils.schema_builder import SchemaBuilder EXPECTED = [ ( @@ -1309,3 +1311,221 @@ def get_or_branch_nodes(class_uri: str, slot_local: str) -> list[rdflib.BNode]: no_pattern = [b for b in branches if not list(g.objects(b, SH.pattern))] assert len(no_pattern) == 1 assert list(g.objects(no_pattern[0], SH.datatype)) == [URIRef("http://www.w3.org/2001/XMLSchema#integer")] + + +# --------------------------------------------------------------------------- +# --default-language tests +# --------------------------------------------------------------------------- + +EX = rdflib.Namespace("http://example.org/test-schema/") + + +def _build_shacl_lang_schema(): + """Build a schema with title/description for language-tag testing.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "vehicle_name", + range="string", + description="The vehicle name.", + title="Name", + ) + ) + sb.add_class( + "Vehicle", + slots=["vehicle_name"], + description="A road vehicle.", + 
title="Vehicle", + ) + sb.add_defaults() + return sb.schema + + +def _parse_shacl(schema, **kwargs): + shacl = ShaclGenerator(schema, mergeimports=False, **kwargs).serialize() + g = rdflib.Graph() + g.parse(data=shacl) + return g + + +def _get_prop_objects(g, shape_uri, prop_path_uri, predicate): + """Get predicate values for the property shape with the given sh:path.""" + for prop_node in g.objects(shape_uri, SH.property): + paths = list(g.objects(prop_node, SH.path)) + if paths and paths[0] == prop_path_uri: + return list(g.objects(prop_node, predicate)) + return [] + + +def test_shacl_default_language_node_shape(): + """NodeShape rdfs:label and rdfs:comment get @en with --default-language.""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language="en") + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle", lang="en") in labels + + comments = list(g.objects(vehicle_shape, RDFS.comment)) + assert Literal("A road vehicle.", lang="en") in comments + + +def test_shacl_default_language_property_shape(): + """PropertyShape sh:name and sh:description get @en with --default-language.""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language="en") + + vehicle_shape = EX.Vehicle + slot_uri = EX.vehicle_name + + sh_names = _get_prop_objects(g, vehicle_shape, slot_uri, SH["name"]) + assert Literal("Name", lang="en") in sh_names + + sh_descs = _get_prop_objects(g, vehicle_shape, slot_uri, SH.description) + assert Literal("The vehicle name.", lang="en") in sh_descs + + +def test_shacl_no_default_language_plain_literals(): + """Without --default-language, literals have no language tag (backward-compat).""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema) + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang 
tag, got {lit.language!r}" + + slot_uri = EX.vehicle_name + sh_names = _get_prop_objects(g, vehicle_shape, slot_uri, SH["name"]) + assert Literal("Name") in sh_names + for lit in sh_names: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_shacl_default_language_numeric_literals_untagged(): + """Numeric literals (sh:order, sh:minCount, etc.) must never get language tags.""" + schema = _build_shacl_lang_schema() + schema.slots["vehicle_name"].required = True + g = _parse_shacl(schema, default_language="fr") + + vehicle_shape = EX.Vehicle + slot_uri = EX.vehicle_name + + orders = _get_prop_objects(g, vehicle_shape, slot_uri, SH.order) + for lit in orders: + assert lit.language is None, f"sh:order must not be language-tagged: {lit!r}" + + min_counts = _get_prop_objects(g, vehicle_shape, slot_uri, SH.minCount) + for lit in min_counts: + assert lit.language is None, f"sh:minCount must not be language-tagged: {lit!r}" + + +def test_shacl_default_language_annotations_tagged(): + """SHACL string annotations are language-tagged with --default-language.""" + from linkml_runtime.linkml_model.meta import Annotation, Prefix + + schema = _build_shacl_lang_schema() + schema.prefixes["skos"] = Prefix( + prefix_prefix="skos", + prefix_reference="http://www.w3.org/2004/02/skos/core#", + ) + schema.classes["Vehicle"].annotations["skos:altLabel"] = Annotation(tag="skos:altLabel", value="Car") + g = _parse_shacl(schema, default_language="en", include_annotations=True) + + vehicle_shape = EX.Vehicle + SKOS = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#") + alt_labels = list(g.objects(vehicle_shape, SKOS.altLabel)) + assert Literal("Car", lang="en") in alt_labels + + +def test_shacl_default_language_empty_string_treated_as_none(): + """An empty string default_language is normalised to None (no tags).""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language="") + + vehicle_shape = EX.Vehicle + + labels = 
list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_shacl_default_language_whitespace_only_treated_as_none(): + """A whitespace-only default_language is normalised to None (no tags).""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language=" ") + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_shacl_default_language_in_language_override(): + """Element-level in_language overrides the generator default_language in SHACL.""" + schema = _build_shacl_lang_schema() + schema.classes["Vehicle"].in_language = "de" + g = _parse_shacl(schema, default_language="en") + + vehicle_shape = EX.Vehicle + + # Vehicle class should use element-level "de", not default "en" + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle", lang="de") in labels + assert Literal("Vehicle", lang="en") not in labels + + comments = list(g.objects(vehicle_shape, RDFS.comment)) + assert Literal("A road vehicle.", lang="de") in comments + assert Literal("A road vehicle.", lang="en") not in comments + + +def test_shacl_default_language_bcp47_warning(caplog): + """A malformed BCP 47 tag logs a warning but still produces output.""" + import logging + + schema = _build_shacl_lang_schema() + # "toolongtag" passes rdflib's lax regex but fails strict BCP 47. 
+ with caplog.at_level(logging.WARNING): + shacl = ShaclGenerator(schema, mergeimports=False, default_language="toolongtag").serialize() + g = rdflib.Graph() + g.parse(data=shacl) + + # Tag is still applied (warning, not error) + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + # Warning was emitted + assert any("not a well-formed BCP 47 tag" in rec.message for rec in caplog.records) + + +def test_shacl_default_language_bcp47_valid_no_warning(caplog): + """A well-formed BCP 47 tag does not log any warning.""" + import logging + + schema = _build_shacl_lang_schema() + with caplog.at_level(logging.WARNING): + ShaclGenerator(schema, mergeimports=False, default_language="en").serialize() + assert not any("BCP 47" in rec.message for rec in caplog.records) + + +def test_shacl_default_language_in_language_bcp47_warning(caplog): + """A malformed in_language value logs a warning in SHACL generator.""" + import logging + + schema = _build_shacl_lang_schema() + # "toolongtag" passes rdflib but fails strict BCP 47. + schema.classes["Vehicle"].in_language = "toolongtag" + with caplog.at_level(logging.WARNING): + shacl = ShaclGenerator(schema, mergeimports=False, default_language="en").serialize() + g = rdflib.Graph() + g.parse(data=shacl) + + # Vehicle uses the (malformed) in_language, not the default + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + assert any("in_language" in rec.message and "toolongtag" in rec.message for rec in caplog.records) From 295cd79c94c1049e16bc609a73b2875ee4185821 Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Sat, 16 May 2026 17:40:51 +0200 Subject: [PATCH 06/12] fix(generators): address PR #3449 review comments Resolve all five line-level review comments from amc-corey-cox on the --default-language feature. 
Code fixes - Drop the unconditional language-tag emission from the catch-all branch of OwlSchemaGenerator.add_metadata. This branch fires for ranges that are neither types, subsets, nor classes -- in practice enum-ranged metaslots such as pv_formula (range pv_formula_options) on a PermissibleValue / EnumDefinition, obligation_level on a SlotDefinition, or alias_predicate on a StructuredAlias. Tagging these permissible-value identifiers shifts the datatype from xsd:string to rdf:langString and breaks downstream sh:in / owl:oneOf matching. (The original review comment cited status="testing" as the example; status has range uriorcurie in the metamodel and takes the URIRef branch -- the illustrative concern was correct, the metaslot named was not.) - Extract the duplicated BCP 47 regex and resolution policy from owlgen.py and shaclgen.py into a new shared module linkml.utils.language_tags. The module exposes BCP47_RE (RFC 5646 section 2.1 ABNF), is_well_formed_bcp47() (well-formedness per section 2.2.9), and a LanguageTagResolver class. - LanguageTagResolver validates the default tag once at construction and remembers per-element in_language tags it has already warned about, collapsing "hundreds of warnings per run" to one per distinct malformed tag. The missing-ClassVar observation on shaclgen is moot with the inline regex removed entirely. - Assign self._language_resolver before super().__post_init__() in both generators so any parent-class hook can safely call _resolve_language during initialisation. Test changes - Rewrite test_default_language_does_not_tag_uri_range_metaslots with a strong negative assertion: walk every triple in the generated graph and require that any language-tagged literal sits under a predicate in a fixed allowlist (rdfs:label, rdfs:comment, skos:definition, skos:prefLabel, skos:altLabel, skos:editorialNote, skos:note, skos:example, dcterms:title, dcterms:description). Also assert bibo:status (uriorcurie range) emits a URIRef. 
- Add test_default_language_does_not_tag_enum_ranged_metaslot_in_catchall_branch: monkey-patches pv_formula's slot URI to a non-linkml: value so the catch-all else branch actually fires, then asserts the emitted permissible-value identifier carries no language tag. - Add test_default_language_bcp47_warning_is_deduplicated to both test_owlgen.py and test_shaclgen.py: stamp the same malformed tag on multiple elements and assert exactly one warning per distinct tag. Standards references - RFC 5646 section 2.1 (Syntax / ABNF) and section 2.2.9 (Classes of Conformance): https://www.rfc-editor.org/rfc/rfc5646 - RDF 1.1 Concepts section 3.3 (Literals -- language-tagged strings): https://www.w3.org/TR/rdf11-concepts/ - SHACL section 2.3.2.1 (sh:name / sh:description) -- the predicates this feature stamps with language tags. Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/owlgen.py | 65 +++----- .../linkml/src/linkml/generators/shaclgen.py | 50 ++---- .../linkml/src/linkml/utils/language_tags.py | 116 ++++++++++++++ tests/linkml/test_generators/test_owlgen.py | 150 +++++++++++++++++- tests/linkml/test_generators/test_shaclgen.py | 33 ++++ 5 files changed, 327 insertions(+), 87 deletions(-) create mode 100644 packages/linkml/src/linkml/utils/language_tags.py diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 51c2c941a4..ae1a60db5e 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -2,7 +2,6 @@ import logging import os -import re from collections import defaultdict from collections.abc import Iterable, Sequence from copy import copy @@ -23,6 +22,7 @@ from linkml.generators.common.subproperty import is_xsd_anyuri_range from linkml.utils.deprecation import deprecation_warning from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments +from linkml.utils.language_tags import LanguageTagResolver 
from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -253,51 +253,27 @@ def _present(values: Iterable[_T | None]) -> list[_T]: """ # Metaslot ranges that represent human-readable text (eligible for language tags). - # Everything else (uri, uriorcurie, datetime, boolean, integer, classes, …) is technical. + # Everything else (uri, uriorcurie, datetime, boolean, integer, classes, enums, …) + # is technical and must never be language-tagged. _LANGUAGE_TAGGABLE_RANGES: ClassVar[frozenset[str]] = frozenset({"string", "ncname"}) - # Syntactic validator for BCP 47 language tags (RFC 5646 §2.1 ABNF). - # Each group maps 1:1 to an ABNF production: language, script, region, - # variant, extension, privateuse, and grandfathered (irregular + regular). - _BCP47_RE: ClassVar[re.Pattern[str]] = re.compile( - r"^(?:" - r"(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3})|[A-Za-z]{4}|[A-Za-z]{5,8})" - r"(?:-[A-Za-z]{4})?" - r"(?:-(?:[A-Za-z]{2}|\d{3}))?" - r"(?:-(?:[A-Za-z\d]{5,8}|\d[A-Za-z\d]{3}))*" - r"(?:-[0-9A-WY-Za-wy-z](?:-[A-Za-z\d]{2,8})+)*" - r"(?:-x(?:-[A-Za-z\d]{1,8})+)?" - r"|x(?:-[A-Za-z\d]{1,8})+" - r"|en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon" - r"|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu" - r"|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE" - r"|art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu" - r"|zh-hakka|zh-min|zh-min-nan|zh-xiang" - r")$", - re.ASCII, - ) + def __post_init__(self) -> None: + # Resolver must be assigned before ``super().__post_init__()`` so that + # any hook the parent invokes during initialisation can safely call + # ``_resolve_language``. The resolver also validates the default tag + # once here; per-element tags are validated lazily, with at most one + # warning per distinct malformed tag. 
+ self._language_resolver = LanguageTagResolver(self.default_language) + super().__post_init__() def _resolve_language(self, element: "Definition | PermissibleValue | None" = None) -> str | None: """Return the BCP 47 language tag for *element*, or ``None``. - Resolution order: - 1. ``element.in_language`` (element-level override) - 2. ``self.default_language`` (generator-level default) - - Empty or whitespace-only strings are normalised to ``None``. - Tags that do not conform to RFC 5646 §2.1 syntax produce a warning. + Delegates to :class:`linkml.utils.language_tags.LanguageTagResolver`. + Resolution order is element-level ``in_language`` first, then the + generator-level default. """ - if element is not None: - element_lang = getattr(element, "in_language", None) - if element_lang and element_lang.strip(): - tag = element_lang.strip() - if not self._BCP47_RE.match(tag): - logger.warning("in_language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) - return tag - tag = (self.default_language or "").strip() or None - if tag is not None and not self._BCP47_RE.match(tag): - logger.warning("--default-language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) - return tag + return self._language_resolver.resolve(element) def _literal(self, value: str, element: "Definition | PermissibleValue | None" = None) -> Literal: """Create a language-tagged ``Literal`` for a human-readable string. @@ -432,7 +408,16 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: # else: # logger.debug(f"Skipping {uri} {metaslot_uri} => {v}") else: - obj = Literal(v, lang=lang) if lang else Literal(v) + # Catch-all for ranges that are not types, subsets, or + # classes -- in practice these are enum-ranged metaslots + # such as ``pv_formula`` (range ``pv_formula_options``) on + # a PermissibleValue or ``obligation_level`` (range + # ``obligation_level_enum``) on a SlotDefinition. Their + # values are permissible-value identifiers, i.e. 
constraint + # data, not labels: tagging them would shift the datatype + # from ``xsd:string`` to ``rdf:langString`` and break + # downstream string equality / SHACL ``sh:in`` matching. + obj = Literal(v) self.graph.add((uri, metaslot_uri, obj)) for k, v in e.annotations.items(): diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index d5dbe1dd61..cb36ecedc7 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -1,6 +1,5 @@ import logging import os -import re from collections.abc import Callable from dataclasses import dataclass @@ -15,6 +14,7 @@ from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments +from linkml.utils.language_tags import LanguageTagResolver from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName from linkml_runtime.utils.formatutils import underscore from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph @@ -92,50 +92,22 @@ class ShaclGenerator(Generator): visit_all_class_slots = False uses_schemaloader = False - # Syntactic validator for BCP 47 language tags (RFC 5646 §2.1 ABNF). - # Each group maps 1:1 to an ABNF production: language, script, region, - # variant, extension, privateuse, and grandfathered (irregular + regular). - _BCP47_RE: re.Pattern[str] = re.compile( - r"^(?:" - r"(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3})|[A-Za-z]{4}|[A-Za-z]{5,8})" - r"(?:-[A-Za-z]{4})?" - r"(?:-(?:[A-Za-z]{2}|\d{3}))?" - r"(?:-(?:[A-Za-z\d]{5,8}|\d[A-Za-z\d]{3}))*" - r"(?:-[0-9A-WY-Za-wy-z](?:-[A-Za-z\d]{2,8})+)*" - r"(?:-x(?:-[A-Za-z\d]{1,8})+)?" 
- r"|x(?:-[A-Za-z\d]{1,8})+" - r"|en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon" - r"|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu" - r"|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE" - r"|art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu" - r"|zh-hakka|zh-min|zh-min-nan|zh-xiang" - r")$", - re.ASCII, - ) - def _resolve_language(self, element=None) -> str | None: """Return the BCP 47 language tag for *element*, or ``None``. - Resolution order: - 1. ``element.in_language`` (element-level override) - 2. ``self.default_language`` (generator-level default) - - Empty or whitespace-only strings are normalised to ``None``. - Tags that do not conform to RFC 5646 §2.1 syntax produce a warning. + Delegates to :class:`linkml.utils.language_tags.LanguageTagResolver`. + Resolution order is element-level ``in_language`` first, then the + generator-level default. """ - if element is not None: - element_lang = getattr(element, "in_language", None) - if element_lang and element_lang.strip(): - tag = element_lang.strip() - if not self._BCP47_RE.match(tag): - logger.warning("in_language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) - return tag - tag = (self.default_language or "").strip() or None - if tag is not None and not self._BCP47_RE.match(tag): - logger.warning("--default-language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.1)", tag) - return tag + return self._language_resolver.resolve(element) def __post_init__(self) -> None: + # Resolver must be assigned before ``super().__post_init__()`` so that + # any hook the parent invokes during initialisation can safely call + # ``_resolve_language``. The resolver also validates the default tag + # once here; per-element tags are validated lazily, with at most one + # warning per distinct malformed tag. 
+ self._language_resolver = LanguageTagResolver(self.default_language) super().__post_init__() self.generate_header() diff --git a/packages/linkml/src/linkml/utils/language_tags.py b/packages/linkml/src/linkml/utils/language_tags.py new file mode 100644 index 0000000000..442a6eadc3 --- /dev/null +++ b/packages/linkml/src/linkml/utils/language_tags.py @@ -0,0 +1,116 @@ +"""BCP 47 language tag validation shared across generators. + +Centralises the syntactic validator and resolution policy used by +:mod:`linkml.generators.owlgen` and :mod:`linkml.generators.shaclgen` +for the ``--default-language`` feature. + +The validator implements *well-formedness* in the sense of +RFC 5646 §2.2.9 (Classes of Conformance): conformance to the ABNF +grammar in §2.1. It does **not** check IANA registry validity -- +that would require external data and is out of scope for a code +generator. RDF 1.1 §3.3 also requires only well-formedness for +``rdf:langString`` literals. + +References +---------- +- RFC 5646 -- Tags for Identifying Languages (BCP 47): https://www.rfc-editor.org/rfc/rfc5646 +- RFC 5646 §2.1 (Syntax / ABNF): https://www.rfc-editor.org/rfc/rfc5646#section-2.1 +- RFC 5646 §2.2.9 (Classes of Conformance): https://www.rfc-editor.org/rfc/rfc5646#section-2.2.9 +- RDF 1.1 Concepts §3.3 (Literals): https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal +""" + +from __future__ import annotations + +import logging +import re +from typing import Any + +logger = logging.getLogger(__name__) + +# RFC 5646 §2.1 ABNF -- full grammar (langtag | privateuse | grandfathered). 
+# Each top-level alternative maps 1:1 to an ABNF production: +# langtag = language ["-" script] ["-" region] *("-" variant) +# *("-" extension) ["-" privateuse] +# privateuse = "x" 1*("-" (1*8alphanum)) +# grandfathered = irregular | regular (closed list from §2.2.8) +BCP47_RE: re.Pattern[str] = re.compile( + r"^(?:" + # langtag + r"(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3})|[A-Za-z]{4}|[A-Za-z]{5,8})" + r"(?:-[A-Za-z]{4})?" + r"(?:-(?:[A-Za-z]{2}|\d{3}))?" + r"(?:-(?:[A-Za-z\d]{5,8}|\d[A-Za-z\d]{3}))*" + r"(?:-[0-9A-WY-Za-wy-z](?:-[A-Za-z\d]{2,8})+)*" + r"(?:-x(?:-[A-Za-z\d]{1,8})+)?" + # privateuse + r"|x(?:-[A-Za-z\d]{1,8})+" + # grandfathered (irregular) + r"|en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon" + r"|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu" + r"|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE" + # grandfathered (regular) + r"|art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu" + r"|zh-hakka|zh-min|zh-min-nan|zh-xiang" + r")$", + re.ASCII, +) + + +def is_well_formed_bcp47(tag: str) -> bool: + """Return ``True`` if *tag* is well-formed per RFC 5646 §2.2.9. + + Well-formedness is conformance to the ABNF grammar in RFC 5646 §2.1; + it does not imply IANA registry validity (RFC 5646 §2.2.9). + """ + return bool(BCP47_RE.match(tag)) + + +class LanguageTagResolver: + """Resolve and validate BCP 47 language tags for code generators. + + The resolver implements the two-level policy used by both ``gen-owl`` + and ``gen-shacl``: + + 1. ``element.in_language`` (per-element override) takes precedence + 2. fall back to the generator-level default + + Validation happens at most once per distinct malformed tag: + + - the generator-level default is validated **once** at construction; + - per-element ``in_language`` values are validated the first time + each distinct tag is observed and remembered in :attr:`_warned`. + + This avoids the original implementation's "hundreds of warnings per + run" failure mode while still surfacing every distinct problem tag. 
+ """ + + __slots__ = ("default", "_warned") + + def __init__(self, default: str | None) -> None: + self.default: str | None = (default or "").strip() or None + if self.default is not None and not is_well_formed_bcp47(self.default): + logger.warning( + "default language tag %r is not a well-formed BCP 47 tag (RFC 5646 §2.2.9)", + self.default, + ) + self._warned: set[str] = set() + + def resolve(self, element: Any = None) -> str | None: + """Return the resolved BCP 47 tag for *element*, or ``None``. + + Resolution order is per-element first, generator default second. + Empty or whitespace-only ``in_language`` values are ignored + (the default is consulted instead). + """ + if element is not None: + element_lang = getattr(element, "in_language", None) + if element_lang and element_lang.strip(): + tag = element_lang.strip() + if not is_well_formed_bcp47(tag) and tag not in self._warned: + logger.warning( + "in_language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.2.9)", + tag, + ) + self._warned.add(tag) + return tag + return self.default diff --git a/tests/linkml/test_generators/test_owlgen.py b/tests/linkml/test_generators/test_owlgen.py index 9d4c714f53..af5b904af2 100644 --- a/tests/linkml/test_generators/test_owlgen.py +++ b/tests/linkml/test_generators/test_owlgen.py @@ -1086,10 +1086,24 @@ def test_no_default_language_produces_plain_literals(): def test_default_language_does_not_tag_uri_range_metaslots(): - """Metaslots with range 'uri' or 'uriorcurie' must produce URIRef, never tagged literals.""" + """Metaslots with non-string ranges must never produce language-tagged literals. + + This is the *negative* counterpart to the positive tagging tests. 
It asserts + that language tags appear only on human-readable predicates (per RDF 1.1 + Concepts §3.3 and the ``_LANGUAGE_TAGGABLE_RANGES`` allowlist), and never on: + + - IRI-valued predicates (``owl:imports``, ``rdf:type``, ``rdfs:isDefinedBy``) + - The ``status`` metaslot (range ``uriorcurie``) + - Enum-ranged metaslots that land in ``add_metadata``'s catch-all branch + (e.g. ``pv_formula`` on permissible values). + + A regression where any of these become ``rdf:langString`` would silently + break SHACL ``sh:in`` / OWL ``owl:oneOf`` matching downstream. + """ schema = _build_lang_test_schema() - # id_prefixes has range uriorcurie — set it to verify no language tag + # uriorcurie-ranged metaslots that should remain IRIs: schema.id_prefixes = ["http://example.org/"] + schema.status = "release" owl = OwlSchemaGenerator( schema, mergeimports=False, @@ -1100,15 +1114,43 @@ def test_default_language_does_not_tag_uri_range_metaslots(): g = Graph() g.parse(data=owl, format="turtle") - # Verify labels do get the tag + # Positive sanity check: string-ranged labels still get the @de tag. labels = list(g.objects(EX.Vehicle, RDFS.label)) assert Literal("Vehicle", lang="de") in labels - # Verify integer/boolean metaslots (if any) don't get tags - # The schema title should be tagged (string range) - assert any(isinstance(o, Literal) and o.language == "de" for o in g.objects(None, RDFS.label)), ( - "At least one label should be @de" - ) + # Strong negative: language tags appear ONLY on known human-readable + # predicates. Whitelist everything we expect to be tag-bearing; anything + # else carrying a language tag is a regression. + from rdflib.namespace import DCTERMS + + # All known human-readable annotation predicates that owlgen may emit + # from string-ranged metaslots. Adding a new string metaslot to the + # linkml metamodel requires extending this allowlist (or proving the + # value is constraint data, not a label). 
+ LANG_TAG_ALLOWED_PREDICATES = { + RDFS.label, + RDFS.comment, + SKOS.definition, + SKOS.prefLabel, + SKOS.altLabel, + SKOS.editorialNote, # linkml ``notes`` metaslot + SKOS.note, + SKOS.example, + DCTERMS.title, + DCTERMS.description, + } + for s, p, o in g: + if isinstance(o, Literal) and o.language is not None: + assert p in LANG_TAG_ALLOWED_PREDICATES, ( + f"Predicate {p!r} produced a language-tagged literal {o!r}; " + "only label/description-style predicates may carry @lang." + ) + + # And specifically: every emitted ``status`` (uriorcurie range) reaches the + # graph as a URIRef, not a Literal of any kind. + BIBO_STATUS = URIRef("http://purl.org/ontology/bibo/status") + for obj in g.objects(None, BIBO_STATUS): + assert isinstance(obj, URIRef), f"uriorcurie-ranged ``status`` metaslot must emit URIRef, got {obj!r}" def test_default_language_in_language_override(): @@ -1262,3 +1304,95 @@ def test_default_language_in_language_override_bcp47_warning(caplog): labels = list(g.objects(EX.Vehicle, RDFS.label)) assert any(lit.language == "toolongtag" for lit in labels) assert any("in_language" in rec.message and "toolongtag" in rec.message for rec in caplog.records) + + +def test_default_language_does_not_tag_enum_ranged_metaslot_in_catchall_branch(monkeypatch): + """Direct regression test for PR #3449 review comment #2. + + The ``else`` branch in :meth:`OwlSchemaGenerator.add_metadata` is reached + when a metaslot's range is neither a type, subset, nor class -- in + practice, an enum-ranged metaslot. The fix removes the unconditional + ``Literal(v, lang=lang)`` emission from that branch. + + No metaslot in the *current* LinkML metamodel reaches this branch with + a non-``linkml:`` slot URI (``pv_formula``, ``obligation_level``, + ``alias_predicate`` are all either filtered by the ``linkml:`` guard + or nested inside class-ranged containers). 
To exercise the branch + directly, this test temporarily promotes ``pv_formula``'s slot URI to + a non-``linkml:`` value via ``monkeypatch``, then verifies the emitted + permissible-value identifier remains a plain ``xsd:string`` literal -- + never ``rdf:langString`` -- even with ``--default-language en`` set. + + Tagging this value would shift the datatype and silently break + downstream SHACL ``sh:in`` / OWL ``owl:oneOf`` matching (RDF 1.1 + Concepts §3.3). + """ + sb = SchemaBuilder() + sb.add_enum("ColorEnum", permissible_values=[PermissibleValue(text="Red")]) + sb.schema.enums["ColorEnum"].pv_formula = "CODE" + sb.add_defaults() + + gen = OwlSchemaGenerator( + sb.schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ) + # Promote pv_formula's slot URI so it passes the ``linkml:`` guard in + # add_metadata and actually reaches the catch-all else branch. + pv_formula_slot = gen.metamodel_schemaview.get_slot("pv_formula") + monkeypatch.setattr(pv_formula_slot, "slot_uri", "https://example.org/pv_formula") + + owl = gen.serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + pv_formula_objects = list(g.objects(None, URIRef("https://example.org/pv_formula"))) + assert pv_formula_objects, ( + "Test setup failure: pv_formula triple was not emitted -- the monkey-patch may have stopped working." + ) + for obj in pv_formula_objects: + assert isinstance(obj, Literal), f"expected Literal, got {obj!r}" + assert obj.language is None, f"catch-all else branch language-tagged an enum-ranged metaslot value: {obj!r}" + assert str(obj) == "CODE" + + +def test_default_language_bcp47_warning_is_deduplicated(caplog): + """Each distinct malformed tag warns at most once across the whole run. + + Regression test for the original implementation, which re-validated on + every call to ``_resolve_language`` and emitted one warning per element + -- potentially hundreds per run. 
The shared :class:`LanguageTagResolver` + caches the default check (one warning at construction) and remembers + already-warned per-element ``in_language`` tags. + """ + import logging + + schema = _build_lang_test_schema() + # Stamp the same malformed in_language on multiple elements. + schema.classes["Vehicle"].in_language = "toolongtag" + schema.enums["ColorEnum"].in_language = "toolongtag" + schema.slots["vehicle_name"].in_language = "toolongtag" + schema.slots["color"].in_language = "toolongtag" + + with caplog.at_level(logging.WARNING, logger="linkml.utils.language_tags"): + OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + # Also stamp a malformed default to exercise the default branch. + default_language="anothertoolongone", + ).serialize() + + in_language_warnings = [ + rec for rec in caplog.records if "in_language" in rec.message and "toolongtag" in rec.message + ] + default_warnings = [ + rec for rec in caplog.records if "default language" in rec.message and "anothertoolongone" in rec.message + ] + assert len(in_language_warnings) == 1, ( + f"expected exactly 1 in_language warning for 'toolongtag', got {len(in_language_warnings)}" + ) + assert len(default_warnings) == 1, f"expected exactly 1 default-language warning, got {len(default_warnings)}" diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 84b2fe98f0..0b456a28a1 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -1529,3 +1529,36 @@ def test_shacl_default_language_in_language_bcp47_warning(caplog): labels = list(g.objects(EX.Vehicle, RDFS.label)) assert any(lit.language == "toolongtag" for lit in labels) assert any("in_language" in rec.message and "toolongtag" in rec.message for rec in caplog.records) + + +def test_shacl_default_language_bcp47_warning_is_deduplicated(caplog): + """Each distinct malformed tag warns at most once 
across the whole SHACL run. + + Mirrors the owlgen regression test (see PR #3449 review comment): the + original implementation emitted one warning per element. The shared + :class:`linkml.utils.language_tags.LanguageTagResolver` collapses these + to one warning per distinct malformed tag. + """ + import logging + + schema = _build_shacl_lang_schema() + schema.classes["Vehicle"].in_language = "toolongtag" + schema.slots["vehicle_name"].in_language = "toolongtag" + + with caplog.at_level(logging.WARNING, logger="linkml.utils.language_tags"): + ShaclGenerator( + schema, + mergeimports=False, + default_language="anothertoolongone", + ).serialize() + + in_language_warnings = [ + rec for rec in caplog.records if "in_language" in rec.message and "toolongtag" in rec.message + ] + default_warnings = [ + rec for rec in caplog.records if "default language" in rec.message and "anothertoolongone" in rec.message + ] + assert len(in_language_warnings) == 1, ( + f"expected exactly 1 in_language warning for 'toolongtag', got {len(in_language_warnings)}" + ) + assert len(default_warnings) == 1, f"expected exactly 1 default-language warning, got {len(default_warnings)}" From 19fb5b26713ca580a5a34b9dd167a0d08010e07e Mon Sep 17 00:00:00 2001 From: jdsika Date: Sat, 25 Apr 2026 18:16:00 +0200 Subject: [PATCH 07/12] feat(gen-shacl): add --message-template for sh:message on property shapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new --message-template option that attaches sh:message literals to each property shape using a user-defined template string. 
Supported placeholders: {name} — slot name (underscore-separated) {title} — slot title (human-readable), falls back to name {description} — slot description, falls back to empty string {comments} — slot comments joined with "; ", falls back to empty string {class} — enclosing class name {path} — property IRI (compact or full) The resulting message is stripped of leading/trailing whitespace and omitted entirely when empty (avoids blank sh:message literals). When --default-language is also set, the literal is language-tagged. Example: gen-shacl --message-template "{name} ({class}): {description} [{comments}]" Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/shaclgen.py | 51 ++++ tests/linkml/test_generators/test_shaclgen.py | 229 ++++++++++++++++++ 2 files changed, 280 insertions(+) diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index cb36ecedc7..863e719578 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -85,6 +85,25 @@ class ShaclGenerator(Generator): Conforms to :rfc:`5646` (BCP 47). """ + message_template: str | None = None + """Template for ``sh:message`` on property shapes. + + When set, each property shape receives an ``sh:message`` literal built from + this template. The following placeholders are expanded: + + * ``{name}`` — the slot name (underscore-separated LinkML name) + * ``{title}`` — the slot title (human-readable), falls back to *name* + * ``{description}`` — the slot description, falls back to empty string + * ``{comments}`` — the slot comments joined with ``; ``, falls back to empty string + * ``{class}`` — the enclosing class name + * ``{path}`` — the property IRI (compact or full) + + Example: ``"Validation of {name} failed!"`` → + ``sh:message "Validation of has_speed failed!"`` + + If ``default_language`` is also set the literal is language-tagged. 
+ """ + generatorname = os.path.basename(__file__) generatorversion = "0.0.1" valid_formats = ["ttl"] @@ -109,6 +128,7 @@ def __post_init__(self) -> None: # warning per distinct malformed tag. self._language_resolver = LanguageTagResolver(self.default_language) super().__post_init__() + self.message_template = (self.message_template or "").strip() or None self.generate_header() def generate_header(self) -> str: @@ -202,6 +222,25 @@ def prop_pv_text(p, v): order += 1 prop_pv_text(SH.name, s.title) prop_pv_text(SH.description, s.description) + + # sh:message from template + if self.message_template is not None: + try: + msg_text = self.message_template.format( + name=s.name, + title=s.title or s.name, + description=s.description or "", + comments="; ".join(s.comments) if s.comments else "", + **{"class": c.name}, + path=str(slot_uri), + ).strip() + except (KeyError, IndexError, ValueError) as exc: + raise ValueError( + f"Invalid placeholder {exc} in --message-template. " + f"Allowed: {{name}}, {{title}}, {{description}}, {{comments}}, {{class}}, {{path}}" + ) from None + if msg_text: + prop_pv_text(SH.message, msg_text) # minCount if s.minimum_cardinality is not None: prop_pv_literal(SH.minCount, s.minimum_cardinality) @@ -581,6 +620,18 @@ def add_simple_data_type(func: Callable, r: ElementName) -> None: "language tag." ), ) +@click.option( + "--message-template", + default=None, + show_default=True, + help=( + "Template string for sh:message on each property shape. " + "Placeholders: {name} (slot name), {title} (slot title or name), " + "{description} (slot description), {comments} (slot comments joined with '; '), " + "{class} (class name), {path} (property IRI). 
" + 'Example: "{name} ({class}): {description} [{comments}]"' + ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, **args): """Generate SHACL turtle from a LinkML model""" diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 0b456a28a1..d91b3b3716 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -1341,6 +1341,34 @@ def _build_shacl_lang_schema(): return sb.schema +def _build_message_test_schema(): + """Build a schema for sh:message testing (includes a second slot without title).""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "vehicle_name", + range="string", + description="The vehicle name.", + title="Name", + required=True, + ) + ) + sb.add_slot( + SlotDefinition( + "speed", + range="integer", + description="Speed in km/h.", + ) + ) + sb.add_class( + "Vehicle", + slots=["vehicle_name", "speed"], + description="A road vehicle.", + ) + sb.add_defaults() + return sb.schema + + def _parse_shacl(schema, **kwargs): shacl = ShaclGenerator(schema, mergeimports=False, **kwargs).serialize() g = rdflib.Graph() @@ -1562,3 +1590,204 @@ def test_shacl_default_language_bcp47_warning_is_deduplicated(caplog): f"expected exactly 1 in_language warning for 'toolongtag', got {len(in_language_warnings)}" ) assert len(default_warnings) == 1, f"expected exactly 1 default-language warning, got {len(default_warnings)}" + + +# --------------------------------------------------------------------------- +# --message-template tests +# --------------------------------------------------------------------------- + + +def test_message_template_basic(): + """--message-template emits sh:message on every property shape.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="Validation of {name} failed!") + + vehicle_shape = EX.Vehicle + + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, 
SH.message) + assert Literal("Validation of vehicle_name failed!") in msgs + + msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert Literal("Validation of speed failed!") in msgs + + +def test_message_template_title_placeholder(): + """{title} expands to slot title, falling back to slot name.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="{title} is invalid") + + vehicle_shape = EX.Vehicle + + # vehicle_name has title="Name" + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Name is invalid") in msgs + + # speed has no title → falls back to slot name + msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert Literal("speed is invalid") in msgs + + +def test_message_template_class_placeholder(): + """{class} expands to the enclosing class name.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="{class}.{name} constraint violated") + + vehicle_shape = EX.Vehicle + + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Vehicle.vehicle_name constraint violated") in msgs + + +def test_message_template_description_placeholder(): + """{description} expands to the slot description, empty string when absent.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="{name} ({class}): {description}") + + vehicle_shape = EX.Vehicle + + # vehicle_name has description="The vehicle name." + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("vehicle_name (Vehicle): The vehicle name.") in msgs + + # speed has description="Speed in km/h." 
+ msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert Literal("speed (Vehicle): Speed in km/h.") in msgs + + +def test_message_template_description_fallback_empty(): + """{description} falls back to empty string when slot has no description.""" + sb = SchemaBuilder() + sb.add_slot(SlotDefinition("bare_slot", range="string")) + sb.add_class("Thing", slots=["bare_slot"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{name}: {description}") + + msgs = _get_prop_objects(g, EX.Thing, EX.bare_slot, SH.message) + assert Literal("bare_slot:") in msgs + + +def test_message_template_comments_placeholder(): + """{comments} expands to slot comments joined with '; '.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "wind_speed", + range="float", + description="Wind speed in metres per second.", + comments=["ISO 34503:2023, Section 10.2.3"], + ) + ) + sb.add_class("Weather", slots=["wind_speed"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{name} ({class}): {description} [{comments}]") + + msgs = _get_prop_objects(g, EX.Weather, EX.wind_speed, SH.message) + assert Literal("wind_speed (Weather): Wind speed in metres per second. 
[ISO 34503:2023, Section 10.2.3]") in msgs + + +def test_message_template_comments_multiple(): + """{comments} joins multiple comments with '; '.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "temperature", + range="float", + comments=["ISO 34503:2023, Section 10.2", "Unit: Celsius"], + ) + ) + sb.add_class("Weather", slots=["temperature"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{comments}") + + msgs = _get_prop_objects(g, EX.Weather, EX.temperature, SH.message) + assert Literal("ISO 34503:2023, Section 10.2; Unit: Celsius") in msgs + + +def test_message_template_comments_fallback_empty(): + """{comments} falls back to empty string when slot has no comments.""" + sb = SchemaBuilder() + sb.add_slot(SlotDefinition("bare_slot", range="string")) + sb.add_class("Thing", slots=["bare_slot"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{name}: {comments}") + + msgs = _get_prop_objects(g, EX.Thing, EX.bare_slot, SH.message) + assert Literal("bare_slot:") in msgs + + +def test_no_message_template_no_sh_message(): + """Without --message-template, no sh:message is emitted (backward-compat).""" + schema = _build_message_test_schema() + g = _parse_shacl(schema) + + vehicle_shape = EX.Vehicle + + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert msgs == [] + + msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert msgs == [] + + +def test_message_template_invalid_placeholder_raises(): + """An invalid placeholder in --message-template raises ValueError.""" + import pytest + + schema = _build_message_test_schema() + with pytest.raises(ValueError, match="Invalid placeholder"): + _parse_shacl(schema, message_template="Error: {invalid}") + + +def test_message_template_positional_placeholder_raises(): + """Positional placeholders like {0} raise ValueError.""" + import pytest + + schema = _build_message_test_schema() + with pytest.raises(ValueError, match="Invalid 
placeholder"): + _parse_shacl(schema, message_template="Error: {0}") + + +def test_message_template_format_spec_raises(): + """Format specs like {name:d} raise ValueError.""" + import pytest + + schema = _build_message_test_schema() + with pytest.raises(ValueError, match="Invalid placeholder"): + _parse_shacl(schema, message_template="Error: {name:d}") + + +def test_message_template_empty_string_treated_as_none(): + """An empty message_template is normalised to None (no sh:message).""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="") + + vehicle_shape = EX.Vehicle + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert msgs == [] + + +def test_message_template_whitespace_only_treated_as_none(): + """A whitespace-only message_template is normalised to None (no sh:message).""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template=" ") + + vehicle_shape = EX.Vehicle + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert msgs == [] + + +def test_message_template_with_default_language(): + """sh:message is language-tagged when both --message-template and --default-language are set.""" + schema = _build_message_test_schema() + g = _parse_shacl( + schema, + message_template="Validation of {name} failed!", + default_language="en", + ) + + vehicle_shape = EX.Vehicle + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Validation of vehicle_name failed!", lang="en") in msgs + + # Verify the message is NOT a plain literal + assert Literal("Validation of vehicle_name failed!") not in msgs From 88ca2e9af9c64931b418a80be4116e9c539cc825 Mon Sep 17 00:00:00 2001 From: jdsika Date: Mon, 27 Apr 2026 21:30:12 +0200 Subject: [PATCH 08/12] feat(gen-shacl): generate sh:sparql constraints from LinkML rules Implement SHACL-SPARQL constraint generation for the boolean-guard pattern commonly used in conditional validation rules. 
When a LinkML class has rules: blocks with preconditions (value_presence: PRESENT) and postconditions (equals_string: true), the generator now emits sh:SPARQLConstraint nodes on the corresponding sh:NodeShape. Features: - New _add_rules() method translates recognised rule patterns to SPARQL - Boolean-guard pattern: if value present then flag must be true - Rule description mapped to sh:message on the constraint - Deactivated rules are skipped - Warnings emitted for bidirectional/open_world rule flags - New --emit-rules/--no-emit-rules CLI flag (default: enabled) - Full URI references in SPARQL (no PREFIX declarations needed) The generated SPARQL follows W3C SHACL Section 5 and uses the pre-bound $this variable per Section 5.3.1. Constraints are validated by pyshacl with advanced=True. Refs: linkml/linkml#2464 Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/shaclgen.py | 247 +++++- .../input/shaclgen/boolean_guard_rules.yaml | 70 ++ tests/linkml/test_generators/test_shaclgen.py | 761 ++++++++++++++++++ 3 files changed, 1077 insertions(+), 1 deletion(-) create mode 100644 tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 863e719578..422051b640 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -15,7 +15,7 @@ from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml.utils.language_tags import LanguageTagResolver -from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName +from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName, PresenceEnum from linkml_runtime.utils.formatutils import underscore from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph from 
linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str @@ -104,6 +104,22 @@ class ShaclGenerator(Generator): If ``default_language`` is also set the literal is language-tagged. """ + emit_rules: bool = True + """Emit ``sh:sparql`` constraints from LinkML ``rules:`` blocks. + + When ``True`` (default), recognised rule patterns are translated into + SHACL-SPARQL constraints (``sh:SPARQLConstraint``) on the corresponding + ``sh:NodeShape``. Currently two patterns are recognised: + + * *Boolean guard* — a precondition with ``value_presence: PRESENT`` on a + value slot and a postcondition with ``equals_string: "true"`` on a + boolean flag slot. + * *Exclusive value* — a precondition with ``equals_string`` on a slot and + a postcondition with ``maximum_cardinality`` on the *same* slot. + + See `W3C SHACL §5 `_ + and `linkml/linkml#2464 `_. + """ generatorname = os.path.basename(__file__) generatorversion = "0.0.1" valid_formats = ["ttl"] @@ -361,10 +377,228 @@ def st_node_pv(p, v): if default_value: prop_pv(SH.defaultValue, default_value) + if self.emit_rules: + self._add_rules(g, class_uri_with_suffix, c) + return g LINKML_ANY_URI = "https://w3id.org/linkml/Any" + # ------------------------------------------------------------------- + # Rules → sh:sparql + # ------------------------------------------------------------------- + + def _add_rules(self, g: Graph, shape_uri: URIRef, cls: ClassDefinition) -> None: + """Emit ``sh:sparql`` constraints from LinkML ``rules:`` blocks. + + Each recognised rule is converted into an ``sh:SPARQLConstraint`` + attached to *shape_uri*. Unrecognised patterns are logged at + ``DEBUG`` level and silently skipped. + + Currently recognised patterns: + + * **Boolean guard** — a *precondition* with + ``value_presence: PRESENT`` on a value slot and a *postcondition* + with ``equals_string: "true"`` on a boolean flag slot. 
+ + * **Exclusive value** — a *precondition* with ``equals_string`` on + a slot and a *postcondition* with ``maximum_cardinality`` on the + *same* slot. Enforces that when a specific value is present in a + multivalued slot, the total number of values must not exceed the + given cardinality (typically 1 for mutual exclusion). + + See `W3C SHACL §5 `_. + """ + if not cls.rules: + return + + sv = self.schemaview + for rule in cls.rules: + if getattr(rule, "deactivated", False): + continue + + if getattr(rule, "bidirectional", False): + logger.warning( + "Rule in class %r has bidirectional=true; " + "SHACL-SPARQL generation does not yet support bidirectional rules. " + "Only the forward direction is emitted.", + cls.name, + ) + + if getattr(rule, "open_world", False): + logger.warning( + "Rule in class %r has open_world=true; " + "SHACL operates under closed-world assumption. " + "The constraint is emitted but may not match open-world semantics.", + cls.name, + ) + + sparql_query = self._rule_to_sparql(sv, cls, rule) + if sparql_query is None: + logger.debug( + "Skipping unsupported rule pattern in class %r: %s", + cls.name, + getattr(rule, "description", "(no description)"), + ) + continue + + constraint = BNode() + g.add((shape_uri, SH.sparql, constraint)) + g.add((constraint, RDF.type, SH.SPARQLConstraint)) + + message = getattr(rule, "description", None) + if message: + g.add((constraint, SH.message, Literal(message))) + + g.add((constraint, SH.select, Literal(sparql_query))) + + def _rule_to_sparql(self, sv, cls: ClassDefinition, rule) -> str | None: + """Convert a ``ClassRule`` to a SPARQL SELECT query string. + + Returns ``None`` when the rule does not match any supported pattern. 
+ """ + pre = getattr(rule, "preconditions", None) + post = getattr(rule, "postconditions", None) + if not pre or not post: + return None + + pre_slots = getattr(pre, "slot_conditions", None) or {} + post_slots = getattr(post, "slot_conditions", None) or {} + + # Pattern: boolean guard + # preconditions: exactly one slot with value_presence PRESENT + # postconditions: exactly one slot with equals_string "true" + if len(pre_slots) == 1 and len(post_slots) == 1: + pre_slot_name = next(iter(pre_slots)) + post_slot_name = next(iter(post_slots)) + + pre_cond = pre_slots[pre_slot_name] + post_cond = post_slots[post_slot_name] + + is_value_present = getattr(pre_cond, "value_presence", None) == PresenceEnum(PresenceEnum.PRESENT) + is_flag_true = getattr(post_cond, "equals_string", None) == "true" + + if is_value_present and is_flag_true: + return self._build_boolean_guard_sparql(sv, cls, post_slot_name, pre_slot_name) + + # Pattern: exclusive value + # preconditions: slot X has equals_string (a specific enum value) + # postconditions: same slot X has maximum_cardinality N + # Semantics: "If value V is present in slot X, then X has at most N values." + pre_equals = getattr(pre_cond, "equals_string", None) + post_max_card = getattr(post_cond, "maximum_cardinality", None) + + if pre_equals is not None and post_max_card is not None and pre_slot_name == post_slot_name: + return self._build_exclusive_value_sparql(sv, cls, pre_slot_name, pre_equals, int(post_max_card)) + + return None + + def _build_boolean_guard_sparql(self, sv, cls: ClassDefinition, flag_slot_name: str, value_slot_name: str) -> str: + """Build a SPARQL SELECT query for the boolean-guard pattern. + + The query detects violations where the value property is present + but the boolean flag is absent or not ``true``. + + Conforms to `SHACL §5.3.1 + `_: + ``$this`` is pre-bound to each focus node. 
+ """ + flag_uri = self._slot_uri(sv, flag_slot_name, cls) + value_uri = self._slot_uri(sv, value_slot_name, cls) + + return ( + f"SELECT $this WHERE {{\n" + f" OPTIONAL {{ $this <{flag_uri}> ?flag . }}\n" + f" OPTIONAL {{ $this <{value_uri}> ?value . }}\n" + f" FILTER (\n" + f" ( !BOUND(?flag) || ?flag != true ) &&\n" + f" BOUND(?value)\n" + f" )\n" + f"}}" + ) + + def _build_exclusive_value_sparql( + self, + sv, + cls: ClassDefinition, + slot_name: str, + value_name: str, + max_card: int, + ) -> str | None: + """Build a SPARQL SELECT query for the exclusive-value pattern. + + Detects violations where a specific value is present in a multivalued + slot but the total number of values exceeds *max_card*. + + For the common case ``max_card == 1``, the query checks whether the + exclusive value coexists with any other value (simple existence test). + For ``max_card > 1``, a subquery counts all values and checks against + the limit. + + The exclusive value is resolved to its full IRI via the slot's enum + ``meaning`` field. If the slot is not an enum or the value has no + ``meaning``, the value is compared as a plain literal. + + Conforms to `SHACL §5.3.1 + `_: + ``$this`` is pre-bound to each focus node. + """ + slot_uri = self._slot_uri(sv, slot_name, cls) + value_ref = self._resolve_enum_value_ref(sv, slot_name, value_name) + + if max_card == 1: + return ( + f"SELECT $this WHERE {{\n" + f" $this <{slot_uri}> {value_ref} .\n" + f" $this <{slot_uri}> ?other .\n" + f" FILTER (?other != {value_ref})\n" + f"}}" + ) + + return ( + f"SELECT $this WHERE {{\n" + f" $this <{slot_uri}> {value_ref} .\n" + f" {{\n" + f" SELECT $this (COUNT(?val) AS ?count)\n" + f" WHERE {{ $this <{slot_uri}> ?val . }}\n" + f" GROUP BY $this\n" + f" HAVING (?count > {max_card})\n" + f" }}\n" + f"}}" + ) + + def _resolve_enum_value_ref(self, sv, slot_name: str, value_name: str) -> str: + """Resolve an enum value name to a SPARQL term (IRI or literal). 
+ + Looks up the slot's range as an enum, finds the permissible value + matching *value_name*, and returns its ``meaning`` as a full IRI + wrapped in angle brackets. Falls back to a quoted literal if the + slot is not an enum or the value lacks a ``meaning``. + """ + slot = sv.get_slot(slot_name) + if slot: + range_name = slot.range + if range_name and range_name in sv.all_enums(): + enum = sv.get_enum(range_name) + pv = enum.permissible_values.get(value_name) + if pv and pv.meaning: + iri = sv.expand_curie(pv.meaning) + return f"<{iri}>" + return f'"{value_name}"' + + def _slot_uri(self, sv, slot_name: str, cls: ClassDefinition) -> str: + """Resolve a slot name to a full IRI string for use in SPARQL queries. + + Mirrors the resolution logic used for ``sh:path`` in the main slot loop: + prefer ``sv.get_uri()`` for slots registered in the schema map, fall + back to ``default_prefix:underscored_name``. + """ + slot = sv.get_slot(slot_name) + if slot and slot_name in sv.element_by_schema_map(): + return sv.get_uri(slot, expand=True) + pfx = sv.schema.default_prefix + return sv.expand_curie(f"{pfx}:{underscore(slot_name)}") + def _add_class(self, func: Callable, r: ElementName) -> None: """Add an sh:class constraint for range class *r*. @@ -632,6 +866,17 @@ def add_simple_data_type(func: Callable, r: ElementName) -> None: 'Example: "{name} ({class}): {description} [{comments}]"' ), ) +@click.option( + "--emit-rules/--no-emit-rules", + default=True, + show_default=True, + help=( + "Emit sh:sparql constraints from LinkML rules: blocks. " + "When enabled (default), recognised rule patterns (e.g. boolean-guard) " + "are translated into SHACL-SPARQL constraints on the corresponding " + "sh:NodeShape. Use --no-emit-rules to suppress rule generation." 
+ ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, **args): """Generate SHACL turtle from a LinkML model""" diff --git a/tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml b/tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml new file mode 100644 index 0000000000..f56c2eca6a --- /dev/null +++ b/tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml @@ -0,0 +1,70 @@ +id: https://example.org/boolean-guards +name: boolean_guard_rules +description: >- + Test schema for SHACL generation of sh:sparql constraints from LinkML rules. + Models the boolean-guard pattern where a boolean flag must be true if a + corresponding value property is present. + +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/boolean-guards/ + +imports: + - linkml:types + +default_prefix: ex +default_range: string + +slots: + WeatherWind: + description: Whether wind conditions are present. + range: boolean + slot_uri: ex:WeatherWind + weatherWindValue: + description: Wind speed value. + range: decimal + slot_uri: ex:weatherWindValue + WeatherRain: + description: Whether rain conditions are present. + range: boolean + slot_uri: ex:WeatherRain + weatherRainValue: + description: Rain intensity value. + range: decimal + slot_uri: ex:weatherRainValue + Temperature: + description: Ambient temperature. + range: decimal + slot_uri: ex:Temperature + +classes: + Environment: + description: Environmental conditions. + class_uri: ex:Environment + slots: + - WeatherWind + - weatherWindValue + - WeatherRain + - weatherRainValue + - Temperature + rules: + - description: >- + If weatherWindValue is provided, WeatherWind must be true. + preconditions: + slot_conditions: + weatherWindValue: + value_presence: PRESENT + postconditions: + slot_conditions: + WeatherWind: + equals_string: "true" + - description: >- + If weatherRainValue is provided, WeatherRain must be true. 
+ preconditions: + slot_conditions: + weatherRainValue: + value_presence: PRESENT + postconditions: + slot_conditions: + WeatherRain: + equals_string: "true" diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index d91b3b3716..11f85e142c 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -1369,6 +1369,10 @@ def _build_message_test_schema(): return sb.schema +# --------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------- + def _parse_shacl(schema, **kwargs): shacl = ShaclGenerator(schema, mergeimports=False, **kwargs).serialize() g = rdflib.Graph() @@ -1791,3 +1795,760 @@ def test_message_template_with_default_language(): # Verify the message is NOT a plain literal assert Literal("Validation of vehicle_name failed!") not in msgs + + +# --------------------------------------------------------------------------- +# --emit-rules / sh:sparql tests +# --------------------------------------------------------------------------- + +_RULES_SCHEMA_YAML = """ +id: https://example.org/boolean-guards +name: boolean_guard_rules +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/boolean-guards/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + WeatherWind: + range: boolean + slot_uri: ex:WeatherWind + weatherWindValue: + description: Wind speed value. + range: decimal + slot_uri: ex:weatherWindValue + WeatherRain: + range: boolean + slot_uri: ex:WeatherRain + weatherRainValue: + description: Rain intensity value. 
+ range: decimal + slot_uri: ex:weatherRainValue + Temperature: + range: decimal + slot_uri: ex:Temperature +classes: + Environment: + class_uri: ex:Environment + slots: + - WeatherWind + - weatherWindValue + - WeatherRain + - weatherRainValue + - Temperature + rules: + - description: If weatherWindValue is provided, WeatherWind must be true. + preconditions: + slot_conditions: + weatherWindValue: + value_presence: PRESENT + postconditions: + slot_conditions: + WeatherWind: + equals_string: "true" + - description: If weatherRainValue is provided, WeatherRain must be true. + preconditions: + slot_conditions: + weatherRainValue: + value_presence: PRESENT + postconditions: + slot_conditions: + WeatherRain: + equals_string: "true" +""" + +EX_RULES = rdflib.Namespace("https://example.org/boolean-guards/") + + +def test_rule_boolean_guard_generates_sparql(): + """Boolean-guard rules produce sh:sparql constraints on the NodeShape.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 2, f"Expected 2 sh:sparql constraints, got {len(sparql_nodes)}" + + for node in sparql_nodes: + assert (node, RDF.type, SH.SPARQLConstraint) in g + selects = list(g.objects(node, SH.select)) + assert len(selects) == 1, "Each constraint must have exactly one sh:select" + query = str(selects[0]) + assert "$this" in query, "SPARQL must use $this pre-bound variable" + assert "OPTIONAL" in query, "SPARQL must use OPTIONAL for flag/value" + assert "FILTER" in query, "SPARQL must have a FILTER clause" + assert "BOUND" in query, "SPARQL must use BOUND()" + + +def test_rule_with_description_generates_message(): + """Rule description is emitted as sh:message on the SPARQLConstraint.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + + messages = set() + for node in sparql_nodes: + for msg in g.objects(node, SH.message): + 
messages.add(str(msg)) + + assert "If weatherWindValue is provided, WeatherWind must be true." in messages + assert "If weatherRainValue is provided, WeatherRain must be true." in messages + + +def test_rule_sparql_contains_correct_uris(): + """SPARQL queries reference the correct slot URIs.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + + queries = [str(list(g.objects(n, SH.select))[0]) for n in sparql_nodes] + all_sparql = "\n".join(queries) + + assert str(EX_RULES.WeatherWind) in all_sparql + assert str(EX_RULES.weatherWindValue) in all_sparql + assert str(EX_RULES.WeatherRain) in all_sparql + assert str(EX_RULES.weatherRainValue) in all_sparql + + +_DEACTIVATED_RULE_SCHEMA_YAML = """ +id: https://example.org/deactivated-test +name: deactivated_rule_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/deactivated-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + Flag: + range: boolean + slot_uri: ex:Flag + flagValue: + range: decimal + slot_uri: ex:flagValue +classes: + TestClass: + class_uri: ex:TestClass + slots: + - Flag + - flagValue + rules: + - description: This rule is deactivated. 
+ deactivated: true + preconditions: + slot_conditions: + flagValue: + value_presence: PRESENT + postconditions: + slot_conditions: + Flag: + equals_string: "true" +""" + + +def test_rule_deactivated_skipped(): + """Deactivated rules do not produce sh:sparql constraints.""" + g = _parse_shacl(_DEACTIVATED_RULE_SCHEMA_YAML) + + shape = URIRef("https://example.org/deactivated-test/TestClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0, f"Deactivated rule should not emit sh:sparql, got {len(sparql_nodes)}" + + +_UNSUPPORTED_RULE_SCHEMA_YAML = """ +id: https://example.org/unsupported-test +name: unsupported_rule_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/unsupported-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + slotA: + range: string + slot_uri: ex:slotA + slotB: + range: string + slot_uri: ex:slotB +classes: + TestClass: + class_uri: ex:TestClass + slots: + - slotA + - slotB + rules: + - description: Rule with no postconditions. 
+ preconditions: + slot_conditions: + slotA: + value_presence: PRESENT +""" + + +def test_rule_unsupported_pattern_skipped(): + """Unrecognised rule patterns are silently skipped (no sh:sparql emitted).""" + g = _parse_shacl(_UNSUPPORTED_RULE_SCHEMA_YAML) + + shape = URIRef("https://example.org/unsupported-test/TestClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0 + + +def test_rule_no_emit_rules_flag(): + """--no-emit-rules suppresses sh:sparql constraint generation.""" + g = _parse_shacl(_RULES_SCHEMA_YAML, emit_rules=False) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0, f"emit_rules=False should suppress rules, got {len(sparql_nodes)}" + + +_NO_RULES_SCHEMA_YAML = """ +id: https://example.org/no-rules +name: no_rules_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/no-rules/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + name: + range: string + slot_uri: ex:name +classes: + SimpleClass: + class_uri: ex:SimpleClass + slots: + - name +""" + + +def test_rule_no_rules_no_sparql(): + """Classes without rules: blocks produce no sh:sparql constraints.""" + g = _parse_shacl(_NO_RULES_SCHEMA_YAML) + + shape = URIRef("https://example.org/no-rules/SimpleClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0 + + +def test_rule_multiple_rules_per_class(): + """Multiple boolean-guard rules on one class produce multiple sh:sparql constraints.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 2 + + # Each constraint should reference different slot pairs + queries = [str(list(g.objects(n, SH.select))[0]) for n in sparql_nodes] + wind_query = [q for q in queries if "weatherWindValue" in q] + rain_query = [q for q in queries if "weatherRainValue" in q] + assert 
len(wind_query) == 1, "Expected exactly one wind query" + assert len(rain_query) == 1, "Expected exactly one rain query" + + +# --------------------------------------------------------------------------- +# Tests for URI resolution without explicit slot_uri +# --------------------------------------------------------------------------- + +_NO_SLOT_URI_SCHEMA_YAML = """ +id: https://example.org/no-slot-uri +name: no_slot_uri_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/no-slot-uri/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + is_active: + range: boolean + measured_value: + range: decimal +classes: + Reading: + class_uri: ex:Reading + slots: + - is_active + - measured_value + rules: + - description: If measured_value is provided, is_active must be true. + preconditions: + slot_conditions: + measured_value: + value_presence: PRESENT + postconditions: + slot_conditions: + is_active: + equals_string: "true" +""" + + +def test_rule_no_explicit_slot_uri(): + """Slots without explicit slot_uri resolve via default_prefix + underscore(name).""" + g = _parse_shacl(_NO_SLOT_URI_SCHEMA_YAML) + + shape = URIRef("https://example.org/no-slot-uri/Reading") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 1 + + query = str(list(g.objects(sparql_nodes[0], SH.select))[0]) + # URIs should be default_prefix:underscore(name) + assert "https://example.org/no-slot-uri/is_active" in query + assert "https://example.org/no-slot-uri/measured_value" in query + + +# --------------------------------------------------------------------------- +# Tests for elseconditions rejection +# --------------------------------------------------------------------------- + +_ELSE_COND_SCHEMA_YAML = """ +id: https://example.org/else-test +name: else_cond_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/else-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string 
+slots: + Flag: + range: boolean + slot_uri: ex:Flag + flagValue: + range: decimal + slot_uri: ex:flagValue + fallbackValue: + range: string + slot_uri: ex:fallbackValue +classes: + TestClass: + class_uri: ex:TestClass + slots: + - Flag + - flagValue + - fallbackValue + rules: + - description: Rule with elseconditions should be skipped. + preconditions: + slot_conditions: + flagValue: + value_presence: PRESENT + postconditions: + slot_conditions: + Flag: + equals_string: "true" + elseconditions: + slot_conditions: + fallbackValue: + value_presence: PRESENT +""" + + +def test_rule_with_elseconditions_emitted(): + """Rules with elseconditions now emit the forward (if/then) branch as sh:sparql.""" + g = _parse_shacl(_ELSE_COND_SCHEMA_YAML) + + shape = URIRef("https://example.org/else-test/TestClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) >= 1, "Rule with elseconditions should emit sh:sparql for the forward branch" + + +# --------------------------------------------------------------------------- +# SPARQL syntax validation +# --------------------------------------------------------------------------- + + +def test_rule_sparql_syntax_valid(): + """Generated SPARQL queries must be syntactically valid.""" + from rdflib.plugins.sparql import prepareQuery + + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) >= 1 + + for node in sparql_nodes: + query_text = str(list(g.objects(node, SH.select))[0]) + # prepareQuery validates SPARQL syntax; $this is a valid variable name + prepareQuery(query_text) + + +# =========================================================================== +# Exclusive-value pattern tests (SHACL §5 SPARQL constraints) +# =========================================================================== +# +# The "exclusive value" pattern translates a LinkML rule where: +# - preconditions: slot X has equals_string (a 
specific enum value name) +# - postconditions: same slot X has maximum_cardinality N +# +# Semantics: "If value V is present in multivalued slot X, then X has at most +# N values total." For N=1 this means V must be the sole value (mutual +# exclusion with other enum members). +# +# Generated SHACL: sh:SPARQLConstraint per W3C SHACL §5.3.1, using $this +# pre-bound to each focus node. +# +# References: +# - W3C SHACL §5 +# - W3C SHACL §5.3.1 +# - ISO 34503:2023, 9.3.6 (motivating use case: EdgeNone exclusivity) +# =========================================================================== + +_EXCLUSIVE_VALUE_SCHEMA_YAML = """ +id: https://example.org/exclusive-value +name: exclusive_value_rules +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/exclusive-value/ +imports: + - linkml:types +default_prefix: ex +default_range: string + +enums: + EdgeTypeEnum: + permissible_values: + EdgeNone: + meaning: ex:EdgeNone + EdgeBarriers: + meaning: ex:EdgeBarriers + EdgeMarkers: + meaning: ex:EdgeMarkers + + PriorityEnum: + permissible_values: + High: + description: High priority (no meaning IRI). + Medium: + description: Medium priority (no meaning IRI). + Low: + description: Low priority (no meaning IRI). + +slots: + edgeType: + range: EdgeTypeEnum + multivalued: true + slot_uri: ex:edgeType + priority: + range: PriorityEnum + multivalued: true + slot_uri: ex:priority + otherSlot: + range: string + slot_uri: ex:otherSlot + +classes: + Road: + class_uri: ex:Road + slots: + - edgeType + - otherSlot + rules: + - description: >- + EdgeNone is mutually exclusive with other edge types. + preconditions: + slot_conditions: + edgeType: + equals_string: "EdgeNone" + postconditions: + slot_conditions: + edgeType: + maximum_cardinality: 1 + + Intersection: + class_uri: ex:Intersection + slots: + - edgeType + rules: + - description: >- + EdgeNone allows at most 2 total edge values. 
+ preconditions: + slot_conditions: + edgeType: + equals_string: "EdgeNone" + postconditions: + slot_conditions: + edgeType: + maximum_cardinality: 2 + + Task: + class_uri: ex:Task + slots: + - priority + rules: + - description: >- + High priority is exclusive (literal fallback test). + preconditions: + slot_conditions: + priority: + equals_string: "High" + postconditions: + slot_conditions: + priority: + maximum_cardinality: 1 + + MismatchedSlots: + class_uri: ex:MismatchedSlots + slots: + - edgeType + - otherSlot + rules: + - description: >- + Different slots in pre/post — not an exclusive-value pattern. + preconditions: + slot_conditions: + edgeType: + equals_string: "EdgeNone" + postconditions: + slot_conditions: + otherSlot: + maximum_cardinality: 1 +""" + +EX_EXCL = rdflib.Namespace("https://example.org/exclusive-value/") + + +def test_exclusive_value_generates_sparql(): + """Exclusive-value rules produce sh:sparql constraints on the NodeShape.""" + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.Road + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 1, f"Expected 1 sh:sparql constraint, got {len(sparql_nodes)}" + + node = sparql_nodes[0] + assert (node, RDF.type, SH.SPARQLConstraint) in g + selects = list(g.objects(node, SH.select)) + assert len(selects) == 1, "Constraint must have exactly one sh:select" + + +def test_exclusive_value_sparql_uses_enum_iri(): + """SPARQL references the enum value's meaning IRI, not a string literal. + + Per the enum definition, EdgeNone has meaning: ex:EdgeNone which expands + to <https://example.org/exclusive-value/EdgeNone>. The generated SPARQL + must use this full IRI in angle brackets.
+ """ + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.Road + sparql_nodes = list(g.objects(shape, SH.sparql)) + query = str(list(g.objects(sparql_nodes[0], SH.select))[0]) + + edge_none_iri = str(EX_EXCL.EdgeNone) + assert f"<{edge_none_iri}>" in query, f"SPARQL must reference EdgeNone as full IRI <{edge_none_iri}>, got:\n{query}" + + +def test_exclusive_value_max_card_1_sparql_structure(): + """For maximum_cardinality: 1, SPARQL uses FILTER(?other != <value>). + + The query pattern for N=1 is: + SELECT $this WHERE { + $this <slot> <value> . + $this <slot> ?other . + FILTER (?other != <value>) + } + + This is more efficient than the COUNT-based approach for the common + singleton exclusion case. + """ + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.Road + sparql_nodes = list(g.objects(shape, SH.sparql)) + query = str(list(g.objects(sparql_nodes[0], SH.select))[0]) + + assert "$this" in query, "SPARQL must use $this pre-bound variable (SHACL §5.3.1)" + assert "FILTER" in query, "N=1 pattern must use FILTER for exclusion check" + assert "?other" in query, "N=1 pattern must bind ?other for comparison" + # Must NOT use COUNT for the N=1 case (simpler pattern) + assert "COUNT" not in query, "N=1 pattern should use FILTER, not COUNT" + # The slot URI must appear (property path) + assert str(EX_EXCL.edgeType) in query, "SPARQL must reference the slot URI" + + +def test_exclusive_value_max_card_gt1_sparql_structure(): + """For maximum_cardinality > 1, SPARQL uses COUNT-based subquery. + + The query pattern for N>1 is: + SELECT $this WHERE { + $this <slot> <value> . + { + SELECT $this (COUNT(?val) AS ?count) + WHERE { $this <slot> ?val .
} + GROUP BY $this + HAVING (?count > N) + } + } + """ + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.Intersection + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 1, f"Expected 1 sh:sparql constraint, got {len(sparql_nodes)}" + + query = str(list(g.objects(sparql_nodes[0], SH.select))[0]) + + assert "$this" in query, "SPARQL must use $this pre-bound variable" + assert "COUNT" in query, "N>1 pattern must use COUNT" + assert "GROUP BY" in query, "N>1 pattern must GROUP BY $this" + assert "HAVING" in query, "N>1 pattern must use HAVING for count check" + assert "> 2" in query, "HAVING must check count > maximum_cardinality (2)" + + +def test_exclusive_value_no_meaning_falls_back_to_literal(): + """When enum values lack a meaning IRI, the value is compared as a literal. + + PriorityEnum values have no meaning field, so 'High' is used as a + quoted string in the SPARQL rather than an IRI in angle brackets. + """ + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.Task + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 1, f"Expected 1 sh:sparql constraint, got {len(sparql_nodes)}" + + query = str(list(g.objects(sparql_nodes[0], SH.select))[0]) + + # Should use quoted literal, not angle-bracket IRI + assert '"High"' in query, f"No-meaning enum should use literal '\"High\"', got:\n{query}" + assert "<https://example.org/exclusive-value/High>" not in query, "Should not emit as IRI when meaning is absent" + + +def test_exclusive_value_different_slots_not_recognised(): + """Rules where pre/post reference different slots are NOT exclusive-value. + + The pattern requires the SAME slot in both preconditions and + postconditions. When they differ, the rule is unrecognised and + silently skipped (no sh:sparql emitted).
+ """ + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.MismatchedSlots + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0, ( + f"Mismatched slots should not trigger exclusive-value pattern, got {len(sparql_nodes)}" + ) + + +def test_exclusive_value_message_from_description(): + """Rule description is emitted as sh:message on the SPARQLConstraint.""" + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + shape = EX_EXCL.Road + sparql_nodes = list(g.objects(shape, SH.sparql)) + messages = [str(m) for node in sparql_nodes for m in g.objects(node, SH.message)] + + assert any("EdgeNone is mutually exclusive" in m for m in messages), ( + f"Expected message about EdgeNone exclusivity, got: {messages}" + ) + + +def test_exclusive_value_sparql_syntax_valid(): + """Generated SPARQL for exclusive-value rules must be syntactically valid. + + Uses rdflib's prepareQuery() which validates SPARQL syntax. + $this is a valid SPARQL variable name per the grammar. + """ + from rdflib.plugins.sparql import prepareQuery + + g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML) + + for shape in (EX_EXCL.Road, EX_EXCL.Intersection, EX_EXCL.Task): + sparql_nodes = list(g.objects(shape, SH.sparql)) + for node in sparql_nodes: + query_text = str(list(g.objects(node, SH.select))[0]) + # prepareQuery validates SPARQL syntax + prepareQuery(query_text) + + +def test_exclusive_value_coexists_with_boolean_guard(): + """Exclusive-value and boolean-guard rules can coexist on the same class. + + When a class has both pattern types, both produce sh:sparql constraints. 
+ """ + schema = """ +id: https://example.org/mixed-rules +name: mixed_rules +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/mixed-rules/ +imports: + - linkml:types +default_prefix: ex +default_range: string + +enums: + StatusEnum: + permissible_values: + None: + meaning: ex:None + Active: + meaning: ex:Active + +slots: + status: + range: StatusEnum + multivalued: true + slot_uri: ex:status + Flag: + range: boolean + slot_uri: ex:Flag + flagValue: + range: decimal + slot_uri: ex:flagValue + +classes: + Widget: + class_uri: ex:Widget + slots: + - status + - Flag + - flagValue + rules: + - description: None is exclusive. + preconditions: + slot_conditions: + status: + equals_string: "None" + postconditions: + slot_conditions: + status: + maximum_cardinality: 1 + - description: If flagValue present, Flag must be true. + preconditions: + slot_conditions: + flagValue: + value_presence: PRESENT + postconditions: + slot_conditions: + Flag: + equals_string: "true" +""" + g = _parse_shacl(schema) + + shape = URIRef("https://example.org/mixed-rules/Widget") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 2, ( + f"Expected 2 sh:sparql constraints (1 exclusive + 1 boolean guard), got {len(sparql_nodes)}" + ) + + queries = [str(list(g.objects(n, SH.select))[0]) for n in sparql_nodes] + # One should have FILTER(?other != ...) 
pattern, the other BOUND pattern + has_exclusive = any("?other" in q for q in queries) + has_boolean = any("BOUND" in q for q in queries) + assert has_exclusive, "Expected one exclusive-value SPARQL constraint" + assert has_boolean, "Expected one boolean-guard SPARQL constraint" From 42c5b5ea0bc5317fe13606d71a5e348e6bfc126b Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Mon, 4 May 2026 15:24:10 +0200 Subject: [PATCH 09/12] feat(generators): add --deterministic flag with diff-stable WL hashing Add a --deterministic / --no-deterministic CLI flag (default off) to OWL, SHACL, JSON-LD Context, and JSON-LD generators that produces diff-stable output using Weisfeiler-Lehman structural hashing on top of the RDFC-1.0 canonicalization from upstream (#3407). Three-phase hybrid pipeline (when --deterministic is set): 1. RDFC-1.0 canonicalization (upstream) produces sequential _:c14nN IDs 2. Weisfeiler-Lehman structural hashing replaces sequential IDs with content-based _:b hashes that remain stable when unrelated triples are added/removed 3. rdflib re-serialization recovers idiomatic Turtle (inline blank nodes, collection syntax, filtered prefixes, preserved xsd:string) Without --deterministic, upstream's always-on RDFC-1.0 canonicalization is used directly (via canonicalize_rdf_graph). Additional features gated behind --deterministic: - Expression sorting (any_of/all_of/none_of/exactly_one_of) in owlgen - Collection sorting (sh:in, sh:ignoredProperties) in shaclgen - Permissible value sorting in owlgen and shaclgen - JSON-LD deterministic key ordering (deterministic_json) - JSON-LD context structured ordering (jsonldcontextgen) Rebased on top of upstream linkml/linkml#3407 (pyoxigraph RDFC-1.0). 
Refs: linkml#1847, linkml#3407 Signed-off-by: Carlo van Driesten --- .../src/linkml/generators/jsonldcontextgen.py | 54 ++ .../linkml/src/linkml/generators/jsonldgen.py | 5 + .../linkml/src/linkml/generators/owlgen.py | 73 ++- .../linkml/src/linkml/generators/rdfgen.py | 2 +- .../linkml/src/linkml/generators/shaclgen.py | 19 +- .../linkml/src/linkml/generators/shexgen.py | 2 +- packages/linkml/src/linkml/utils/generator.py | 289 +++++++++++ .../src/linkml/utils/rdf_canonicalize.py | 223 ++++++++ .../test_deterministic_benchmark.py | 356 +++++++++++++ .../test_deterministic_output.py | 481 ++++++++++++++++++ tests/linkml/test_generators/test_shaclgen.py | 1 + 11 files changed, 1482 insertions(+), 23 deletions(-) create mode 100644 packages/linkml/src/linkml/utils/rdf_canonicalize.py create mode 100644 tests/linkml/test_generators/test_deterministic_benchmark.py create mode 100644 tests/linkml/test_generators/test_deterministic_output.py diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index 38dd938860..bc52c11008 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -309,8 +309,62 @@ def end_schema( with open(frame_path, "w", encoding="UTF-8") as f: json.dump(frame, f, indent=2, ensure_ascii=False) + if self.deterministic: + return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n" return str(as_json(context)) + "\n" + @staticmethod + def _deterministic_context_json(data: dict, indent: int = 3) -> str: + """Serialize a JSON-LD context with deterministic key ordering. + + Preserves the conventional JSON-LD context structure: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with: + a. ``@``-prefixed directives (``@vocab``, ``@base``) first + b. Prefix declarations (string values) second + c. Class/property term entries (object values) last + 3. 
Each group sorted alphabetically within itself + + Unlike :func:`deterministic_json`, this understands JSON-LD + conventions so that the output remains human-readable while + still being byte-identical across invocations. + """ + from linkml.utils.generator import deterministic_json + + ordered = {} + + # 1. "comments" first (if present) + if "comments" in data: + ordered["comments"] = data["comments"] + + # 2. "@context" with structured internal ordering + if "@context" in data: + ctx = data["@context"] + ordered_ctx = {} + + # 2a. @-prefixed directives (@vocab, @base, etc.) + for k in sorted(k for k in ctx if k.startswith("@")): + ordered_ctx[k] = ctx[k] + + # 2b. Prefix declarations (string values — short namespace URIs) + for k in sorted(k for k in ctx if not k.startswith("@") and isinstance(ctx[k], str)): + ordered_ctx[k] = ctx[k] + + # 2c. Term definitions (object values) — deep-sorted for determinism + term_entries = {k: v for k, v in ctx.items() if not k.startswith("@") and not isinstance(v, str)} + sorted_terms = json.loads(deterministic_json(term_entries)) + for k in sorted(sorted_terms): + ordered_ctx[k] = sorted_terms[k] + + ordered["@context"] = ordered_ctx + + # 3. 
Any remaining top-level keys + for k in sorted(data): + if k not in ordered: + ordered[k] = data[k] + + return json.dumps(ordered, indent=indent, ensure_ascii=False) + def visit_class(self, cls: ClassDefinition) -> bool: if self.exclude_imports and cls.name not in self._local_classes: return False diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index ee2fd0cf4e..c94c74d9dd 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -1,5 +1,6 @@ """Generate JSONld from a LinkML schema.""" +import json import os from collections.abc import Sequence from copy import deepcopy @@ -205,6 +206,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: self.schema["@context"].append({"@base": base_prefix}) # json_obj["@id"] = self.schema.id out = str(as_json(self.schema, indent=" ")) + "\n" + if self.deterministic: + from linkml.utils.generator import deterministic_json + + out = deterministic_json(json.loads(out), indent=2) + "\n" self.schema = self.original_schema return out diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index ae1a60db5e..c065b3f3b3 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -23,6 +23,7 @@ from linkml.utils.deprecation import deprecation_warning from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml.utils.language_tags import LanguageTagResolver +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -43,7 +44,7 @@ ) from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.introspection import package_schemaview -from linkml_runtime.utils.rdf_canonicalize 
import canonicalize_rdf_graph +from linkml_runtime.utils.yamlutils import YAMLRoot logger = logging.getLogger(__name__) @@ -56,6 +57,21 @@ SWRLB = rdflib.Namespace("http://www.w3.org/2003/11/swrlb#") +def _expression_sort_key(expr: YAMLRoot) -> str: + """Return a stable sort key for LinkML anonymous expressions. + + Used by ``--deterministic`` to order ``any_of``, ``all_of``, + ``none_of``, and ``exactly_one_of`` members reproducibly. + + This relies on ``YAMLRoot.__repr__()`` which formats objects using + their **field values** (not memory addresses). All anonymous + expression dataclasses in ``linkml_runtime.linkml_model.meta`` + use ``@dataclass(repr=False)`` and inherit this field-based repr, + so the output is deterministic across runs. + """ + return repr(expr) + + @unique class MetadataProfile(Enum): """ @@ -352,6 +368,10 @@ def serialize(self, **kwargs: Any) -> str: """ self.as_graph() fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + return deterministic_turtle(self.graph) return canonicalize_rdf_graph(self.graph, output_format=fmt) def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: @@ -658,13 +678,17 @@ def transform_class_expression( own_slots = self.get_own_slots(cls) owl_exprs: list[OWL_EXPRESSION] = [] if cls.any_of: - any_of_expr = self._union_of([self.transform_class_expression(x) for x in cls.any_of]) + members = list(cls.any_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + any_of_expr = self._union_of([self.transform_class_expression(x) for x in members]) if any_of_expr: owl_exprs.append(any_of_expr) if cls.exactly_one_of: - sub_exprs: list[OWL_EXPRESSION] = self._present( - self.transform_class_expression(x) for x in cls.exactly_one_of - ) + members = list(cls.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + 
sub_exprs: list[OWL_EXPRESSION] = self._present(self.transform_class_expression(x) for x in members) if isinstance(cls, ClassDefinition): cls_uri = self._class_uri(cls.name) listnode = BNode() @@ -672,11 +696,11 @@ def transform_class_expression( graph.add((cls_uri, OWL.disjointUnionOf, listnode)) else: sub_sub_exprs: list[OWL_EXPRESSION] = [] - for i, x in enumerate(cls.exactly_one_of): + for i, x in enumerate(members): operand_expr = self.transform_class_expression(x) if not operand_expr: continue - rest = cls.exactly_one_of[0:i] + cls.exactly_one_of[i + 1 :] + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of([self.transform_class_expression(nx) for nx in rest]) pos_expr = self._intersection_of([operand_expr, neg_expr]) if pos_expr: @@ -686,11 +710,17 @@ def transform_class_expression( owl_exprs.append(union_expr) # owl_exprs.extend(sub_exprs) if cls.all_of: - all_of_expr = self._intersection_of([self.transform_class_expression(x) for x in cls.all_of]) + members = list(cls.all_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + all_of_expr = self._intersection_of([self.transform_class_expression(x) for x in members]) if all_of_expr: owl_exprs.append(all_of_expr) if cls.none_of: - none_of_expr = self._complement_of_union_of([self.transform_class_expression(x) for x in cls.none_of]) + members = list(cls.none_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + none_of_expr = self._complement_of_union_of([self.transform_class_expression(x) for x in members]) if none_of_expr: owl_exprs.append(none_of_expr) for slot in own_slots: @@ -863,19 +893,29 @@ def _get_slot_nodes( ) return rdflib_nodes or None - if any_of_rdflib_nodes := _get_slot_nodes(slot.any_of): + def _maybe_sort_slots( + slot_definitions: Sequence[SlotDefinition | AnonymousSlotExpression] | None, + ) -> Sequence[SlotDefinition | AnonymousSlotExpression] | None: + if slot_definitions and self.deterministic: + 
return sorted(slot_definitions, key=_expression_sort_key) + return slot_definitions + + if any_of_rdflib_nodes := _get_slot_nodes(_maybe_sort_slots(slot.any_of)): owl_exprs.append(self._union_of(any_of_rdflib_nodes)) - if all_of_rdflib_nodes := _get_slot_nodes(slot.all_of): + if all_of_rdflib_nodes := _get_slot_nodes(_maybe_sort_slots(slot.all_of)): owl_exprs.append(self._intersection_of(all_of_rdflib_nodes)) - if none_of_rdflib_nodes := _get_slot_nodes(slot.none_of): + if none_of_rdflib_nodes := _get_slot_nodes(_maybe_sort_slots(slot.none_of)): owl_exprs.append(self._complement_of_union_of(none_of_rdflib_nodes)) if slot.exactly_one_of: + members = list(slot.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) disj_exprs: list[OWL_EXPRESSION] = [] - for i, operand in enumerate(slot.exactly_one_of): + for i, operand in enumerate(members): operand_expr = self.transform_class_slot_expression(cls, operand, main_slot, owl_types) if not operand_expr: continue - rest = slot.exactly_one_of[0:i] + slot.exactly_one_of[i + 1 :] + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of( [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in rest], owl_types=owl_types, @@ -1149,7 +1189,10 @@ def add_enum(self, e: EnumDefinition) -> None: owl_types: list[URIRef | None] = [] enum_owl_type = self._get_metatype(e, self.default_permissible_value_type) - for pv in e.permissible_values.values(): + pvs = e.permissible_values.values() + if self.deterministic: + pvs = sorted(pvs, key=lambda x: x.text) + for pv in pvs: pv_owl_type = self._get_metatype(pv, enum_owl_type) owl_types.append(pv_owl_type) if pv_owl_type == RDFS.Literal: diff --git a/packages/linkml/src/linkml/generators/rdfgen.py b/packages/linkml/src/linkml/generators/rdfgen.py index a3fcf6a848..95d832f2b3 100644 --- a/packages/linkml/src/linkml/generators/rdfgen.py +++ b/packages/linkml/src/linkml/generators/rdfgen.py @@ -19,8 +19,8 @@ from 
linkml._version import __version__ from linkml.generators.jsonldgen import JSONLDGenerator from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.linkml_model import SchemaDefinition -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph @dataclass diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 422051b640..f344d21c97 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -15,9 +15,9 @@ from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments from linkml.utils.language_tags import LanguageTagResolver +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName, PresenceEnum from linkml_runtime.utils.formatutils import underscore -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str logger = logging.getLogger(__name__) @@ -156,6 +156,10 @@ def generate_header(self) -> str: def serialize(self, **args) -> str: g = self.as_graph() fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + return deterministic_turtle(g) return canonicalize_rdf_graph(g, output_format=fmt) def as_graph(self) -> Graph: @@ -624,13 +628,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None: sv = self.schemaview enum = sv.get_enum(r) pv_node = BNode() + pv_items = list(enum.permissible_values.items()) + if self.deterministic: + pv_items = sorted(pv_items, key=lambda x: x[0]) Collection( g, pv_node, - [ - 
URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) - for pv_name, pv in enum.permissible_values.items() - ], + [URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items], ) func(SH["in"], pv_node) @@ -789,7 +793,10 @@ def collect_child_properties(class_name: str, output: set) -> None: list_node = BNode() ignored_properties.add(RDF.type) - Collection(g, list_node, list(ignored_properties)) + props = list(ignored_properties) + if self.deterministic: + props = sorted(props, key=str) + Collection(g, list_node, props) return list_node diff --git a/packages/linkml/src/linkml/generators/shexgen.py b/packages/linkml/src/linkml/generators/shexgen.py index 40a93ffbc9..704dd1ae61 100644 --- a/packages/linkml/src/linkml/generators/shexgen.py +++ b/packages/linkml/src/linkml/generators/shexgen.py @@ -15,6 +15,7 @@ from linkml._version import __version__ from linkml.generators.common.subproperty import get_subproperty_values from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.linkml_model.meta import ( ClassDefinition, ElementName, @@ -26,7 +27,6 @@ from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, sfx from linkml_runtime.utils.metamodelcore import URIorCURIE -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph @dataclass diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 72b977eaa7..0aab3c40dd 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -38,6 +38,10 @@ from linkml.utils.schemaloader import SchemaLoader from linkml.utils.typereferences import References from linkml_runtime import SchemaView + +if TYPE_CHECKING: + from rdflib import Graph as RdfGraph + from linkml_runtime.linkml_model.meta import ( ClassDefinition, 
ClassDefinitionName, @@ -230,6 +234,278 @@ def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) +def _wl_signatures( + quads: list, + iterations: int = 4, +) -> dict[str, str]: + """Compute Weisfeiler-Lehman structural signatures for blank nodes. + + Uses 1-dimensional WL colour refinement [1]_ to assign each blank + node a deterministic signature derived from its multi-hop + neighbourhood structure. The signature depends only on predicate + IRIs, literal values, and named-node IRIs — **not** on blank-node + identifiers — so it remains stable when unrelated triples are added + or removed. + + Parameters + ---------- + quads : list + Canonical quads from pyoxigraph (after RDFC-1.0). + iterations : int + Number of WL refinement rounds (default 4). + + Returns + ------- + dict[str, str] + Mapping from canonical blank-node ID (e.g. ``c14n42``) to a + truncated SHA-256 hash suitable for use as a stable blank-node + label. + + References + ---------- + .. [1] Weisfeiler, B. & Leman, A. (1968). "The reduction of a graph + to canonical form and the algebra which appears therein." + """ + import hashlib + + import pyoxigraph # guaranteed available — caller (deterministic_turtle) checks + + # Collect all blank node IDs and build adjacency index. 
+ bnode_ids: set[str] = set() + # outgoing[b] = list of (predicate_str, object_str_or_bnode_id, is_bnode) + outgoing: dict[str, list[tuple[str, str, bool]]] = {} + # incoming[b] = list of (subject_str_or_bnode_id, predicate_str, is_bnode) + incoming: dict[str, list[tuple[str, str, bool]]] = {} + + for q in quads: + s, p, o = q.subject, q.predicate, q.object + s_is_bn = isinstance(s, pyoxigraph.BlankNode) + o_is_bn = isinstance(o, pyoxigraph.BlankNode) + p_str = str(p) + + if s_is_bn: + bnode_ids.add(s.value) + outgoing.setdefault(s.value, []).append((p_str, o.value if o_is_bn else str(o), o_is_bn)) + if o_is_bn: + bnode_ids.add(o.value) + incoming.setdefault(o.value, []).append((s.value if s_is_bn else str(s), p_str, s_is_bn)) + + # Initialise signatures: named-node edges only (no bnode IDs). + sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if not o_is_bn: + parts.append(f"+{p_str}={o_str}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if not s_is_bn: + parts.append(f"-{s_str}={p_str}") + sig[bid] = "|".join(sorted(parts)) + + # Iterative refinement: incorporate neighbour signatures. + for _ in range(iterations): + new_sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [sig[bid]] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if o_is_bn: + parts.append(f"+{p_str}={sig.get(o_str, '')}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if s_is_bn: + parts.append(f"-{sig.get(s_str, '')}={p_str}") + new_sig[bid] = "|".join(sorted(parts)) + sig = new_sig + + # Convert signatures to truncated SHA-256 hashes. + # Use 12 hex chars (48 bits) — birthday-bound collision probability + # is ~n²/2^49: ~0.002% at 100k nodes. Collisions are handled by + # appending a counter (see below), so correctness is preserved. 
+ hash_map: dict[str, str] = {} + seen_hashes: dict[str, int] = {} + for bid in sorted(bnode_ids): + digest = hashlib.sha256(sig[bid].encode("utf-8")).hexdigest()[:12] + # Handle collisions by appending a counter. + count = seen_hashes.get(digest, 0) + seen_hashes[digest] = count + 1 + label = f"b{digest}" if count == 0 else f"b{digest}_{count}" + hash_map[bid] = label + + return hash_map + + +def deterministic_turtle(graph: "RdfGraph") -> str: + """Serialize an RDF graph to Turtle with deterministic output ordering. + + Uses a three-phase hybrid pipeline for **correctness**, **diff + stability**, and **readability**: + + 1. **RDFC-1.0** [1]_ (via ``pyoxigraph``) canonicalizes the graph, + ensuring isomorphic inputs produce identical triple sets. + 2. **Weisfeiler-Lehman structural hashing** replaces the sequential + ``_:c14nN`` identifiers with content-based hashes derived from + each blank node's multi-hop neighbourhood. These hashes depend + only on predicate IRIs, literal values, and named-node IRIs — + not on blank-node numbering — so adding or removing a triple + only affects the identifiers of directly involved blank nodes. + 3. **Hybrid rdflib re-serialization** parses the canonicalized, + WL-hashed triples back into an rdflib ``Graph`` and serializes + with rdflib's native Turtle writer. This recovers idiomatic + Turtle features that pyoxigraph cannot emit: + + - **Inline blank nodes** (``[ … ]``) for singly-referenced + blank nodes (Turtle §2.7 [2]_), instead of verbose named + ``_:bHASH`` syntax. + - **Collection syntax** (``( … )``) for ``rdf:List`` chains + (Turtle §2.8 [2]_). + - **Prefix filtering**: only prefixes actually used in the + graph's IRIs are declared, following the practice of Apache + Jena, Eclipse RDF4J, and Raptor. + + All triples from the source graph are preserved — the hybrid step + only changes syntactic form, never semantic content. + + Parameters + ---------- + graph : rdflib.Graph + An rdflib Graph to serialize. 
+ + Returns + ------- + str + Deterministic Turtle string with ``@prefix`` declarations. + + References + ---------- + .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)." + W3C Recommendation. https://www.w3.org/TR/rdf-canon/ + .. [2] W3C (2014). "RDF 1.1 Turtle — Terse RDF Triple Language." + W3C Recommendation. https://www.w3.org/TR/turtle/ + """ + try: + import pyoxigraph + except ImportError as exc: + raise ImportError( + "pyoxigraph >= 0.4.0 is required for --deterministic output. " + "Install it with: pip install 'pyoxigraph>=0.4.0'" + ) from exc + + from rdflib import BNode, Graph, Literal, URIRef + + # ── Phase 1: RDFC-1.0 canonicalization ────────────────────────── + nt_data = graph.serialize(format="nt") + + dataset = pyoxigraph.Dataset(pyoxigraph.parse(nt_data, format=pyoxigraph.RdfFormat.N_TRIPLES)) + dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0) + + canonical_quads = list(dataset) + + # ── Phase 2: WL structural hashing for diff-stable blank node IDs + wl_map = _wl_signatures(canonical_quads) + + def _remap(term): + if isinstance(term, pyoxigraph.BlankNode) and term.value in wl_map: + return pyoxigraph.BlankNode(wl_map[term.value]) + return term + + remapped = [pyoxigraph.Triple(_remap(q.subject), q.predicate, _remap(q.object)) for q in canonical_quads] + + # ── Phase 3: Hybrid rdflib re-serialization ───────────────────── + # Convert pyoxigraph terms to rdflib terms and populate a clean + # Graph that only carries explicitly-bound prefixes. + def _to_rdflib(term): + """Convert a pyoxigraph term to the equivalent rdflib term.""" + if isinstance(term, pyoxigraph.NamedNode): + return URIRef(term.value) + if isinstance(term, pyoxigraph.BlankNode): + return BNode(term.value) + if isinstance(term, pyoxigraph.Literal): + if term.language: + return Literal(term.value, lang=term.language) + if term.datatype: + dt_iri = term.datatype.value + # In RDF 1.1, simple literals are syntactic sugar for + # xsd:string (Turtle §2.5.1). 
Preserve the shorter form + # to match the original owlgen output and avoid spurious + # diffs on every string literal. + if dt_iri == "http://www.w3.org/2001/XMLSchema#string": + return Literal(term.value) + return Literal(term.value, datatype=URIRef(dt_iri)) + return Literal(term.value) + raise TypeError(f"Unexpected pyoxigraph term type: {type(term).__name__}: {term}") + + result_graph = Graph(bind_namespaces="none") + for triple in remapped: + result_graph.add( + ( + _to_rdflib(triple.subject), + _to_rdflib(triple.predicate), + _to_rdflib(triple.object), + ) + ) + + # Bind only prefixes whose namespace IRI is actually referenced + # by at least one subject, predicate, or object in the graph. + # This filters out rdflib's ~27 built-in default bindings + # (brick, csvw, doap, …) that leak through Graph() even when + # the schema never declared them. + used_iris: set[str] = set() + for s, p, o in result_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + for pfx, ns in sorted(graph.namespaces()): + pfx_s, ns_s = str(pfx), str(ns) + if pfx_s and any(iri.startswith(ns_s) for iri in used_iris): + result_graph.bind(pfx_s, ns_s) + + return result_graph.serialize(format="turtle") + + +def deterministic_json(obj: object, indent: int = 3, preserve_list_order_keys: frozenset[str] | None = None) -> str: + """Serialize a JSON-compatible object with deterministic ordering. + + Recursively sorts all dict keys *and* list elements to produce + stable output across Python versions and process invocations. + + List elements are sorted by their canonical JSON representation + (``json.dumps(item, sort_keys=True)``), which handles lists of + dicts, strings, and mixed types. + + :param obj: A JSON-serializable object (typically parsed from ``as_json``). + :param indent: Number of spaces for indentation. + :param preserve_list_order_keys: Dict keys whose list values must NOT be + sorted (e.g. 
``@context``, ``@list`` in JSON-LD where array order is + semantic). Defaults to ``_JSONLD_ORDERED_KEYS``. + :returns: Deterministic JSON string. + """ + import json + + skip = preserve_list_order_keys if preserve_list_order_keys is not None else _JSONLD_ORDERED_KEYS + + def _deep_sort(value: object, parent_key: str = "") -> object: + if isinstance(value, dict): + return {k: _deep_sort(v, parent_key=k) for k, v in sorted(value.items())} + if isinstance(value, list): + sorted_items = [_deep_sort(item) for item in value] + if parent_key in skip: + return sorted_items + try: + return sorted(sorted_items, key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False)) + except TypeError: + return sorted_items + return value + + return json.dumps(_deep_sort(obj), indent=indent, ensure_ascii=False) + + +# JSON-LD keys whose array values carry ordering semantics and must not +# be sorted. @context arrays define an override cascade (JSON-LD 1.1 +# §4.1); @list containers are explicitly ordered; @graph and @set are +# included defensively. +_JSONLD_ORDERED_KEYS: frozenset[str] = frozenset({"@context", "@list", "@graph", "@set", "imports"}) + + @dataclass class Generator(metaclass=abc.ABCMeta): """ @@ -291,6 +567,9 @@ class Generator(metaclass=abc.ABCMeta): mergeimports: bool | None = True """True means merge non-linkml sources into importing package. False means separate packages""" + deterministic: bool = False + """True means produce stable, reproducible output with sorted keys and canonical blank-node ordering""" + source_file_date: str | None = None """Modification date of input source file""" @@ -1144,6 +1423,16 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--deterministic/--no-deterministic",), + default=False, + show_default=True, + help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. " + "Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. 
" + "Useful when generated artifacts are stored in version control.", + ) + ) f.params.append( Option( ("--normalize-prefixes/--no-normalize-prefixes",), diff --git a/packages/linkml/src/linkml/utils/rdf_canonicalize.py b/packages/linkml/src/linkml/utils/rdf_canonicalize.py new file mode 100644 index 0000000000..da57f23399 --- /dev/null +++ b/packages/linkml/src/linkml/utils/rdf_canonicalize.py @@ -0,0 +1,223 @@ +"""Deterministic RDF serialization via pyoxigraph RDFC-1.0 canonicalization. + +This module provides a function to canonicalize an rdflib Graph using +pyoxigraph's RDFC-1.0 implementation, producing deterministic output +with stable blank node labels and sorted triples. + +**Known limitations:** + +1. **xsd:string normalization**: pyoxigraph follows RDF 1.1, where plain + string literals and ``"text"^^xsd:string`` are identical. The output + will never contain explicit ``^^xsd:string`` annotations. Code that + re-parses the output with rdflib will see ``Literal("x")`` (datatype + ``None``) rather than ``Literal("x", datatype=XSD.string)``. + +2. **Non-standard RDF**: Graphs with literal predicates (e.g. SHACL + annotation mode) are rejected by pyoxigraph. This function falls + back to rdflib's serializer for such graphs. + +3. **Numeric short forms**: pyoxigraph uses Turtle short forms for + ``xsd:integer`` (``42``), ``xsd:boolean`` (``true``), and + ``xsd:decimal`` (``1.23``). rdflib parses these back with the + correct datatype, so this is lossless. + +4. **Base IRI / prefix collision**: When a graph has ``@base`` and a + prefix whose namespace equals the base IRI (e.g. rdflib's auto-bound + ``base:`` prefix), pyoxigraph emits CURIEs like ``base:label`` that + rdflib rejects. We skip such prefixes during serialization. + +5. **Trailing escaped dot in PN_LOCAL**: pyoxigraph emits CURIEs like + ``prefix:local\\.`` for IRIs whose local part ends with ``.``. 
This + is valid Turtle (PN_LOCAL_ESC), but rdflib's notation3 parser rejects + it because it conflicts with the statement-terminator dot. We + post-process the output to expand such CURIEs to full ``<IRI>`` form. +""" + +import io +import logging +import re + +import pyoxigraph as ox +import rdflib + +logger = logging.getLogger(__name__) + +# Mapping from rdflib/LinkML format strings to pyoxigraph RdfFormat objects. +_FORMAT_MAP: dict[str, ox.RdfFormat] = { + "turtle": ox.RdfFormat.TURTLE, + "ttl": ox.RdfFormat.TURTLE, + "nt": ox.RdfFormat.N_TRIPLES, + "ntriples": ox.RdfFormat.N_TRIPLES, + "n-triples": ox.RdfFormat.N_TRIPLES, + "nt11": ox.RdfFormat.N_TRIPLES, + "nquads": ox.RdfFormat.N_QUADS, + "n-quads": ox.RdfFormat.N_QUADS, + "xml": ox.RdfFormat.RDF_XML, + "rdf/xml": ox.RdfFormat.RDF_XML, + "trig": ox.RdfFormat.TRIG, + "n3": ox.RdfFormat.N3, +} + +# Formats that support prefix declarations. +_PREFIX_FORMATS = frozenset({ox.RdfFormat.TURTLE, ox.RdfFormat.TRIG, ox.RdfFormat.N3, ox.RdfFormat.RDF_XML}) + + +# Characters that may appear escaped in a Turtle PN_LOCAL via PN_LOCAL_ESC. +_PN_LOCAL_ESC_UNESCAPE = re.compile(r"\\([_~.\-!$&'()*+,;=/?#@%])") + + +def _expand_trailing_dot_curies(turtle_text: str, prefixes: dict[str, str]) -> str: + """Replace CURIEs whose local part ends in ``\\.`` with full ``<IRI>`` form. + + rdflib's notation3 parser rejects PN_LOCAL ending in an escaped dot + even though Turtle permits it (PN_LOCAL_ESC). pyoxigraph emits this + form for IRIs ending in ``.`` (e.g. ``biolink:StrandEnum#.``). We + rewrite each such CURIE to its expanded ``<IRI>`` form so the output + round-trips through rdflib. + """ + if not prefixes: + return turtle_text + + # Match: a prefix name, ':', a local part (no whitespace or token + # delimiters), ending in ``\.``, followed by whitespace. Use a + # negative lookbehind to avoid matching inside ``<...>`` or word + # characters that would make this a substring of something else. 
+ pattern = re.compile( + r"(?<![\w<])([A-Za-z_][\w.\-]*):([^\s<>\"'\[\]]*?\\\.)" + r"(?=\s)" + ) + + def replace(match: re.Match[str]) -> str: + prefix = match.group(1) + local_escaped = match.group(2) + namespace = prefixes.get(prefix) + if namespace is None: + return match.group(0) + local = _PN_LOCAL_ESC_UNESCAPE.sub(r"\1", local_escaped) + return f"<{namespace}{local}>" + + return pattern.sub(replace, turtle_text) + + +def _is_safe_prefix_iri(iri: str) -> bool: + """Check whether a namespace IRI is safe for prefix serialization. + + pyoxigraph rejects IRIs with invalid code-points (e.g. double ``#``), + and rdflib's Turtle parser cannot round-trip CURIEs whose namespace + contains query parameters or fragments in unexpected positions. This + function returns ``False`` for such IRIs so they can be skipped during + prefix collection. + """ + # A namespace IRI should end with '/' or '#'. If '#' appears + # *before* the final character, the IRI contains an embedded + # fragment which produces unusable CURIEs. + if "#" in iri[:-1]: + return False + # Query parameters in namespace IRIs produce CURIEs that rdflib + # cannot parse back. + if "?" in iri: + return False + return True + + +def canonicalize_rdf_graph( + graph: rdflib.Graph, + output_format: str = "turtle", +) -> str: + """Serialize an rdflib Graph deterministically using RDFC-1.0 canonicalization. + + The graph is transferred to pyoxigraph via N-Triples, canonicalized + with RDFC-1.0, sorted, and serialized back to the requested format. + Prefix bindings from the rdflib Graph are preserved in the output + for formats that support them (Turtle, TriG, N3, RDF/XML). + + Falls back to plain rdflib serialization for unsupported formats or + graphs containing non-standard RDF (e.g. literal predicates). + + :param graph: The rdflib Graph to serialize. + :param output_format: Target serialization format (e.g. ``"turtle"``, ``"nt"``). + :return: Deterministic string serialization of the graph. 
+ """ + ox_format = _FORMAT_MAP.get(output_format.lower()) + if ox_format is None: + logger.warning( + "pyoxigraph does not support format %r; falling back to rdflib serializer", + output_format, + ) + return graph.serialize(format=output_format) + + # 1. Transfer rdflib graph to pyoxigraph via N-Triples. + nt_data = graph.serialize(format="nt") + nt_bytes = nt_data.encode("utf-8") if isinstance(nt_data, str) else nt_data + + # 2. Parse into pyoxigraph and build a Dataset for canonicalization. + # Fall back to rdflib if the graph contains non-standard RDF + # (e.g. literal predicates from annotations) that pyoxigraph rejects. + try: + triples = list(ox.parse(io.BytesIO(nt_bytes), format=ox.RdfFormat.N_TRIPLES)) + except SyntaxError: + logger.warning( + "Graph contains non-standard RDF that pyoxigraph cannot parse; falling back to rdflib serializer" + ) + return graph.serialize(format=output_format) + + dataset = ox.Dataset() + for triple in triples: + dataset.add(ox.Quad(triple.subject, triple.predicate, triple.object, ox.DefaultGraph())) + + # 3. Canonicalize blank node labels with RDFC-1.0. + dataset.canonicalize(ox.CanonicalizationAlgorithm.RDFC_1_0) + + # 4. Sort triples for deterministic ordering. + quads = list(dataset) + sorted_triples = sorted( + (ox.Triple(q.subject, q.predicate, q.object) for q in quads), + key=lambda t: (str(t.subject), str(t.predicate), str(t.object)), + ) + + # 5. Collect prefixes for formats that support them. + base_iri = str(graph.base) if graph.base else None + prefixes: dict[str, str] | None = None + if ox_format in _PREFIX_FORMATS: + prefixes = {} + for prefix, namespace in graph.namespace_manager.namespaces(): + if not prefix: # skip empty prefix (base) + continue + ns_str = str(namespace) + # Skip prefixes whose namespace matches the base IRI to avoid + # pyoxigraph emitting CURIEs like `base:label` that conflict + # with the @base directive. 
+ if base_iri and ns_str == base_iri: + continue + # Skip namespace IRIs that pyoxigraph rejects or that produce + # CURIEs rdflib cannot round-trip. Valid namespace IRIs for + # prefix use should end with '/' or '#' and contain no query + # parameters or fragment-like characters in the middle. + if not _is_safe_prefix_iri(ns_str): + continue + prefixes[str(prefix)] = ns_str + used_prefixes = prefixes + try: + result_bytes = ox.serialize( + sorted_triples, + format=ox_format, + prefixes=prefixes, + base_iri=base_iri, + ) + except ValueError: + # pyoxigraph rejects prefixes with invalid IRIs (e.g. containing + # fragment-like characters such as double '#'). Retry without + # the offending prefixes by falling back to no prefixes, which + # still produces valid (if verbose) Turtle. + logger.warning("pyoxigraph rejected one or more prefix IRIs; serializing without prefix declarations") + result_bytes = ox.serialize( + sorted_triples, + format=ox_format, + ) + used_prefixes = None + result = result_bytes.decode("utf-8") + if ox_format in _PREFIX_FORMATS and used_prefixes: + result = _expand_trailing_dot_curies(result, used_prefixes) + return result diff --git a/tests/linkml/test_generators/test_deterministic_benchmark.py b/tests/linkml/test_generators/test_deterministic_benchmark.py new file mode 100644 index 0000000000..b7488a8dda --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_benchmark.py @@ -0,0 +1,356 @@ +"""Benchmark: deterministic Turtle serializer on real-world ontologies. + +Evaluates the ``--deterministic`` flag against schema.org (~16 000 triples, +~800 classes, ~1 400 properties) and the kitchen_sink LinkML schema to +demonstrate four properties: + +1. **Semantic equivalence** — ``rdflib.compare.isomorphic()`` confirms that + deterministic and non-deterministic outputs encode the same RDF graph. +2. **Byte-level stability** — SHA-256 identity across repeated runs proves + that deterministic output is truly reproducible. +3. 
**Diff quality** — controlled mutations show that small schema changes + produce small, focused diffs (high signal-to-noise ratio). +4. **Performance** — generation time stays within acceptable bounds even + on large real-world graphs. + +Schema.org tests exercise ``deterministic_turtle()`` directly on a +pre-existing OWL ontology. Kitchen_sink tests exercise the full +``OwlSchemaGenerator`` / ``ShaclGenerator`` pipeline with LinkML schemas. + +References +---------- +- W3C RDFC-1.0: https://www.w3.org/TR/rdf-canon/ +- W3C Turtle 1.1: https://www.w3.org/TR/turtle/ +- schema.org: https://schema.org/docs/developers.html +""" + +import difflib +import hashlib +import time +from pathlib import Path + +import pytest +import yaml +from rdflib import Graph +from rdflib.compare import isomorphic + +from linkml.generators.owlgen import OwlSchemaGenerator +from linkml.generators.shaclgen import ShaclGenerator +from linkml.utils.generator import deterministic_turtle + +_has_pyoxigraph = False +try: + import pyoxigraph + + _has_pyoxigraph = hasattr(pyoxigraph, "Dataset") +except ImportError: + pass + +pytestmark = pytest.mark.skipif( + not _has_pyoxigraph, + reason="pyoxigraph >= 0.4.0 required for deterministic benchmarks", +) + +KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml") +SCHEMA_ORG_URL = "https://schema.org/version/latest/schemaorg-current-https.ttl" + + +def _sha256(text: str) -> str: + return hashlib.sha256(text.encode()).hexdigest() + + +def _diff_line_count(a: str, b: str) -> int: + """Count lines present in *b* but not in *a* (unified-diff additions).""" + al = a.strip().splitlines() + bl = b.strip().splitlines() + return sum( + 1 for line in difflib.unified_diff(al, bl, lineterm="") if line.startswith("+") and not line.startswith("+++") + ) + + +# ── Schema.org: direct serializer benchmark ──────────────────────── + + +@pytest.fixture(scope="module") +def schema_org_graph(): + """Download and parse schema.org as an rdflib Graph. 
+ + Cached for the module so the network fetch only happens once. + Skips all dependent tests if the download fails. + """ + try: + import urllib.request + + with urllib.request.urlopen(SCHEMA_ORG_URL, timeout=60) as resp: + data = resp.read().decode("utf-8") + except Exception as exc: + pytest.skip(f"Could not fetch schema.org: {exc}") + + g = Graph() + g.parse(data=data, format="turtle") + return g + + +@pytest.mark.network +class TestSchemaOrgDeterministicSerializer: + """Benchmark ``deterministic_turtle()`` on schema.org OWL ontology.""" + + def test_semantic_equivalence(self, schema_org_graph): + """Deterministic serialization must be isomorphic to the original graph.""" + det_ttl = deterministic_turtle(schema_org_graph) + + g_det = Graph() + g_det.parse(data=det_ttl, format="turtle") + + assert len(g_det) == len(schema_org_graph), ( + f"Triple count mismatch: original={len(schema_org_graph)}, deterministic={len(g_det)}" + ) + assert isomorphic(g_det, schema_org_graph), ( + "Deterministic output is NOT isomorphic to original schema.org graph" + ) + + def test_byte_stability(self, schema_org_graph): + """Two deterministic runs must produce byte-identical output.""" + run1 = deterministic_turtle(schema_org_graph) + run2 = deterministic_turtle(schema_org_graph) + assert _sha256(run1) == _sha256(run2), "Deterministic serializer produced different output across runs" + + def test_prefix_filtering(self, schema_org_graph): + """Only prefixes actually used in the graph should be declared.""" + det_ttl = deterministic_turtle(schema_org_graph) + + # Extract declared prefixes + declared = {} + for line in det_ttl.splitlines(): + if line.startswith("@prefix"): + parts = line.split() + pfx = parts[1].rstrip(":") + ns = parts[2].strip("<>") + declared[pfx] = ns + + # Collect all IRIs in the graph + from rdflib import URIRef + + used_iris = set() + for s, p, o in schema_org_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + # Every 
declared prefix must have at least one IRI using it + for pfx, ns in declared.items(): + assert any(iri.startswith(ns) for iri in used_iris), f"Prefix '{pfx}:' <{ns}> declared but no IRI uses it" + + def test_performance(self, schema_org_graph): + """Serialization must complete within 60 seconds for ~16K triples.""" + start = time.time() + det_ttl = deterministic_turtle(schema_org_graph) + elapsed = time.time() - start + triple_count = len(schema_org_graph) + throughput = triple_count / elapsed if elapsed > 0 else float("inf") + + # Log for benchmark visibility (shows with pytest -v) + print(f"\n schema.org: {triple_count} triples in {elapsed:.1f}s ({throughput:.0f} triples/s)") + + assert elapsed < 60.0, f"Serialization took {elapsed:.1f}s (limit: 60s) for {triple_count} triples" + assert len(det_ttl) > 1000, "Output suspiciously short" + + +# ── Kitchen_sink: full pipeline benchmark ─────────────────────────── + + +def _mutate_kitchen_sink(description_suffix: str = "", add_slot: bool = False) -> str: + """Create a mutated copy of kitchen_sink.yaml **in the same directory** and return its path. + + The copy must live alongside the original so that LinkML relative imports + (``linkml:types``, ``core``, etc.) resolve correctly. + + Uses a unique filename (via ``os.getpid()``) to avoid race conditions + when tests run in parallel under pytest-xdist. + + Parameters + ---------- + description_suffix + Text appended to the first class description found. + add_slot + If True, adds a synthetic ``benchmark_notes`` slot to the first class. 
+ """ + import os + + ks_path = Path(KITCHEN_SINK) + schema = yaml.safe_load(ks_path.read_text()) + + if description_suffix or add_slot: + # Find the first class with a description + for cls_name, cls_def in schema.get("classes", {}).items(): + if isinstance(cls_def, dict) and cls_def.get("description"): + if description_suffix: + cls_def["description"] += description_suffix + if add_slot: + slots = cls_def.get("slots", []) + slots.append("benchmark_notes") + cls_def["slots"] = slots + break + + # Define the synthetic slot if adding one + if add_slot: + slots_dict = schema.setdefault("slots", {}) + slots_dict["benchmark_notes"] = { + "description": "Synthetic benchmark slot for diff quality testing.", + "range": "string", + } + + # Write in the same directory so relative imports resolve. + # Use PID to avoid race conditions with pytest-xdist workers. + out_path = ks_path.parent / f"_benchmark_mutated_{os.getpid()}_kitchen_sink.yaml" + out_path.write_text( + yaml.dump(schema, default_flow_style=False, allow_unicode=True), + encoding="utf-8", + ) + return str(out_path) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +class TestKitchenSinkDiffQuality: + """Measure diff quality on the kitchen_sink schema with controlled mutations.""" + + def test_mutation_description_change(self, generator_cls): + """A single description change must produce a small, focused diff. + + Deterministic mode should change only the affected line(s) and their + immediate context (e.g. SHACL may repeat descriptions in sh:description). + Non-deterministic mode produces a much larger diff due to blank-node + and property-ordering instability. 
+ """ + base = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)") + try: + mutated = generator_cls(mutated_path, deterministic=True).serialize() + finally: + Path(mutated_path).unlink(missing_ok=True) + + det_diff = _diff_line_count(base, mutated) + + # Non-deterministic baseline for comparison + non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + non_mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)") + try: + non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize() + finally: + Path(non_mutated_path).unlink(missing_ok=True) + + non_diff = _diff_line_count(non_base, non_mutated) + + # The deterministic diff must be small (description + any SHACL mirrors) + assert det_diff <= 20, ( + f"Deterministic diff too large for a 1-description change: {det_diff} lines (expected ≤20)" + ) + # Signal-to-noise: deterministic must be at least 5× smaller + if non_diff > 0: + ratio = non_diff / max(det_diff, 1) + assert ratio >= 5, ( + f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)" + ) + + print( + f"\n {generator_cls.__name__} description mutation: " + f"det={det_diff} lines, non-det={non_diff} lines, " + f"noise reduction={non_diff / max(det_diff, 1):.0f}×" + ) + + def test_mutation_add_slot(self, generator_cls): + """Adding a new slot must produce a proportionally small diff. + + A new slot adds ~10-20 triples (label, range, domain, restrictions). + The diff should be roughly proportional to the new content, not a + full-file rewrite. 
+ """ + base = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + mutated_path = _mutate_kitchen_sink(add_slot=True) + try: + mutated = generator_cls(mutated_path, deterministic=True).serialize() + finally: + Path(mutated_path).unlink(missing_ok=True) + + det_diff = _diff_line_count(base, mutated) + + # Non-deterministic baseline for comparison + non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + non_mutated_path = _mutate_kitchen_sink(add_slot=True) + try: + non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize() + finally: + Path(non_mutated_path).unlink(missing_ok=True) + + non_diff = _diff_line_count(non_base, non_mutated) + + g_base = Graph() + g_base.parse(data=base, format="turtle") + g_mut = Graph() + g_mut.parse(data=mutated, format="turtle") + new_triples = len(g_mut) - len(g_base) + + # Diff should be proportional to new triples (allow 5× margin) + assert det_diff <= max(new_triples * 5, 40), ( + f"Deterministic diff ({det_diff} lines) disproportionate to new triples ({new_triples})" + ) + # Signal-to-noise: deterministic must be at least 5× smaller + if non_diff > 0: + ratio = non_diff / max(det_diff, 1) + assert ratio >= 5, ( + f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)" + ) + + print( + f"\n {generator_cls.__name__} add-slot mutation: " + f"det_diff={det_diff} lines, non-det={non_diff} lines, " + f"new_triples={new_triples}, noise reduction={non_diff / max(det_diff, 1):.0f}×" + ) + + print(f"\n {generator_cls.__name__} add-slot mutation: det_diff={det_diff} lines, new_triples={new_triples}") + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +class TestKitchenSinkEquivalence: + """Verify semantic equivalence between deterministic and non-deterministic modes.""" + + def test_triple_count_matches(self, generator_cls): + """Both modes must produce the same number of 
triples.""" + det = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=det, format="turtle") + g_nondet = Graph() + g_nondet.parse(data=nondet, format="turtle") + + assert len(g_det) == len(g_nondet), ( + f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}" + ) + + def test_byte_stability_across_runs(self, generator_cls): + """Three deterministic runs must produce identical output.""" + runs = [generator_cls(KITCHEN_SINK, deterministic=True).serialize() for _ in range(3)] + hashes = [_sha256(r) for r in runs] + assert hashes[0] == hashes[1] == hashes[2], f"Deterministic output varies across runs: {hashes}" + + def test_non_deterministic_instability(self, generator_cls): + """Non-deterministic output should vary across runs (documents the problem). + + This test is advisory — it passes regardless but logs the instability. + """ + runs = [generator_cls(KITCHEN_SINK, deterministic=False).serialize() for _ in range(3)] + hashes = [_sha256(r) for r in runs] + identical = hashes[0] == hashes[1] == hashes[2] + print( + f"\n {generator_cls.__name__} non-det stable: {identical} " + f"(expected: False for Turtle due to bnode/ordering instability)" + ) diff --git a/tests/linkml/test_generators/test_deterministic_output.py b/tests/linkml/test_generators/test_deterministic_output.py new file mode 100644 index 0000000000..6721c2ac93 --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_output.py @@ -0,0 +1,481 @@ +"""Tests for deterministic generator output. + +When ``deterministic=True``, generators must produce byte-identical output +across multiple invocations. This ensures version-controlled artifacts don't +show spurious diffs from blank-node relabeling or dict-ordering instability. 
+ +Generators must also produce **isomorphic** output — the deterministic +serialization must encode the same RDF graph as non-deterministic mode. +""" + +import json +import time +from pathlib import Path + +import pytest +from rdflib import Graph +from rdflib.compare import isomorphic + +from linkml.generators.jsonldcontextgen import ContextGenerator +from linkml.generators.jsonldgen import JSONLDGenerator +from linkml.generators.owlgen import OwlSchemaGenerator +from linkml.generators.shaclgen import ShaclGenerator + +# Deterministic Turtle requires pyoxigraph >= 0.4.0 (for Dataset/canonicalize). +# When an older version is present (e.g. pulled in by morph-kgc), skip these tests. +_has_pyoxigraph = False +try: + import pyoxigraph + + _has_pyoxigraph = hasattr(pyoxigraph, "Dataset") +except ImportError: + pass + +pytestmark = pytest.mark.skipif(not _has_pyoxigraph, reason="pyoxigraph >= 0.4.0 required for deterministic tests") + +SCHEMA = str(Path(__file__).parent / "input" / "personinfo.yaml") + + +@pytest.mark.parametrize( + "generator_cls,kwargs", + [ + (OwlSchemaGenerator, {}), + (ShaclGenerator, {}), + (ContextGenerator, {}), + (JSONLDGenerator, {}), + ], + ids=["owl", "shacl", "context", "jsonld"], +) +def test_deterministic_output_is_identical_across_runs(generator_cls, kwargs): + """Generate output twice with deterministic=True and verify identity.""" + out1 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize() + out2 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize() + # JSONLDGenerator embeds a generation_date timestamp — normalize it + if generator_cls is JSONLDGenerator: + import re + + ts_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") + out1 = ts_re.sub("TIMESTAMP", out1) + out2 = ts_re.sub("TIMESTAMP", out2) + assert out1 == out2, f"{generator_cls.__name__} produced different output across runs" + assert len(out1) > 100, "Output suspiciously short — generator may have failed silently" + + 
+@pytest.mark.parametrize( + "generator_cls", + [ContextGenerator, JSONLDGenerator], + ids=["context", "jsonld"], +) +def test_deterministic_json_has_sorted_keys(generator_cls): + """When deterministic=True, JSON dict keys should be sorted at all levels. + + For the ContextGenerator, @context keys use grouped ordering (prefixes + before term entries) — each group is sorted, but not globally. + """ + out = generator_cls(SCHEMA, deterministic=True).serialize() + parsed = json.loads(out) + + is_context_gen = generator_cls is ContextGenerator + + def _check_sorted_keys(obj, path="root"): + if isinstance(obj, dict): + keys = list(obj.keys()) + # Context generator groups @context keys: @-directives, prefixes, terms + if is_context_gen and path == "root.@context": + at_keys = [k for k in keys if k.startswith("@")] + prefix_keys = [k for k in keys if not k.startswith("@") and isinstance(obj[k], str)] + term_keys = [k for k in keys if not k.startswith("@") and not isinstance(obj[k], str)] + assert at_keys == sorted(at_keys), f"@-keys not sorted: {at_keys}" + assert prefix_keys == sorted(prefix_keys), f"Prefix keys not sorted: {prefix_keys}" + assert term_keys == sorted(term_keys), f"Term keys not sorted: {term_keys}" + else: + assert keys == sorted(keys), f"Keys not sorted at {path}: {keys}" + for k, v in obj.items(): + _check_sorted_keys(v, f"{path}.{k}") + elif isinstance(obj, list): + for i, item in enumerate(obj): + _check_sorted_keys(item, f"{path}[{i}]") + + _check_sorted_keys(parsed) + + +@pytest.mark.parametrize( + "generator_cls", + [ContextGenerator, JSONLDGenerator], + ids=["context", "jsonld"], +) +def test_deterministic_json_lists_are_sorted(generator_cls): + """When deterministic=True, JSON list elements should be sorted. + + Lists under JSON-LD structural keys (``@context``, ``@list``, ``imports``, + etc.) are exempt because their ordering carries semantic meaning. 
+ """ + out = generator_cls(SCHEMA, deterministic=True).serialize() + parsed = json.loads(out) + + # JSON-LD keys whose array values carry ordering semantics. + _ORDERED_KEYS = {"@context", "@list", "@graph", "@set", "imports"} + + def _check_sorted_lists(obj, path="root", parent_key=""): + if isinstance(obj, dict): + for k, v in obj.items(): + _check_sorted_lists(v, f"{path}.{k}", parent_key=k) + elif isinstance(obj, list): + if parent_key not in _ORDERED_KEYS: + str_items = [json.dumps(item, sort_keys=True, ensure_ascii=False) for item in obj] + assert str_items == sorted(str_items), f"List not sorted at {path}" + for i, item in enumerate(obj): + _check_sorted_lists(item, f"{path}[{i}]") + + _check_sorted_lists(parsed) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_turtle_preserves_at_prefix(generator_cls): + """deterministic_turtle must produce standard @prefix, not SPARQL PREFIX.""" + out = generator_cls(SCHEMA, deterministic=True).serialize() + assert "@prefix" in out, "Output uses non-standard prefix syntax" + assert "PREFIX " not in out, "Output uses SPARQL PREFIX instead of Turtle @prefix" + + +def test_deterministic_turtle_performance(): + """Deterministic OWL generation must complete within 10 seconds for personinfo. + + The Weisfeiler-Lehman approach is O(n log n), so this should easily pass. + The previous canon=True approach was exponential and failed this test + for graphs above ~250 triples. + """ + start = time.time() + out = OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() + elapsed = time.time() - start + assert elapsed < 10.0, f"Deterministic generation took {elapsed:.1f}s (limit: 10s)" + assert len(out) > 100, "Output suspiciously short" + + +def test_shacl_closed_ignored_properties_deterministic(): + """sh:ignoredProperties in closed shapes must be deterministic. 
+ + ``_build_ignored_properties`` collects inherited slots into a set; without + explicit sorting this produces different ``rdf:first``/``rdf:rest`` chains + on each run. With ``deterministic=True`` (and sorted Collection inputs) + the output must be byte-identical. + """ + runs = [ShaclGenerator(SCHEMA, deterministic=True, closed=True).serialize() for _ in range(3)] + assert runs[0] == runs[1] == runs[2], "sh:ignoredProperties ordering differs across runs" + assert "sh:ignoredProperties" in runs[0], "Expected closed shapes with sh:ignoredProperties" + + +def test_shacl_enum_in_deterministic(): + """sh:in RDF lists for enums must be deterministic. + + ``_build_enum_constraint`` iterates ``enum.permissible_values.items()`` + (dict iteration order) into a ``Collection``. Without sorting, the + ``rdf:first``/``rdf:rest`` chain varies across runs. + """ + runs = [ShaclGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)] + assert runs[0] == runs[1] == runs[2], "sh:in enum list ordering differs across runs" + assert "sh:in" in runs[0], "Expected sh:in constraints for enums" + + +def test_owl_enum_one_of_deterministic(): + """owl:oneOf RDF lists for enums must be deterministic. + + ``_boolean_expression`` feeds ``pv_uris`` (from ``permissible_values``) + into a ``Collection``. Without sorting, ``owl:oneOf`` list ordering varies. + """ + runs = [OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)] + assert runs[0] == runs[1] == runs[2], "owl:oneOf enum list ordering differs across runs" + + +KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml") + + +def test_deterministic_large_schema(): + """End-to-end idempotency on a complex schema (kitchen_sink). + + Exercises many code paths simultaneously: closed shapes, enums, imports, + class hierarchies, and mixed ranges. 
+ """ + owl1 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize() + owl2 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize() + assert owl1 == owl2, "OWL output differs across runs for kitchen_sink" + assert len(owl1) > 500, "kitchen_sink output suspiciously short" + + shacl1 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize() + shacl2 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize() + assert shacl1 == shacl2, "SHACL output differs across runs for kitchen_sink" + assert len(shacl1) > 500, "kitchen_sink output suspiciously short" + + +def test_deterministic_context_preserves_jsonld_structure(): + """Deterministic JSON-LD context must preserve conventional structure. + + JSON-LD contexts have a conventional layout: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with prefixes grouped before term entries + + ``deterministic_json()`` would scramble this by sorting all keys + uniformly. The context generator must use JSON-LD-aware ordering. 
+ """ + out = ContextGenerator(SCHEMA, deterministic=True, metadata=True).serialize() + parsed = json.loads(out) + + # Top-level key order: "comments" before "@context" + top_keys = list(parsed.keys()) + assert "comments" in top_keys, "Expected 'comments' block with metadata=True" + assert top_keys.index("comments") < top_keys.index("@context"), ( + f"'comments' should precede '@context', got: {top_keys}" + ) + + # Inside @context: @-directives, then prefixes (str values), then terms (dict values) + ctx = parsed["@context"] + ctx_keys = list(ctx.keys()) + + at_keys = [k for k in ctx_keys if k.startswith("@")] + prefix_keys = [k for k in ctx_keys if not k.startswith("@") and isinstance(ctx[k], str)] + term_keys = [k for k in ctx_keys if not k.startswith("@") and not isinstance(ctx[k], str)] + + # Verify grouping: all @-keys before all prefix keys before all term keys + last_at = max(ctx_keys.index(k) for k in at_keys) if at_keys else -1 + first_prefix = min(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else len(ctx_keys) + last_prefix = max(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else -1 + first_term = min(ctx_keys.index(k) for k in term_keys) if term_keys else len(ctx_keys) + + assert last_at < first_prefix, "@-directives must come before prefixes" + assert last_prefix < first_term, "Prefixes must come before term entries" + + # Verify each group is sorted internally + assert at_keys == sorted(at_keys), f"@-directives not sorted: {at_keys}" + assert prefix_keys == sorted(prefix_keys), f"Prefixes not sorted: {prefix_keys}" + assert term_keys == sorted(term_keys), f"Term entries not sorted: {term_keys}" + + +def test_non_deterministic_is_default(): + """Verify that ``deterministic`` defaults to False.""" + gen = OwlSchemaGenerator(SCHEMA) + assert gen.deterministic is False + + +def test_wl_handles_structurally_similar_bnodes(): + """Blank nodes with identical local structure but different named neighbours + must receive different WL 
signatures and thus different stable labels. + + This tests the core WL property: two BNodes that differ only in their + connected named nodes (URIs/literals) must be distinguishable. + """ + from rdflib import BNode, Graph, Namespace, URIRef + + from linkml.utils.generator import deterministic_turtle + + RDF_TYPE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type") + OWL_RESTRICTION = URIRef("http://www.w3.org/2002/07/owl#Restriction") + OWL_ON_PROP = URIRef("http://www.w3.org/2002/07/owl#onProperty") + OWL_ALL_VALUES = URIRef("http://www.w3.org/2002/07/owl#allValuesFrom") + + EX = Namespace("http://example.org/") + g = Graph() + + # Two restrictions with same structure but different property URIs + r1 = BNode() + g.add((r1, RDF_TYPE, OWL_RESTRICTION)) + g.add((r1, OWL_ON_PROP, EX.alpha)) + g.add((r1, OWL_ALL_VALUES, EX.Target1)) + + r2 = BNode() + g.add((r2, RDF_TYPE, OWL_RESTRICTION)) + g.add((r2, OWL_ON_PROP, EX.beta)) + g.add((r2, OWL_ALL_VALUES, EX.Target2)) + + RDFS_SUBCLASS = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf") + g.add((EX.MyClass, RDFS_SUBCLASS, r1)) + g.add((EX.MyClass, RDFS_SUBCLASS, r2)) + + # Must be deterministic across runs + out1 = deterministic_turtle(g) + out2 = deterministic_turtle(g) + assert out1 == out2, "WL-based serializer is not deterministic for similar BNodes" + + # Both restrictions must appear (not collapsed) + assert "alpha" in out1 + assert "beta" in out1 + + +def test_deterministic_turtle_no_bnodes(): + """Graphs with no blank nodes should still produce sorted, deterministic output.""" + from rdflib import Graph, Literal, Namespace + from rdflib.namespace import RDFS + + from linkml.utils.generator import deterministic_turtle + + EX = Namespace("http://example.org/") + g = Graph() + g.add((EX.B, RDFS.label, Literal("B"))) + g.add((EX.A, RDFS.label, Literal("A"))) + + out1 = deterministic_turtle(g) + out2 = deterministic_turtle(g) + assert out1 == out2 + + # A should appear before B (sorted) + a_pos = 
out1.find("example.org/A") + b_pos = out1.find("example.org/B") + assert a_pos < b_pos, "Triples should be sorted: A before B" + + +@pytest.mark.xfail( + reason=( + "Collection sorting (owl:oneOf, sh:in) in deterministic mode intentionally " + "reorders RDF list triples for canonical output. The resulting graph is " + "semantically equivalent (OWL/SHACL interpret these as unordered sets) but " + "not RDF-isomorphic because rdf:first/rdf:rest chains encode ordering." + ), + strict=True, +) +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_turtle_is_isomorphic(generator_cls): + """Deterministic output is NOT RDF-isomorphic to non-deterministic output. + + This documents the trade-off identified in linkml/linkml#3295 review: + deterministic mode sorts Collection inputs (owl:oneOf, sh:in, + sh:ignoredProperties) to produce canonical RDF list ordering. Since RDF + Collections encode order via rdf:first/rdf:rest triples, the sorted graph + is structurally different from the insertion-order graph — even though the + OWL/SHACL semantics are identical (these Collections represent sets). 
+ + The test is marked xfail(strict=True) so that it: + - Documents the known, intentional non-isomorphism + - Alerts maintainers if the behaviour changes (strict xfail fails on pass) + """ + out_det = generator_cls(SCHEMA, deterministic=True).serialize() + out_nondet = generator_cls(SCHEMA, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=out_det, format="turtle") + + g_nondet = Graph() + g_nondet.parse(data=out_nondet, format="turtle") + + assert len(g_det) == len(g_nondet), ( + f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}" + ) + assert isomorphic(g_det, g_nondet), ( + f"{generator_cls.__name__}: deterministic output is NOT isomorphic " + "to non-deterministic output — the serialization changed the graph" + ) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_non_deterministic_output_unchanged(generator_cls): + """Non-deterministic output must still produce valid RDF. + + Ensures that changes for deterministic mode don't break default behavior. 
+ """ + out = generator_cls(SCHEMA, deterministic=False).serialize() + assert len(out) > 100, "Output suspiciously short" + g = Graph() + g.parse(data=out, format="turtle") + assert len(g) > 50, f"Graph has too few triples ({len(g)})" + + +@pytest.mark.parametrize( + "generator_cls,kwargs", + [ + (OwlSchemaGenerator, {}), + (ShaclGenerator, {}), + (ContextGenerator, {}), + (JSONLDGenerator, {}), + ], + ids=["owl", "shacl", "context", "jsonld"], +) +def test_non_deterministic_produces_valid_output(generator_cls, kwargs): + """All generators must produce valid output in non-deterministic mode.""" + out = generator_cls(SCHEMA, deterministic=False, **kwargs).serialize() + assert len(out) > 100, f"{generator_cls.__name__} output suspiciously short" + + +@pytest.mark.xfail( + reason=( + "Collection sorting in deterministic mode produces non-isomorphic RDF " + "(different rdf:first/rdf:rest triples). See test_deterministic_turtle_is_isomorphic." + ), + strict=True, +) +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_kitchen_sink_isomorphic(generator_cls): + """Isomorphism check on the complex kitchen_sink schema. + + Expected to fail for the same reason as test_deterministic_turtle_is_isomorphic: + Collection sorting changes the RDF structure while preserving OWL/SHACL semantics. 
+ """ + out_det = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + out_nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=out_det, format="turtle") + + g_nondet = Graph() + g_nondet.parse(data=out_nondet, format="turtle") + + assert isomorphic(g_det, g_nondet), ( + f"{generator_cls.__name__}: kitchen_sink deterministic output is NOT isomorphic to non-deterministic output" + ) + + +@pytest.mark.skipif(False, reason="does not require pyoxigraph") +def test_expression_sort_key_is_stable(): + """``_expression_sort_key`` must produce stable, content-based keys. + + LinkML anonymous expressions inherit ``YAMLRoot.__repr__()``, which + formats objects using **field values** (not memory addresses). + The ``_expression_sort_key`` helper relies on this for deterministic + ordering of ``any_of`` / ``all_of`` / ``none_of`` members. + + This test verifies that: + 1. Two distinct objects with identical fields produce the same key. + 2. Objects with different fields produce different keys. + 3. Sorting is stable across repeated calls. 
+ """ + from linkml.generators.owlgen import _expression_sort_key + from linkml_runtime.linkml_model.meta import AnonymousClassExpression, AnonymousSlotExpression + + # Two distinct objects with identical content → same key + a1 = AnonymousClassExpression(is_a="Parent") + a2 = AnonymousClassExpression(is_a="Parent") + assert a1 is not a2 + assert _expression_sort_key(a1) == _expression_sort_key(a2) + + # Different content → different keys + b = AnonymousClassExpression(is_a="Child") + assert _expression_sort_key(a1) != _expression_sort_key(b) + + # Sorting stability: same order every time + items = [b, a1, a2] + for _ in range(5): + result = sorted(items, key=_expression_sort_key) + # "Child" < "Parent" alphabetically, so b comes first + assert _expression_sort_key(result[0]) == _expression_sort_key(b) + assert _expression_sort_key(result[1]) == _expression_sort_key(result[2]) # a1, a2 together + + # Slot expressions work too + s1 = AnonymousSlotExpression(range="string") + s2 = AnonymousSlotExpression(range="integer") + assert _expression_sort_key(s1) != _expression_sort_key(s2) + order1 = sorted([s2, s1], key=_expression_sort_key) + order2 = sorted([s1, s2], key=_expression_sort_key) + assert [_expression_sort_key(x) for x in order1] == [_expression_sort_key(x) for x in order2] diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 11f85e142c..d66ba89606 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -1373,6 +1373,7 @@ def _build_message_test_schema(): # Helper functions # --------------------------------------------------------------------------- + def _parse_shacl(schema, **kwargs): shacl = ShaclGenerator(schema, mergeimports=False, **kwargs).serialize() g = rdflib.Graph() From 7d774e34edd045f5dada07abc93e2a6a049c5417 Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Tue, 9 Jun 2026 18:20:03 +0200 Subject: [PATCH 10/12] 
fix(shaclgen): apply default_language to SPARQL constraint messages When --default-language is set, the sh:message literal on SPARQL constraints (sh:SPARQLConstraint) was emitted without a language tag. Add lang=self._resolve_language() to the Literal() constructor call for SPARQL rule descriptions. Signed-off-by: Carlo van Driesten --- packages/linkml/src/linkml/generators/shaclgen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index f344d21c97..e56285edb7 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -452,7 +452,7 @@ def _add_rules(self, g: Graph, shape_uri: URIRef, cls: ClassDefinition) -> None: message = getattr(rule, "description", None) if message: - g.add((constraint, SH.message, Literal(message))) + g.add((constraint, SH.message, Literal(message, lang=self._resolve_language()))) g.add((constraint, SH.select, Literal(sparql_query))) From 7aabeb5845a27997c5e5ae008dd0e486181c4da4 Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Tue, 12 May 2026 14:17:49 +0200 Subject: [PATCH 11/12] fix(generators): normalize trailing newline in Turtle serialization rdflib's Turtle serializer always emits a trailing double newline. Normalize to single newline in deterministic_turtle() and the rdflib fallback path in canonicalize_rdf_graph() for consistent file endings. Note: CLI print() still adds a newline after serialize()'s trailing newline. Callers capturing stdout should strip trailing blank lines (e.g. via sed). 
Signed-off-by: Carlo van Driesten --- packages/linkml/src/linkml/generators/jsonldcontextgen.py | 4 ++-- packages/linkml/src/linkml/utils/generator.py | 4 +++- packages/linkml/src/linkml/utils/rdf_canonicalize.py | 5 ++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index bc52c11008..0c81a0edc4 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -310,8 +310,8 @@ def end_schema( json.dump(frame, f, indent=2, ensure_ascii=False) if self.deterministic: - return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n" - return str(as_json(context)) + "\n" + return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + return str(as_json(context)) @staticmethod def _deterministic_context_json(data: dict, indent: int = 3) -> str: diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 0aab3c40dd..99121b50e4 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -459,7 +459,9 @@ def _to_rdflib(term): if pfx_s and any(iri.startswith(ns_s) for iri in used_iris): result_graph.bind(pfx_s, ns_s) - return result_graph.serialize(format="turtle") + # rdflib's Turtle serializer always emits a trailing double newline; + # normalize to a single newline for consistent file endings. 
+ return result_graph.serialize(format="turtle").rstrip("\n") + "\n" def deterministic_json(obj: object, indent: int = 3, preserve_list_order_keys: frozenset[str] | None = None) -> str: diff --git a/packages/linkml/src/linkml/utils/rdf_canonicalize.py b/packages/linkml/src/linkml/utils/rdf_canonicalize.py index da57f23399..4b6f093b29 100644 --- a/packages/linkml/src/linkml/utils/rdf_canonicalize.py +++ b/packages/linkml/src/linkml/utils/rdf_canonicalize.py @@ -146,7 +146,10 @@ def canonicalize_rdf_graph( "pyoxigraph does not support format %r; falling back to rdflib serializer", output_format, ) - return graph.serialize(format=output_format) + # rdflib's Turtle serializer emits a trailing double newline; + # normalize to single newline for consistent file endings. + data = graph.serialize(format=output_format) + return data.rstrip("\n") + "\n" if data.endswith("\n") else data # 1. Transfer rdflib graph to pyoxigraph via N-Triples. nt_data = graph.serialize(format="nt") From b2b3dba5d2abd88c9773028bb8527e4f2bed2283 Mon Sep 17 00:00:00 2001 From: Carlo van Driesten Date: Tue, 9 Jun 2026 20:42:12 +0200 Subject: [PATCH 12/12] fix(shaclgen): address PR #3451 review comments Address all 6 review comments from @amc-corey-cox: 1. bidirectional rules: now skip-and-warn (continue) instead of emit-and-warn, preventing silent semantic divergence. 2. elseconditions: add explicit warning when else branch is dropped, so schema authors know only the forward branch is emitted. 3. PresenceEnum wrapping: keep PresenceEnum(PresenceEnum.PRESENT) - it is NOT redundant. PresenceEnum.PRESENT is a PermissibleValue, but parsed schemas return PresenceEnum instances; the wrapping ensures type-compatible comparison. Added explanatory comment. 4. xsd:boolean vs xsd:string: change SPARQL from ?flag != true to str(?flag) != "true" so the comparison works regardless of whether the data stores the flag as xsd:boolean or xsd:string. 5. 
Unused fixture: delete boolean_guard_rules.yaml (tests inline duplicate schemas as Python strings). 6. End-to-end pyshacl test: add test_rule_boolean_guard_pyshacl_end_to_end that validates a conforming instance passes and a violating instance (missing flag) is correctly flagged by pyshacl with advanced=True. Additional tests added: - test_rule_with_elseconditions_warns (warning emission) - test_rule_bidirectional_skipped (skip behavior + warning) Signed-off-by: Carlo van Driesten --- .../linkml/src/linkml/generators/shaclgen.py | 17 ++- .../input/shaclgen/boolean_guard_rules.yaml | 70 ----------- tests/linkml/test_generators/test_shaclgen.py | 118 +++++++++++++++++- 3 files changed, 131 insertions(+), 74 deletions(-) delete mode 100644 tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index e56285edb7..bcf88037c2 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -424,10 +424,11 @@ def _add_rules(self, g: Graph, shape_uri: URIRef, cls: ClassDefinition) -> None: if getattr(rule, "bidirectional", False): logger.warning( "Rule in class %r has bidirectional=true; " - "SHACL-SPARQL generation does not yet support bidirectional rules. " - "Only the forward direction is emitted.", + "SHACL-SPARQL generation does not support bidirectional rules. " + "Skipping this rule entirely.", cls.name, ) + continue if getattr(rule, "open_world", False): logger.warning( @@ -437,6 +438,14 @@ def _add_rules(self, g: Graph, shape_uri: URIRef, cls: ClassDefinition) -> None: cls.name, ) + if getattr(rule, "elseconditions", None): + logger.warning( + "Rule in class %r has elseconditions; " + "only the forward (if/then) branch is emitted as sh:sparql. 
" + "The else branch cannot be represented in SHACL-SPARQL.", + cls.name, + ) + sparql_query = self._rule_to_sparql(sv, cls, rule) if sparql_query is None: logger.debug( @@ -479,6 +488,8 @@ def _rule_to_sparql(self, sv, cls: ClassDefinition, rule) -> str | None: pre_cond = pre_slots[pre_slot_name] post_cond = post_slots[post_slot_name] + # Note: PresenceEnum.PRESENT is a PermissibleValue, but parsed schemas + # return PresenceEnum instances — wrapping ensures type-compatible comparison. is_value_present = getattr(pre_cond, "value_presence", None) == PresenceEnum(PresenceEnum.PRESENT) is_flag_true = getattr(post_cond, "equals_string", None) == "true" @@ -515,7 +526,7 @@ def _build_boolean_guard_sparql(self, sv, cls: ClassDefinition, flag_slot_name: f" OPTIONAL {{ $this <{flag_uri}> ?flag . }}\n" f" OPTIONAL {{ $this <{value_uri}> ?value . }}\n" f" FILTER (\n" - f" ( !BOUND(?flag) || ?flag != true ) &&\n" + f' ( !BOUND(?flag) || str(?flag) != "true" ) &&\n' f" BOUND(?value)\n" f" )\n" f"}}" diff --git a/tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml b/tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml deleted file mode 100644 index f56c2eca6a..0000000000 --- a/tests/linkml/test_generators/input/shaclgen/boolean_guard_rules.yaml +++ /dev/null @@ -1,70 +0,0 @@ -id: https://example.org/boolean-guards -name: boolean_guard_rules -description: >- - Test schema for SHACL generation of sh:sparql constraints from LinkML rules. - Models the boolean-guard pattern where a boolean flag must be true if a - corresponding value property is present. - -prefixes: - linkml: https://w3id.org/linkml/ - ex: https://example.org/boolean-guards/ - -imports: - - linkml:types - -default_prefix: ex -default_range: string - -slots: - WeatherWind: - description: Whether wind conditions are present. - range: boolean - slot_uri: ex:WeatherWind - weatherWindValue: - description: Wind speed value. 
- range: decimal - slot_uri: ex:weatherWindValue - WeatherRain: - description: Whether rain conditions are present. - range: boolean - slot_uri: ex:WeatherRain - weatherRainValue: - description: Rain intensity value. - range: decimal - slot_uri: ex:weatherRainValue - Temperature: - description: Ambient temperature. - range: decimal - slot_uri: ex:Temperature - -classes: - Environment: - description: Environmental conditions. - class_uri: ex:Environment - slots: - - WeatherWind - - weatherWindValue - - WeatherRain - - weatherRainValue - - Temperature - rules: - - description: >- - If weatherWindValue is provided, WeatherWind must be true. - preconditions: - slot_conditions: - weatherWindValue: - value_presence: PRESENT - postconditions: - slot_conditions: - WeatherWind: - equals_string: "true" - - description: >- - If weatherRainValue is provided, WeatherRain must be true. - preconditions: - slot_conditions: - weatherRainValue: - value_presence: PRESENT - postconditions: - slot_conditions: - WeatherRain: - equals_string: "true" diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index d66ba89606..8b3ce5e89b 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -2158,7 +2158,8 @@ def test_rule_no_explicit_slot_uri(): def test_rule_with_elseconditions_emitted(): - """Rules with elseconditions now emit the forward (if/then) branch as sh:sparql.""" + """Rules with elseconditions emit the forward (if/then) branch and warn.""" + g = _parse_shacl(_ELSE_COND_SCHEMA_YAML) shape = URIRef("https://example.org/else-test/TestClass") @@ -2166,6 +2167,121 @@ def test_rule_with_elseconditions_emitted(): assert len(sparql_nodes) >= 1, "Rule with elseconditions should emit sh:sparql for the forward branch" +def test_rule_with_elseconditions_warns(caplog): + """Rules with elseconditions emit a warning about the dropped else branch.""" + import logging + + with 
caplog.at_level(logging.WARNING): + _parse_shacl(_ELSE_COND_SCHEMA_YAML) + + assert any("elseconditions" in rec.message for rec in caplog.records), ( + "Expected a warning about elseconditions being dropped" + ) + + +_BIDIRECTIONAL_RULE_SCHEMA_YAML = """ +id: https://example.org/bidir-test +name: bidir_rule_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/bidir-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + Flag: + range: boolean + slot_uri: ex:Flag + flagValue: + range: decimal + slot_uri: ex:flagValue +classes: + TestClass: + class_uri: ex:TestClass + slots: + - Flag + - flagValue + rules: + - description: Bidirectional rule should be skipped. + bidirectional: true + preconditions: + slot_conditions: + flagValue: + value_presence: PRESENT + postconditions: + slot_conditions: + Flag: + equals_string: "true" +""" + + +def test_rule_bidirectional_skipped(caplog): + """Rules with bidirectional=true are skipped entirely with a warning.""" + import logging + + with caplog.at_level(logging.WARNING): + g = _parse_shacl(_BIDIRECTIONAL_RULE_SCHEMA_YAML) + + shape = URIRef("https://example.org/bidir-test/TestClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0, "Bidirectional rules should NOT emit sh:sparql" + assert any("bidirectional" in rec.message for rec in caplog.records), ( + "Expected a warning about bidirectional rules being skipped" + ) + + +# --------------------------------------------------------------------------- +# End-to-end pyshacl validation test +# --------------------------------------------------------------------------- + + +def test_rule_boolean_guard_pyshacl_end_to_end(): + """End-to-end: pyshacl flags a violation and passes a conforming instance.""" + import pyshacl + + shacl_ttl = ShaclGenerator(_RULES_SCHEMA_YAML, mergeimports=False, emit_rules=True).serialize() + + # Build a conforming RDF instance: weatherWindValue present AND WeatherWind = 
true + conforming_data = """ + @prefix ex: . + @prefix xsd: . + + ex:env1 a ex:Environment ; + ex:WeatherWind "true"^^xsd:boolean ; + ex:weatherWindValue "12.5"^^xsd:decimal . + """ + + # Build a violating RDF instance: weatherWindValue present but WeatherWind missing + violating_data = """ + @prefix ex: . + @prefix xsd: . + + ex:env2 a ex:Environment ; + ex:weatherWindValue "8.0"^^xsd:decimal . + """ + + # Conforming instance should pass + conforms, _, _ = pyshacl.validate( + data_graph=conforming_data, + shacl_graph=shacl_ttl, + data_graph_format="turtle", + shacl_graph_format="turtle", + advanced=True, + ) + assert conforms, "Conforming instance should pass SHACL validation" + + # Violating instance should fail + conforms, results_graph, results_text = pyshacl.validate( + data_graph=violating_data, + shacl_graph=shacl_ttl, + data_graph_format="turtle", + shacl_graph_format="turtle", + advanced=True, + ) + assert not conforms, f"Violating instance should fail SHACL validation:\n{results_text}" + + # --------------------------------------------------------------------------- # SPARQL syntax validation # ---------------------------------------------------------------------------