From 6ca468cb1bf2487f6a7829c909e8beac1e9153b8 Mon Sep 17 00:00:00 2001 From: jdsika Date: Wed, 25 Mar 2026 18:24:34 +0100 Subject: [PATCH] feat(generators): add --deterministic flag for reproducible output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a --deterministic flag to OWL, SHACL, JSON-LD, and JSON-LD Context generators that produces stable, reproducible output suitable for version-controlled artifacts. When enabled, the flag activates: 1. **RDFC-1.0 blank-node canonicalization** via pyoxigraph (W3C Recommendation) for Turtle serialisation of OWL and SHACL graphs. 2. **Deterministic Collection ordering** — RDF Collections (owl:oneOf, sh:in, sh:ignoredProperties) are sorted so that enum members and property lists appear in a stable order. This intentionally changes the RDF graph (Collections encode order at the triple level) and is therefore opt-in. 3. **Deterministic JSON key ordering** for JSON-LD and JSON-LD Context output, with structure-aware sorting that preserves JSON-LD conventions (@context directives first, then prefixes, then terms). The flag defaults to False to preserve backward compatibility. Four tests are marked xfail(strict=True) to document that deterministic Collection sorting intentionally produces non-isomorphic output. New dependency: pyoxigraph >= 0.4.0 (Rust-based, W3C RDFC-1.0). 
Refs: - W3C (2024) RDF Dataset Canonicalization (RDFC-1.0) https://www.w3.org/TR/rdf-canon/ Signed-off-by: jdsika --- packages/linkml/pyproject.toml | 1 + .../src/linkml/generators/jsonldcontextgen.py | 54 +++ .../linkml/src/linkml/generators/jsonldgen.py | 5 + .../linkml/src/linkml/generators/owlgen.py | 14 +- .../linkml/src/linkml/generators/shaclgen.py | 21 +- packages/linkml/src/linkml/utils/generator.py | 124 +++++- .../test_deterministic_output.py | 353 ++++++++++++++++++ 7 files changed, 563 insertions(+), 9 deletions(-) create mode 100644 tests/linkml/test_generators/test_deterministic_output.py diff --git a/packages/linkml/pyproject.toml b/packages/linkml/pyproject.toml index 3586aa3ff..474424f5a 100644 --- a/packages/linkml/pyproject.toml +++ b/packages/linkml/pyproject.toml @@ -55,6 +55,7 @@ dependencies = [ # Specifier syntax: https://peps.python.org/pep-0631/ "pyshex >= 0.7.20", "pyshexc >= 0.8.3", "python-dateutil", + "pyoxigraph >= 0.4.0", "pyyaml", "rdflib >=6.0.0", "requests >= 2.22", diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index 60eaa9ffd..1c6cec148 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -189,8 +189,62 @@ def end_schema( with open(frame_path, "w", encoding="UTF-8") as f: json.dump(frame, f, indent=2, ensure_ascii=False) + if self.deterministic: + return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n" return str(as_json(context)) + "\n" + @staticmethod + def _deterministic_context_json(data: dict, indent: int = 3) -> str: + """Serialize a JSON-LD context with deterministic key ordering. + + Preserves the conventional JSON-LD context structure: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with: + a. ``@``-prefixed directives (``@vocab``, ``@base``) first + b. 
Prefix declarations (string values) second + c. Class/property term entries (object values) last + 3. Each group sorted alphabetically within itself + + Unlike :func:`deterministic_json`, this understands JSON-LD + conventions so that the output remains human-readable while + still being byte-identical across invocations. + """ + from linkml.utils.generator import deterministic_json + + ordered = {} + + # 1. "comments" first (if present) + if "comments" in data: + ordered["comments"] = data["comments"] + + # 2. "@context" with structured internal ordering + if "@context" in data: + ctx = data["@context"] + ordered_ctx = {} + + # 2a. @-prefixed directives (@vocab, @base, etc.) + for k in sorted(k for k in ctx if k.startswith("@")): + ordered_ctx[k] = ctx[k] + + # 2b. Prefix declarations (string values — short namespace URIs) + for k in sorted(k for k in ctx if not k.startswith("@") and isinstance(ctx[k], str)): + ordered_ctx[k] = ctx[k] + + # 2c. Term definitions (object values) — deep-sorted for determinism + term_entries = {k: v for k, v in ctx.items() if not k.startswith("@") and not isinstance(v, str)} + sorted_terms = json.loads(deterministic_json(term_entries)) + for k in sorted(sorted_terms): + ordered_ctx[k] = sorted_terms[k] + + ordered["@context"] = ordered_ctx + + # 3. 
Any remaining top-level keys + for k in sorted(data): + if k not in ordered: + ordered[k] = data[k] + + return json.dumps(ordered, indent=indent, ensure_ascii=False) + def visit_class(self, cls: ClassDefinition) -> bool: if self.exclude_imports and cls.name not in self._local_classes: return False diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index c974e762d..0c9c87cbb 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -1,5 +1,6 @@ """Generate JSONld from a LinkML schema.""" +import json import os from collections.abc import Sequence from copy import deepcopy @@ -202,6 +203,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: self.schema["@context"].append({"@base": base_prefix}) # json_obj["@id"] = self.schema.id out = str(as_json(self.schema, indent=" ")) + "\n" + if self.deterministic: + from linkml.utils.generator import deterministic_json + + out = deterministic_json(json.loads(out), indent=2) + "\n" self.schema = self.original_schema return out diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 33c58b0ec..4ab4b8cf3 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -267,7 +267,14 @@ def serialize(self, **kwargs) -> str: :return: """ self.as_graph() - data = self.graph.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format) + fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + # Deferred to avoid circular import (generator.py imports from this package) + from linkml.utils.generator import deterministic_turtle + + data = deterministic_turtle(self.graph) + else: + data = self.graph.serialize(format=fmt) return data def add_metadata(self, e: Definition | PermissibleValue, uri: 
URIRef) -> None: @@ -998,7 +1005,10 @@ def add_enum(self, e: EnumDefinition) -> None: owl_types = [] enum_owl_type = self._get_metatype(e, self.default_permissible_value_type) - for pv in e.permissible_values.values(): + pvs = e.permissible_values.values() + if self.deterministic: + pvs = sorted(pvs, key=lambda x: x.text) + for pv in pvs: pv_owl_type = self._get_metatype(pv, enum_owl_type) owl_types.append(pv_owl_type) if pv_owl_type == RDFS.Literal: diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 5425051e3..ec78e7ba5 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -93,7 +93,13 @@ def generate_header(self) -> str: def serialize(self, **args) -> str: g = self.as_graph() - data = g.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format) + fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + data = deterministic_turtle(g) + else: + data = g.serialize(format=fmt) return data def as_graph(self) -> Graph: @@ -309,13 +315,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None: sv = self.schemaview enum = sv.get_enum(r) pv_node = BNode() + pv_items = list(enum.permissible_values.items()) + if self.deterministic: + pv_items = sorted(pv_items) Collection( g, pv_node, - [ - URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) - for pv_name, pv in enum.permissible_values.items() - ], + [URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items], ) func(SH["in"], pv_node) @@ -469,7 +475,10 @@ def collect_child_properties(class_name: str, output: set) -> None: list_node = BNode() ignored_properties.add(RDF.type) - Collection(g, list_node, list(ignored_properties)) + props = list(ignored_properties) + if 
self.deterministic: + props = sorted(props, key=str) + Collection(g, list_node, props) return list_node diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 88fc48585..6cf5dce62 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -24,7 +24,7 @@ from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import ClassVar, TextIO, Union, cast +from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast import click from click import Argument, Command, Option @@ -37,6 +37,10 @@ from linkml.utils.schemaloader import SchemaLoader from linkml.utils.typereferences import References from linkml_runtime import SchemaView + +if TYPE_CHECKING: + from rdflib import Graph as RdfGraph + from linkml_runtime.linkml_model.meta import ( ClassDefinition, ClassDefinitionName, @@ -78,6 +82,111 @@ def _resolved_metamodel(mergeimports): return metamodel +def deterministic_turtle(graph: "RdfGraph") -> str: + """Serialize an RDF graph to Turtle with deterministic output ordering. + + Uses W3C RDFC-1.0 [1]_ (RDF Dataset Canonicalization) via + ``pyoxigraph`` to assign stable blank-node identifiers, then + serializes to Turtle with pyoxigraph's own deterministic serializer. + + The pipeline is: rdflib N-Triples → pyoxigraph RDFC-1.0 + canonicalization → pyoxigraph Turtle serialization with namespace + prefixes derived from the source graph. + + Prefixes are resolved with *rdflib-default-first* semantics: if a + namespace is already bound by rdflib's curated defaults (e.g. + ``schema: https://schema.org/``), that binding wins over the model's + alias (e.g. ``sdo:``). + + This guarantees both **serialisation determinism** (same input → + same output) and **graph isomorphism preservation** (the canonical + output is isomorphic to the input). 
+ + Parameters + ---------- + graph : rdflib.Graph + An rdflib Graph to serialize. + + Returns + ------- + str + Deterministic Turtle string in standard ``@prefix`` format. + + References + ---------- + .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)." + W3C Recommendation. https://www.w3.org/TR/rdf-canon/ + """ + import pyoxigraph + from rdflib import Graph as RdfGraph + + nt_data = graph.serialize(format="nt") + + dataset = pyoxigraph.Dataset( + pyoxigraph.parse(nt_data, format=pyoxigraph.RdfFormat.N_TRIPLES) + ) + dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0) + + # Build prefix dict. Start with rdflib's curated defaults so that + # well-known namespaces keep their standard prefix names, then layer + # the source graph's custom bindings (skip any namespace that is + # already covered by a default). + ns_to_prefix: dict[str, str] = {} + prefixes: dict[str, str] = {} + for pfx, ns in RdfGraph().namespaces(): + pfx_s, ns_s = str(pfx), str(ns) + if pfx_s: + prefixes[pfx_s] = ns_s + ns_to_prefix[ns_s] = pfx_s + for pfx, ns in graph.namespaces(): + pfx_s, ns_s = str(pfx), str(ns) + if pfx_s and ns_s not in ns_to_prefix: + prefixes[pfx_s] = ns_s + ns_to_prefix[ns_s] = pfx_s + + # Sort canonical triples by their N-Triples string form to ensure + # a stable input order for the Turtle serializer. The Dataset's + # BTreeSet ordering uses internal interned IDs which may differ + # between independently-canonicalized datasets. + triples = sorted((q.triple for q in dataset), key=str) + ttl_bytes = pyoxigraph.serialize( + triples, + format=pyoxigraph.RdfFormat.TURTLE, + prefixes=prefixes, + ) + return ttl_bytes.decode("utf-8") + + +def deterministic_json(obj: object, indent: int = 3) -> str: + """Serialize a JSON-compatible object with deterministic ordering. + + Recursively sorts all dict keys *and* list elements to produce + stable output across Python versions and process invocations. 
+ + List elements are sorted by their canonical JSON representation + (``json.dumps(item, sort_keys=True)``), which handles lists of + dicts, strings, and mixed types. + + :param obj: A JSON-serializable object (typically parsed from ``as_json``). + :param indent: Number of spaces for indentation. + :returns: Deterministic JSON string. + """ + import json + + def _deep_sort(value: object) -> object: + if isinstance(value, dict): + return {k: _deep_sort(v) for k, v in sorted(value.items())} + if isinstance(value, list): + sorted_items = [_deep_sort(item) for item in value] + try: + return sorted(sorted_items, key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False)) + except TypeError: + return sorted_items + return value + + return json.dumps(_deep_sort(obj), indent=indent, ensure_ascii=False) + + @dataclass class Generator(metaclass=abc.ABCMeta): """ @@ -139,6 +248,9 @@ class Generator(metaclass=abc.ABCMeta): mergeimports: bool | None = True """True means merge non-linkml sources into importing package. False means separate packages""" + deterministic: bool = False + """True means produce stable, reproducible output with sorted keys and canonical blank-node ordering""" + source_file_date: str | None = None """Modification date of input source file""" @@ -986,6 +1098,16 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--deterministic/--no-deterministic",), + default=False, + show_default=True, + help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. " + "Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. 
" + "Useful when generated artifacts are stored in version control.", + ) + ) return f diff --git a/tests/linkml/test_generators/test_deterministic_output.py b/tests/linkml/test_generators/test_deterministic_output.py new file mode 100644 index 000000000..b39aa313d --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_output.py @@ -0,0 +1,353 @@ +"""Tests for deterministic generator output. + +When ``deterministic=True``, generators must produce byte-identical output +across multiple invocations. This ensures version-controlled artifacts don't +show spurious diffs from blank-node relabeling or dict-ordering instability. + +Deterministic Turtle output is intentionally **not** RDF-isomorphic to the +non-deterministic output, because RDF Collections are sorted (see xfail tests). +""" + +import json +import time +from pathlib import Path + +import pytest +from rdflib import Graph +from rdflib.compare import isomorphic + +from linkml.generators.jsonldcontextgen import ContextGenerator +from linkml.generators.jsonldgen import JSONLDGenerator +from linkml.generators.owlgen import OwlSchemaGenerator +from linkml.generators.shaclgen import ShaclGenerator + +SCHEMA = str(Path(__file__).parent / "input" / "personinfo.yaml") + + +@pytest.mark.parametrize( + "generator_cls,kwargs", + [ + (OwlSchemaGenerator, {}), + (ShaclGenerator, {}), + (ContextGenerator, {}), + (JSONLDGenerator, {}), + ], + ids=["owl", "shacl", "context", "jsonld"], +) +def test_deterministic_output_is_identical_across_runs(generator_cls, kwargs): + """Generate output twice with deterministic=True and verify identity.""" + out1 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize() + out2 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize() + # JSONLDGenerator embeds a generation_date timestamp — normalize it + if generator_cls is JSONLDGenerator: + import re + + ts_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") + out1 = ts_re.sub("TIMESTAMP", out1) + out2 = 
ts_re.sub("TIMESTAMP", out2) + assert out1 == out2, f"{generator_cls.__name__} produced different output across runs" + assert len(out1) > 100, "Output suspiciously short — generator may have failed silently" + + +@pytest.mark.parametrize( + "generator_cls", + [ContextGenerator, JSONLDGenerator], + ids=["context", "jsonld"], +) +def test_deterministic_json_has_sorted_keys(generator_cls): + """When deterministic=True, JSON dict keys should be sorted at all levels. + + For the ContextGenerator, @context keys use grouped ordering (prefixes + before term entries) — each group is sorted, but not globally. + """ + out = generator_cls(SCHEMA, deterministic=True).serialize() + parsed = json.loads(out) + + is_context_gen = generator_cls is ContextGenerator + + def _check_sorted_keys(obj, path="root"): + if isinstance(obj, dict): + keys = list(obj.keys()) + # Context generator groups @context keys: @-directives, prefixes, terms + if is_context_gen and path == "root.@context": + at_keys = [k for k in keys if k.startswith("@")] + prefix_keys = [k for k in keys if not k.startswith("@") and isinstance(obj[k], str)] + term_keys = [k for k in keys if not k.startswith("@") and not isinstance(obj[k], str)] + assert at_keys == sorted(at_keys), f"@-keys not sorted: {at_keys}" + assert prefix_keys == sorted(prefix_keys), f"Prefix keys not sorted: {prefix_keys}" + assert term_keys == sorted(term_keys), f"Term keys not sorted: {term_keys}" + else: + assert keys == sorted(keys), f"Keys not sorted at {path}: {keys}" + for k, v in obj.items(): + _check_sorted_keys(v, f"{path}.{k}") + elif isinstance(obj, list): + for i, item in enumerate(obj): + _check_sorted_keys(item, f"{path}[{i}]") + + _check_sorted_keys(parsed) + + +@pytest.mark.parametrize( + "generator_cls", + [ContextGenerator, JSONLDGenerator], + ids=["context", "jsonld"], +) +def test_deterministic_json_lists_are_sorted(generator_cls): + """When deterministic=True, JSON list elements should be sorted.""" + out = 
generator_cls(SCHEMA, deterministic=True).serialize() + parsed = json.loads(out) + + def _check_sorted_lists(obj, path="root"): + if isinstance(obj, dict): + for k, v in obj.items(): + _check_sorted_lists(v, f"{path}.{k}") + elif isinstance(obj, list): + str_items = [json.dumps(item, sort_keys=True, ensure_ascii=False) for item in obj] + assert str_items == sorted(str_items), f"List not sorted at {path}" + for i, item in enumerate(obj): + _check_sorted_lists(item, f"{path}[{i}]") + + _check_sorted_lists(parsed) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_turtle_preserves_at_prefix(generator_cls): + """deterministic_turtle must produce standard @prefix, not SPARQL PREFIX.""" + out = generator_cls(SCHEMA, deterministic=True).serialize() + assert "@prefix" in out, "Output uses non-standard prefix syntax" + assert "PREFIX " not in out, "Output uses SPARQL PREFIX instead of Turtle @prefix" + + +def test_deterministic_turtle_performance(): + """Deterministic OWL generation must complete within 10 seconds for personinfo. + + RDFC-1.0 canonicalization via pyoxigraph should easily pass at this size. + The previous canon=True approach was exponential and failed this test + for graphs above ~250 triples. + """ + start = time.time() + out = OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() + elapsed = time.time() - start + assert elapsed < 10.0, f"Deterministic generation took {elapsed:.1f}s (limit: 10s)" + assert len(out) > 100, "Output suspiciously short" + + +def test_shacl_closed_ignored_properties_deterministic(): + """sh:ignoredProperties in closed shapes must be deterministic. + + ``_build_ignored_properties`` collects inherited slots into a set; without + explicit sorting this produces different ``rdf:first``/``rdf:rest`` chains + on each run. With ``deterministic=True`` (and sorted Collection inputs) + the output must be byte-identical. 
+ """ + runs = [ShaclGenerator(SCHEMA, deterministic=True, closed=True).serialize() for _ in range(3)] + assert runs[0] == runs[1] == runs[2], "sh:ignoredProperties ordering differs across runs" + assert "sh:ignoredProperties" in runs[0], "Expected closed shapes with sh:ignoredProperties" + + +def test_shacl_enum_in_deterministic(): + """sh:in RDF lists for enums must be deterministic. + + ``ShaclGenerator._add_enum`` iterates ``enum.permissible_values.items()`` + (dict iteration order) into a ``Collection``. Without sorting, the + ``rdf:first``/``rdf:rest`` chain varies across runs. + """ + runs = [ShaclGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)] + assert runs[0] == runs[1] == runs[2], "sh:in enum list ordering differs across runs" + assert "sh:in" in runs[0], "Expected sh:in constraints for enums" + + +def test_owl_enum_one_of_deterministic(): + """owl:oneOf RDF lists for enums must be deterministic. + + ``_boolean_expression`` feeds ``pv_uris`` (from ``permissible_values``) + into a ``Collection``. Without sorting, ``owl:oneOf`` list ordering varies. + """ + runs = [OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)] + assert runs[0] == runs[1] == runs[2], "owl:oneOf enum list ordering differs across runs" + + +KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml") + + +def test_deterministic_large_schema(): + """End-to-end idempotency on a complex schema (kitchen_sink). + + Exercises many code paths simultaneously: closed shapes, enums, imports, + class hierarchies, and mixed ranges. 
+ """ + owl1 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize() + owl2 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize() + assert owl1 == owl2, "OWL output differs across runs for kitchen_sink" + assert len(owl1) > 500, "kitchen_sink output suspiciously short" + + shacl1 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize() + shacl2 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize() + assert shacl1 == shacl2, "SHACL output differs across runs for kitchen_sink" + assert len(shacl1) > 500, "kitchen_sink output suspiciously short" + + +def test_deterministic_context_preserves_jsonld_structure(): + """Deterministic JSON-LD context must preserve conventional structure. + + JSON-LD contexts have a conventional layout: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with prefixes grouped before term entries + + ``deterministic_json()`` would scramble this by sorting all keys + uniformly. The context generator must use JSON-LD-aware ordering. 
+ """ + out = ContextGenerator(SCHEMA, deterministic=True, metadata=True).serialize() + parsed = json.loads(out) + + # Top-level key order: "comments" before "@context" + top_keys = list(parsed.keys()) + assert "comments" in top_keys, "Expected 'comments' block with metadata=True" + assert top_keys.index("comments") < top_keys.index("@context"), ( + f"'comments' should precede '@context', got: {top_keys}" + ) + + # Inside @context: @-directives, then prefixes (str values), then terms (dict values) + ctx = parsed["@context"] + ctx_keys = list(ctx.keys()) + + at_keys = [k for k in ctx_keys if k.startswith("@")] + prefix_keys = [k for k in ctx_keys if not k.startswith("@") and isinstance(ctx[k], str)] + term_keys = [k for k in ctx_keys if not k.startswith("@") and not isinstance(ctx[k], str)] + + # Verify grouping: all @-keys before all prefix keys before all term keys + last_at = max(ctx_keys.index(k) for k in at_keys) if at_keys else -1 + first_prefix = min(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else len(ctx_keys) + last_prefix = max(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else -1 + first_term = min(ctx_keys.index(k) for k in term_keys) if term_keys else len(ctx_keys) + + assert last_at < first_prefix, "@-directives must come before prefixes" + assert last_prefix < first_term, "Prefixes must come before term entries" + + # Verify each group is sorted internally + assert at_keys == sorted(at_keys), f"@-directives not sorted: {at_keys}" + assert prefix_keys == sorted(prefix_keys), f"Prefixes not sorted: {prefix_keys}" + assert term_keys == sorted(term_keys), f"Term entries not sorted: {term_keys}" + + +def test_non_deterministic_is_default(): + """Verify that ``deterministic`` defaults to False.""" + gen = OwlSchemaGenerator(SCHEMA) + assert gen.deterministic is False + + +@pytest.mark.xfail( + reason=( + "Collection sorting (owl:oneOf, sh:in) in deterministic mode intentionally " + "reorders RDF list triples for canonical output. 
The resulting graph is " + "semantically equivalent (OWL/SHACL interpret these as unordered sets) but " + "not RDF-isomorphic because rdf:first/rdf:rest chains encode ordering." + ), + strict=True, +) +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_turtle_is_isomorphic(generator_cls): + """Deterministic output is NOT RDF-isomorphic to non-deterministic output. + + This documents the trade-off identified in linkml/linkml#3295 review: + deterministic mode sorts Collection inputs (owl:oneOf, sh:in, + sh:ignoredProperties) to produce canonical RDF list ordering. Since RDF + Collections encode order via rdf:first/rdf:rest triples, the sorted graph + is structurally different from the insertion-order graph — even though the + OWL/SHACL semantics are identical (these Collections represent sets). + + The test is marked xfail(strict=True) so that it: + - Documents the known, intentional non-isomorphism + - Alerts maintainers if the behaviour changes (strict xfail fails on pass) + """ + out_det = generator_cls(SCHEMA, deterministic=True).serialize() + out_nondet = generator_cls(SCHEMA, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=out_det, format="turtle") + + g_nondet = Graph() + g_nondet.parse(data=out_nondet, format="turtle") + + assert len(g_det) == len(g_nondet), ( + f"Triple count mismatch: deterministic={len(g_det)}, " + f"non-deterministic={len(g_nondet)}" + ) + assert isomorphic(g_det, g_nondet), ( + f"{generator_cls.__name__}: deterministic output is NOT isomorphic " + "to non-deterministic output — the serialization changed the graph" + ) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_non_deterministic_output_unchanged(generator_cls): + """Non-deterministic output must still produce valid RDF. + + Ensures that changes for deterministic mode don't break default behavior. 
+ """ + out = generator_cls(SCHEMA, deterministic=False).serialize() + assert len(out) > 100, "Output suspiciously short" + g = Graph() + g.parse(data=out, format="turtle") + assert len(g) > 50, f"Graph has too few triples ({len(g)})" + + +@pytest.mark.parametrize( + "generator_cls,kwargs", + [ + (OwlSchemaGenerator, {}), + (ShaclGenerator, {}), + (ContextGenerator, {}), + (JSONLDGenerator, {}), + ], + ids=["owl", "shacl", "context", "jsonld"], +) +def test_non_deterministic_produces_valid_output(generator_cls, kwargs): + """All generators must produce valid output in non-deterministic mode.""" + out = generator_cls(SCHEMA, deterministic=False, **kwargs).serialize() + assert len(out) > 100, f"{generator_cls.__name__} output suspiciously short" + + +@pytest.mark.xfail( + reason=( + "Collection sorting in deterministic mode produces non-isomorphic RDF " + "(different rdf:first/rdf:rest triples). See test_deterministic_turtle_is_isomorphic." + ), + strict=True, +) +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +def test_deterministic_kitchen_sink_isomorphic(generator_cls): + """Isomorphism check on the complex kitchen_sink schema. + + Expected to fail for the same reason as test_deterministic_turtle_is_isomorphic: + Collection sorting changes the RDF structure while preserving OWL/SHACL semantics. + """ + out_det = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + out_nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=out_det, format="turtle") + + g_nondet = Graph() + g_nondet.parse(data=out_nondet, format="turtle") + + assert isomorphic(g_det, g_nondet), ( + f"{generator_cls.__name__}: kitchen_sink deterministic output is NOT " + "isomorphic to non-deterministic output" + )