From 6ca468cb1bf2487f6a7829c909e8beac1e9153b8 Mon Sep 17 00:00:00 2001
From: jdsika <carlo.van-driesten@bmw.de>
Date: Wed, 25 Mar 2026 18:24:34 +0100
Subject: [PATCH] feat(generators): add --deterministic flag for reproducible
 output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a --deterministic flag to OWL, SHACL, JSON-LD, and JSON-LD Context
generators that produces stable, reproducible output suitable for
version-controlled artifacts.

When enabled, the flag activates:

1. **RDFC-1.0 blank-node canonicalization** (a W3C Recommendation),
   via pyoxigraph, for Turtle serialisation of OWL and SHACL graphs.
2. **Deterministic Collection ordering** — RDF Collections (owl:oneOf,
   sh:in, sh:ignoredProperties) are sorted so that enum members and
   property lists appear in a stable order.  This intentionally changes
   the RDF graph (Collections encode order at the triple level) and is
   therefore opt-in.
3. **Deterministic JSON key ordering** for JSON-LD and JSON-LD Context
   output, with structure-aware sorting that preserves JSON-LD
   conventions (@context directives first, then prefixes, then terms).

The flag defaults to False to preserve backward compatibility.  Four
tests are marked xfail(strict=True) to document that deterministic
Collection sorting intentionally produces non-isomorphic output.

New dependency: pyoxigraph >= 0.4.0 (Rust-based, W3C RDFC-1.0).

Refs:
- W3C (2024) RDF Dataset Canonicalization (RDFC-1.0)
  https://www.w3.org/TR/rdf-canon/

Signed-off-by: jdsika <carlo.van-driesten@bmw.de>
---
 packages/linkml/pyproject.toml                |   1 +
 .../src/linkml/generators/jsonldcontextgen.py |  54 +++
 .../linkml/src/linkml/generators/jsonldgen.py |   5 +
 .../linkml/src/linkml/generators/owlgen.py    |  14 +-
 .../linkml/src/linkml/generators/shaclgen.py  |  21 +-
 packages/linkml/src/linkml/utils/generator.py | 124 +++++-
 .../test_deterministic_output.py              | 353 ++++++++++++++++++
 7 files changed, 563 insertions(+), 9 deletions(-)
 create mode 100644 tests/linkml/test_generators/test_deterministic_output.py

diff --git a/packages/linkml/pyproject.toml b/packages/linkml/pyproject.toml
index 3586aa3ff..474424f5a 100644
--- a/packages/linkml/pyproject.toml
+++ b/packages/linkml/pyproject.toml
@@ -55,6 +55,7 @@ dependencies = [ # Specifier syntax: https://peps.python.org/pep-0631/
     "pyshex >= 0.7.20",
     "pyshexc >= 0.8.3",
     "python-dateutil",
+    "pyoxigraph >= 0.4.0",
     "pyyaml",
     "rdflib >=6.0.0",
     "requests >= 2.22",
diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py
index 60eaa9ffd..1c6cec148 100644
--- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py
+++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py
@@ -189,8 +189,62 @@ def end_schema(
             with open(frame_path, "w", encoding="UTF-8") as f:
                 json.dump(frame, f, indent=2, ensure_ascii=False)
 
+        if self.deterministic:
+            return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n"
         return str(as_json(context)) + "\n"
 
+    @staticmethod
+    def _deterministic_context_json(data: dict, indent: int = 3) -> str:
+        """Serialize a JSON-LD context with deterministic key ordering.
+
+        Preserves the conventional JSON-LD context structure:
+        1. ``comments`` block first (metadata)
+        2. ``@context`` block second, with:
+           a. ``@``-prefixed directives (``@vocab``, ``@base``) first
+           b. Prefix declarations (string values) second
+           c. Class/property term entries (object values) last
+        3. Each group sorted alphabetically within itself
+
+        Unlike :func:`deterministic_json`, this understands JSON-LD
+        conventions so that the output remains human-readable while
+        still being byte-identical across invocations.
+        """
+        from linkml.utils.generator import deterministic_json
+
+        ordered = {}
+
+        # 1. "comments" first (if present)
+        if "comments" in data:
+            ordered["comments"] = data["comments"]
+
+        # 2. "@context" with structured internal ordering
+        if "@context" in data:
+            ctx = data["@context"]
+            ordered_ctx = {}
+
+            # 2a. @-prefixed directives (@vocab, @base, etc.)
+            for k in sorted(k for k in ctx if k.startswith("@")):
+                ordered_ctx[k] = ctx[k]
+
+            # 2b. Prefix declarations (string values — short namespace URIs)
+            for k in sorted(k for k in ctx if not k.startswith("@") and isinstance(ctx[k], str)):
+                ordered_ctx[k] = ctx[k]
+
+            # 2c. Term definitions (object values) — deep-sorted for determinism
+            term_entries = {k: v for k, v in ctx.items() if not k.startswith("@") and not isinstance(v, str)}
+            sorted_terms = json.loads(deterministic_json(term_entries))
+            for k in sorted(sorted_terms):
+                ordered_ctx[k] = sorted_terms[k]
+
+            ordered["@context"] = ordered_ctx
+
+        # 3. Any remaining top-level keys
+        for k in sorted(data):
+            if k not in ordered:
+                ordered[k] = data[k]
+
+        return json.dumps(ordered, indent=indent, ensure_ascii=False)
+
     def visit_class(self, cls: ClassDefinition) -> bool:
         if self.exclude_imports and cls.name not in self._local_classes:
             return False
diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py
index c974e762d..0c9c87cbb 100644
--- a/packages/linkml/src/linkml/generators/jsonldgen.py
+++ b/packages/linkml/src/linkml/generators/jsonldgen.py
@@ -1,5 +1,6 @@
 """Generate JSONld from a LinkML schema."""
 
+import json
 import os
 from collections.abc import Sequence
 from copy import deepcopy
@@ -202,6 +203,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs:
                 self.schema["@context"].append({"@base": base_prefix})
         # json_obj["@id"] = self.schema.id
         out = str(as_json(self.schema, indent="  ")) + "\n"
+        if self.deterministic:
+            from linkml.utils.generator import deterministic_json
+
+            out = deterministic_json(json.loads(out), indent=2) + "\n"
         self.schema = self.original_schema
         return out
 
diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py
index 33c58b0ec..4ab4b8cf3 100644
--- a/packages/linkml/src/linkml/generators/owlgen.py
+++ b/packages/linkml/src/linkml/generators/owlgen.py
@@ -267,7 +267,14 @@ def serialize(self, **kwargs) -> str:
         :return:
         """
         self.as_graph()
-        data = self.graph.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format)
+        fmt = "turtle" if self.format in ["owl", "ttl"] else self.format
+        if self.deterministic and fmt == "turtle":
+            # Deferred to avoid circular import (generator.py imports from this package)
+            from linkml.utils.generator import deterministic_turtle
+
+            data = deterministic_turtle(self.graph)
+        else:
+            data = self.graph.serialize(format=fmt)
         return data
 
     def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None:
@@ -998,7 +1005,10 @@ def add_enum(self, e: EnumDefinition) -> None:
         owl_types = []
         enum_owl_type = self._get_metatype(e, self.default_permissible_value_type)
 
-        for pv in e.permissible_values.values():
+        pvs = e.permissible_values.values()
+        if self.deterministic:
+            pvs = sorted(pvs, key=lambda x: x.text)
+        for pv in pvs:
             pv_owl_type = self._get_metatype(pv, enum_owl_type)
             owl_types.append(pv_owl_type)
             if pv_owl_type == RDFS.Literal:
diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py
index 5425051e3..ec78e7ba5 100644
--- a/packages/linkml/src/linkml/generators/shaclgen.py
+++ b/packages/linkml/src/linkml/generators/shaclgen.py
@@ -93,7 +93,13 @@ def generate_header(self) -> str:
 
     def serialize(self, **args) -> str:
         g = self.as_graph()
-        data = g.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format)
+        fmt = "turtle" if self.format in ["owl", "ttl"] else self.format
+        if self.deterministic and fmt == "turtle":
+            from linkml.utils.generator import deterministic_turtle
+
+            data = deterministic_turtle(g)
+        else:
+            data = g.serialize(format=fmt)
         return data
 
     def as_graph(self) -> Graph:
@@ -309,13 +315,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None:
         sv = self.schemaview
         enum = sv.get_enum(r)
         pv_node = BNode()
+        pv_items = list(enum.permissible_values.items())
+        if self.deterministic:
+            pv_items = sorted(pv_items)
         Collection(
             g,
             pv_node,
-            [
-                URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name)
-                for pv_name, pv in enum.permissible_values.items()
-            ],
+            [URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items],
         )
         func(SH["in"], pv_node)
 
@@ -469,7 +475,10 @@ def collect_child_properties(class_name: str, output: set) -> None:
 
         list_node = BNode()
         ignored_properties.add(RDF.type)
-        Collection(g, list_node, list(ignored_properties))
+        props = list(ignored_properties)
+        if self.deterministic:
+            props = sorted(props, key=str)
+        Collection(g, list_node, props)
 
         return list_node
 
diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py
index 88fc48585..6cf5dce62 100644
--- a/packages/linkml/src/linkml/utils/generator.py
+++ b/packages/linkml/src/linkml/utils/generator.py
@@ -24,7 +24,7 @@
 from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
-from typing import ClassVar, TextIO, Union, cast
+from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast
 
 import click
 from click import Argument, Command, Option
@@ -37,6 +37,10 @@
 from linkml.utils.schemaloader import SchemaLoader
 from linkml.utils.typereferences import References
 from linkml_runtime import SchemaView
+
+if TYPE_CHECKING:
+    from rdflib import Graph as RdfGraph
+
 from linkml_runtime.linkml_model.meta import (
     ClassDefinition,
     ClassDefinitionName,
@@ -78,6 +82,111 @@ def _resolved_metamodel(mergeimports):
     return metamodel
 
 
+def deterministic_turtle(graph: "RdfGraph") -> str:
+    """Serialize an RDF graph to Turtle with deterministic output ordering.
+
+    Uses W3C RDFC-1.0 [1]_ (RDF Dataset Canonicalization) via
+    ``pyoxigraph`` to assign stable blank-node identifiers, then
+    serializes to Turtle with pyoxigraph's own deterministic serializer.
+
+    The pipeline is: rdflib N-Triples → pyoxigraph RDFC-1.0
+    canonicalization → pyoxigraph Turtle serialization with namespace
+    prefixes derived from the source graph.
+
+    Prefixes are resolved with *rdflib-default-first* semantics: if a
+    namespace is already bound by rdflib's curated defaults (e.g.
+    ``schema: https://schema.org/``), that binding wins over the model's
+    alias (e.g. ``sdo:``).
+
+    This guarantees both **serialisation determinism** (same input →
+    same output) and **graph isomorphism preservation** (the canonical
+    output is isomorphic to the input).
+
+    Parameters
+    ----------
+    graph : rdflib.Graph
+        An rdflib Graph to serialize.
+
+    Returns
+    -------
+    str
+        Deterministic Turtle string in standard ``@prefix`` format.
+
+    References
+    ----------
+    .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)."
+       W3C Recommendation.  https://www.w3.org/TR/rdf-canon/
+    """
+    import pyoxigraph
+    from rdflib import Graph as RdfGraph
+
+    nt_data = graph.serialize(format="nt")
+
+    dataset = pyoxigraph.Dataset(
+        pyoxigraph.parse(nt_data, format=pyoxigraph.RdfFormat.N_TRIPLES)
+    )
+    dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0)
+
+    # Build prefix dict.  Start with rdflib's curated defaults so that
+    # well-known namespaces keep their standard prefix names, then layer
+    # the source graph's custom bindings (skip any namespace that is
+    # already covered by a default).
+    ns_to_prefix: dict[str, str] = {}
+    prefixes: dict[str, str] = {}
+    for pfx, ns in RdfGraph().namespaces():
+        pfx_s, ns_s = str(pfx), str(ns)
+        if pfx_s:
+            prefixes[pfx_s] = ns_s
+            ns_to_prefix[ns_s] = pfx_s
+    for pfx, ns in graph.namespaces():
+        pfx_s, ns_s = str(pfx), str(ns)
+        if pfx_s and ns_s not in ns_to_prefix:
+            prefixes[pfx_s] = ns_s
+            ns_to_prefix[ns_s] = pfx_s
+
+    # Sort canonical triples by their N-Triples string form to ensure
+    # a stable input order for the Turtle serializer.  The Dataset's
+    # BTreeSet ordering uses internal interned IDs which may differ
+    # between independently-canonicalized datasets.
+    triples = sorted((q.triple for q in dataset), key=str)
+    ttl_bytes = pyoxigraph.serialize(
+        triples,
+        format=pyoxigraph.RdfFormat.TURTLE,
+        prefixes=prefixes,
+    )
+    return ttl_bytes.decode("utf-8")
+
+
+def deterministic_json(obj: object, indent: int = 3) -> str:
+    """Serialize a JSON-compatible object with deterministic ordering.
+
+    Recursively sorts all dict keys *and* list elements to produce
+    stable output across Python versions and process invocations.
+
+    List elements are sorted by their canonical JSON representation
+    (``json.dumps(item, sort_keys=True)``), which handles lists of
+    dicts, strings, and mixed types.
+
+    :param obj: A JSON-serializable object (typically parsed from ``as_json``).
+    :param indent: Number of spaces for indentation.
+    :returns: Deterministic JSON string.
+    """
+    import json
+
+    def _deep_sort(value: object) -> object:
+        if isinstance(value, dict):
+            return {k: _deep_sort(v) for k, v in sorted(value.items())}
+        if isinstance(value, list):
+            sorted_items = [_deep_sort(item) for item in value]
+            try:
+                return sorted(sorted_items, key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False))
+            except TypeError:
+                return sorted_items
+        return value
+
+    return json.dumps(_deep_sort(obj), indent=indent, ensure_ascii=False)
+
+
 @dataclass
 class Generator(metaclass=abc.ABCMeta):
     """
@@ -139,6 +248,9 @@ class Generator(metaclass=abc.ABCMeta):
     mergeimports: bool | None = True
     """True means merge non-linkml sources into importing package.  False means separate packages"""
 
+    deterministic: bool = False
+    """True means produce stable, reproducible output with sorted keys and canonical blank-node ordering"""
+
     source_file_date: str | None = None
     """Modification date of input source file"""
 
@@ -986,6 +1098,16 @@ def decorator(f: Command) -> Command:
                 callback=stacktrace_callback,
             )
         )
+        f.params.append(
+            Option(
+                ("--deterministic/--no-deterministic",),
+                default=False,
+                show_default=True,
+                help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. "
+                "Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. "
+                "Useful when generated artifacts are stored in version control.",
+            )
+        )
 
         return f
 
diff --git a/tests/linkml/test_generators/test_deterministic_output.py b/tests/linkml/test_generators/test_deterministic_output.py
new file mode 100644
index 000000000..b39aa313d
--- /dev/null
+++ b/tests/linkml/test_generators/test_deterministic_output.py
@@ -0,0 +1,353 @@
+"""Tests for deterministic generator output.
+
+When ``deterministic=True``, generators must produce byte-identical output
+across multiple invocations. This ensures version-controlled artifacts don't
+show spurious diffs from blank-node relabeling or dict-ordering instability.
+
+Deterministic Turtle output is intentionally **not** RDF-isomorphic to the
+default output: sorting RDF Collections reorders rdf:first/rdf:rest triples.
+"""
+
+import json
+import time
+from pathlib import Path
+
+import pytest
+from rdflib import Graph
+from rdflib.compare import isomorphic
+
+from linkml.generators.jsonldcontextgen import ContextGenerator
+from linkml.generators.jsonldgen import JSONLDGenerator
+from linkml.generators.owlgen import OwlSchemaGenerator
+from linkml.generators.shaclgen import ShaclGenerator
+
+SCHEMA = str(Path(__file__).parent / "input" / "personinfo.yaml")
+
+
+@pytest.mark.parametrize(
+    "generator_cls,kwargs",
+    [
+        (OwlSchemaGenerator, {}),
+        (ShaclGenerator, {}),
+        (ContextGenerator, {}),
+        (JSONLDGenerator, {}),
+    ],
+    ids=["owl", "shacl", "context", "jsonld"],
+)
+def test_deterministic_output_is_identical_across_runs(generator_cls, kwargs):
+    """Generate output twice with deterministic=True and verify identity."""
+    out1 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize()
+    out2 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize()
+    # JSONLDGenerator embeds a generation_date timestamp — normalize it
+    if generator_cls is JSONLDGenerator:
+        import re
+
+        ts_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}")
+        out1 = ts_re.sub("TIMESTAMP", out1)
+        out2 = ts_re.sub("TIMESTAMP", out2)
+    assert out1 == out2, f"{generator_cls.__name__} produced different output across runs"
+    assert len(out1) > 100, "Output suspiciously short — generator may have failed silently"
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [ContextGenerator, JSONLDGenerator],
+    ids=["context", "jsonld"],
+)
+def test_deterministic_json_has_sorted_keys(generator_cls):
+    """When deterministic=True, JSON dict keys should be sorted at all levels.
+
+    For the ContextGenerator, @context keys use grouped ordering (prefixes
+    before term entries) — each group is sorted, but not globally.
+    """
+    out = generator_cls(SCHEMA, deterministic=True).serialize()
+    parsed = json.loads(out)
+
+    is_context_gen = generator_cls is ContextGenerator
+
+    def _check_sorted_keys(obj, path="root"):
+        if isinstance(obj, dict):
+            keys = list(obj.keys())
+            # Context generator groups @context keys: @-directives, prefixes, terms
+            if is_context_gen and path == "root.@context":
+                at_keys = [k for k in keys if k.startswith("@")]
+                prefix_keys = [k for k in keys if not k.startswith("@") and isinstance(obj[k], str)]
+                term_keys = [k for k in keys if not k.startswith("@") and not isinstance(obj[k], str)]
+                assert at_keys == sorted(at_keys), f"@-keys not sorted: {at_keys}"
+                assert prefix_keys == sorted(prefix_keys), f"Prefix keys not sorted: {prefix_keys}"
+                assert term_keys == sorted(term_keys), f"Term keys not sorted: {term_keys}"
+            else:
+                assert keys == sorted(keys), f"Keys not sorted at {path}: {keys}"
+            for k, v in obj.items():
+                _check_sorted_keys(v, f"{path}.{k}")
+        elif isinstance(obj, list):
+            for i, item in enumerate(obj):
+                _check_sorted_keys(item, f"{path}[{i}]")
+
+    _check_sorted_keys(parsed)
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [ContextGenerator, JSONLDGenerator],
+    ids=["context", "jsonld"],
+)
+def test_deterministic_json_lists_are_sorted(generator_cls):
+    """When deterministic=True, JSON list elements should be sorted."""
+    out = generator_cls(SCHEMA, deterministic=True).serialize()
+    parsed = json.loads(out)
+
+    def _check_sorted_lists(obj, path="root"):
+        if isinstance(obj, dict):
+            for k, v in obj.items():
+                _check_sorted_lists(v, f"{path}.{k}")
+        elif isinstance(obj, list):
+            str_items = [json.dumps(item, sort_keys=True, ensure_ascii=False) for item in obj]
+            assert str_items == sorted(str_items), f"List not sorted at {path}"
+            for i, item in enumerate(obj):
+                _check_sorted_lists(item, f"{path}[{i}]")
+
+    _check_sorted_lists(parsed)
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_deterministic_turtle_preserves_at_prefix(generator_cls):
+    """deterministic_turtle must produce standard @prefix, not SPARQL PREFIX."""
+    out = generator_cls(SCHEMA, deterministic=True).serialize()
+    assert "@prefix" in out, "Output uses non-standard prefix syntax"
+    assert "PREFIX " not in out, "Output uses SPARQL PREFIX instead of Turtle @prefix"
+
+
+def test_deterministic_turtle_performance():
+    """Deterministic OWL generation must complete within 10 seconds for personinfo.
+
+    RDFC-1.0 canonicalization via pyoxigraph should finish well within this
+    limit for a schema of this size.  The previous canon=True approach was
+    exponential and failed this test for graphs above ~250 triples.
+    """
+    start = time.time()
+    out = OwlSchemaGenerator(SCHEMA, deterministic=True).serialize()
+    elapsed = time.time() - start
+    assert elapsed < 10.0, f"Deterministic generation took {elapsed:.1f}s (limit: 10s)"
+    assert len(out) > 100, "Output suspiciously short"
+
+
+def test_shacl_closed_ignored_properties_deterministic():
+    """sh:ignoredProperties in closed shapes must be deterministic.
+
+    ``_build_ignored_properties`` collects inherited slots into a set; without
+    explicit sorting this produces different ``rdf:first``/``rdf:rest`` chains
+    on each run.  With ``deterministic=True`` (and sorted Collection inputs)
+    the output must be byte-identical.
+    """
+    runs = [ShaclGenerator(SCHEMA, deterministic=True, closed=True).serialize() for _ in range(3)]
+    assert runs[0] == runs[1] == runs[2], "sh:ignoredProperties ordering differs across runs"
+    assert "sh:ignoredProperties" in runs[0], "Expected closed shapes with sh:ignoredProperties"
+
+
+def test_shacl_enum_in_deterministic():
+    """sh:in RDF lists for enums must be deterministic.
+
+    ``_add_enum`` iterates ``enum.permissible_values.items()``
+    (dict iteration order) into a ``Collection``.  Without sorting, the
+    ``rdf:first``/``rdf:rest`` chain varies across runs.
+    """
+    runs = [ShaclGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)]
+    assert runs[0] == runs[1] == runs[2], "sh:in enum list ordering differs across runs"
+    assert "sh:in" in runs[0], "Expected sh:in constraints for enums"
+
+
+def test_owl_enum_one_of_deterministic():
+    """owl:oneOf RDF lists for enums must be deterministic.
+
+    ``_boolean_expression`` feeds ``pv_uris`` (from ``permissible_values``)
+    into a ``Collection``.  Without sorting, ``owl:oneOf`` list ordering varies.
+    """
+    runs = [OwlSchemaGenerator(SCHEMA, deterministic=True).serialize() for _ in range(3)]
+    assert runs[0] == runs[1] == runs[2], "owl:oneOf enum list ordering differs across runs"
+
+
+KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml")
+
+
+def test_deterministic_large_schema():
+    """End-to-end idempotency on a complex schema (kitchen_sink).
+
+    Exercises many code paths simultaneously: closed shapes, enums, imports,
+    class hierarchies, and mixed ranges.
+    """
+    owl1 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    owl2 = OwlSchemaGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    assert owl1 == owl2, "OWL output differs across runs for kitchen_sink"
+    assert len(owl1) > 500, "kitchen_sink output suspiciously short"
+
+    shacl1 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    shacl2 = ShaclGenerator(KITCHEN_SINK, deterministic=True).serialize()
+    assert shacl1 == shacl2, "SHACL output differs across runs for kitchen_sink"
+    assert len(shacl1) > 500, "kitchen_sink output suspiciously short"
+
+
+def test_deterministic_context_preserves_jsonld_structure():
+    """Deterministic JSON-LD context must preserve conventional structure.
+
+    JSON-LD contexts have a conventional layout:
+    1. ``comments`` block first (metadata)
+    2. ``@context`` block second, with prefixes grouped before term entries
+
+    ``deterministic_json()`` would scramble this by sorting all keys
+    uniformly.  The context generator must use JSON-LD-aware ordering.
+    """
+    out = ContextGenerator(SCHEMA, deterministic=True, metadata=True).serialize()
+    parsed = json.loads(out)
+
+    # Top-level key order: "comments" before "@context"
+    top_keys = list(parsed.keys())
+    assert "comments" in top_keys, "Expected 'comments' block with metadata=True"
+    assert top_keys.index("comments") < top_keys.index("@context"), (
+        f"'comments' should precede '@context', got: {top_keys}"
+    )
+
+    # Inside @context: @-directives, then prefixes (str values), then terms (dict values)
+    ctx = parsed["@context"]
+    ctx_keys = list(ctx.keys())
+
+    at_keys = [k for k in ctx_keys if k.startswith("@")]
+    prefix_keys = [k for k in ctx_keys if not k.startswith("@") and isinstance(ctx[k], str)]
+    term_keys = [k for k in ctx_keys if not k.startswith("@") and not isinstance(ctx[k], str)]
+
+    # Verify grouping: all @-keys before all prefix keys before all term keys
+    last_at = max(ctx_keys.index(k) for k in at_keys) if at_keys else -1
+    first_prefix = min(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else len(ctx_keys)
+    last_prefix = max(ctx_keys.index(k) for k in prefix_keys) if prefix_keys else -1
+    first_term = min(ctx_keys.index(k) for k in term_keys) if term_keys else len(ctx_keys)
+
+    assert last_at < first_prefix, "@-directives must come before prefixes"
+    assert last_prefix < first_term, "Prefixes must come before term entries"
+
+    # Verify each group is sorted internally
+    assert at_keys == sorted(at_keys), f"@-directives not sorted: {at_keys}"
+    assert prefix_keys == sorted(prefix_keys), f"Prefixes not sorted: {prefix_keys}"
+    assert term_keys == sorted(term_keys), f"Term entries not sorted: {term_keys}"
+
+
+def test_non_deterministic_is_default():
+    """Verify that ``deterministic`` defaults to False."""
+    gen = OwlSchemaGenerator(SCHEMA)
+    assert gen.deterministic is False
+
+
+@pytest.mark.xfail(
+    reason=(
+        "Collection sorting (owl:oneOf, sh:in) in deterministic mode intentionally "
+        "reorders RDF list triples for canonical output. The resulting graph is "
+        "semantically equivalent (OWL/SHACL interpret these as unordered sets) but "
+        "not RDF-isomorphic because rdf:first/rdf:rest chains encode ordering."
+    ),
+    strict=True,
+)
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_deterministic_turtle_is_isomorphic(generator_cls):
+    """Deterministic output is NOT RDF-isomorphic to non-deterministic output.
+
+    This documents the trade-off identified in linkml/linkml#3295 review:
+    deterministic mode sorts Collection inputs (owl:oneOf, sh:in,
+    sh:ignoredProperties) to produce canonical RDF list ordering.  Since RDF
+    Collections encode order via rdf:first/rdf:rest triples, the sorted graph
+    is structurally different from the insertion-order graph — even though the
+    OWL/SHACL semantics are identical (these Collections represent sets).
+
+    The test is marked xfail(strict=True) so that it:
+    - Documents the known, intentional non-isomorphism
+    - Alerts maintainers if the behaviour changes (strict xfail fails on pass)
+    """
+    out_det = generator_cls(SCHEMA, deterministic=True).serialize()
+    out_nondet = generator_cls(SCHEMA, deterministic=False).serialize()
+
+    g_det = Graph()
+    g_det.parse(data=out_det, format="turtle")
+
+    g_nondet = Graph()
+    g_nondet.parse(data=out_nondet, format="turtle")
+
+    assert len(g_det) == len(g_nondet), (
+        f"Triple count mismatch: deterministic={len(g_det)}, "
+        f"non-deterministic={len(g_nondet)}"
+    )
+    assert isomorphic(g_det, g_nondet), (
+        f"{generator_cls.__name__}: deterministic output is NOT isomorphic "
+        "to non-deterministic output — the serialization changed the graph"
+    )
+
+
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_non_deterministic_output_unchanged(generator_cls):
+    """Non-deterministic output must still produce valid RDF.
+
+    Ensures that changes for deterministic mode don't break default behavior.
+    """
+    out = generator_cls(SCHEMA, deterministic=False).serialize()
+    assert len(out) > 100, "Output suspiciously short"
+    g = Graph()
+    g.parse(data=out, format="turtle")
+    assert len(g) > 50, f"Graph has too few triples ({len(g)})"
+
+
+@pytest.mark.parametrize(
+    "generator_cls,kwargs",
+    [
+        (OwlSchemaGenerator, {}),
+        (ShaclGenerator, {}),
+        (ContextGenerator, {}),
+        (JSONLDGenerator, {}),
+    ],
+    ids=["owl", "shacl", "context", "jsonld"],
+)
+def test_non_deterministic_produces_valid_output(generator_cls, kwargs):
+    """All generators must produce valid output in non-deterministic mode."""
+    out = generator_cls(SCHEMA, deterministic=False, **kwargs).serialize()
+    assert len(out) > 100, f"{generator_cls.__name__} output suspiciously short"
+
+
+@pytest.mark.xfail(
+    reason=(
+        "Collection sorting in deterministic mode produces non-isomorphic RDF "
+        "(different rdf:first/rdf:rest triples). See test_deterministic_turtle_is_isomorphic."
+    ),
+    strict=True,
+)
+@pytest.mark.parametrize(
+    "generator_cls",
+    [OwlSchemaGenerator, ShaclGenerator],
+    ids=["owl", "shacl"],
+)
+def test_deterministic_kitchen_sink_isomorphic(generator_cls):
+    """Isomorphism check on the complex kitchen_sink schema.
+
+    Expected to fail for the same reason as test_deterministic_turtle_is_isomorphic:
+    Collection sorting changes the RDF structure while preserving OWL/SHACL semantics.
+    """
+    out_det = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
+    out_nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize()
+
+    g_det = Graph()
+    g_det.parse(data=out_det, format="turtle")
+
+    g_nondet = Graph()
+    g_nondet.parse(data=out_nondet, format="turtle")
+
+    assert isomorphic(g_det, g_nondet), (
+        f"{generator_cls.__name__}: kitchen_sink deterministic output is NOT "
+        "isomorphic to non-deterministic output"
+    )