Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/linkml/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ dependencies = [ # Specifier syntax: https://peps.python.org/pep-0631/
"pyshex >= 0.7.20",
"pyshexc >= 0.8.3",
"python-dateutil",
"pyoxigraph >= 0.4.0",
"pyyaml",
"rdflib >=6.0.0",
"requests >= 2.22",
Expand Down
54 changes: 54 additions & 0 deletions packages/linkml/src/linkml/generators/jsonldcontextgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,62 @@ def end_schema(
with open(frame_path, "w", encoding="UTF-8") as f:
json.dump(frame, f, indent=2, ensure_ascii=False)

if self.deterministic:
return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + "\n"
return str(as_json(context)) + "\n"

@staticmethod
def _deterministic_context_json(data: dict, indent: int = 3) -> str:
    """Render a JSON-LD context dict as deterministically ordered JSON.

    Keeps the conventional JSON-LD context layout while making the
    output byte-identical across invocations:

    1. the ``comments`` metadata block comes first (when present),
    2. then ``@context``, internally ordered as
       a. ``@``-prefixed directives (``@vocab``, ``@base``, ...),
       b. prefix declarations (entries whose value is a string),
       c. term definitions (entries whose value is an object),
    3. each group sorted alphabetically; any other top-level keys follow.

    Unlike :func:`deterministic_json`, this understands JSON-LD
    conventions so the result stays human-readable.

    :param data: Parsed JSON-LD context document.
    :param indent: Indentation width for the serialized output.
    :returns: Deterministic JSON string.
    """
    # Deferred import avoids a circular dependency with linkml.utils.generator.
    from linkml.utils.generator import deterministic_json

    result: dict = {}

    # 1. "comments" metadata block leads, if present.
    if "comments" in data:
        result["comments"] = data["comments"]

    # 2. "@context" with its structured internal ordering.
    if "@context" in data:
        ctx = data["@context"]
        new_ctx: dict = {}

        # 2a. @-prefixed directives such as @vocab and @base.
        directive_keys = [key for key in ctx if key.startswith("@")]
        for key in sorted(directive_keys):
            new_ctx[key] = ctx[key]

        # 2b. Prefix declarations — plain string values (namespace URIs).
        prefix_keys = [key for key in ctx if not key.startswith("@") and isinstance(ctx[key], str)]
        for key in sorted(prefix_keys):
            new_ctx[key] = ctx[key]

        # 2c. Term definitions — object values, deep-sorted for determinism.
        term_entries = {key: value for key, value in ctx.items() if not (key.startswith("@") or isinstance(value, str))}
        canonical_terms = json.loads(deterministic_json(term_entries))
        for key in sorted(canonical_terms):
            new_ctx[key] = canonical_terms[key]

        result["@context"] = new_ctx

    # 3. Any remaining top-level keys, alphabetically.
    for key in sorted(data):
        if key not in result:
            result[key] = data[key]

    return json.dumps(result, indent=indent, ensure_ascii=False)

def visit_class(self, cls: ClassDefinition) -> bool:
if self.exclude_imports and cls.name not in self._local_classes:
return False
Expand Down
5 changes: 5 additions & 0 deletions packages/linkml/src/linkml/generators/jsonldgen.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Generate JSONld from a LinkML schema."""

import json
import os
from collections.abc import Sequence
from copy import deepcopy
Expand Down Expand Up @@ -202,6 +203,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs:
self.schema["@context"].append({"@base": base_prefix})
# json_obj["@id"] = self.schema.id
out = str(as_json(self.schema, indent=" ")) + "\n"
if self.deterministic:
from linkml.utils.generator import deterministic_json

out = deterministic_json(json.loads(out), indent=2) + "\n"
self.schema = self.original_schema
return out

Expand Down
14 changes: 12 additions & 2 deletions packages/linkml/src/linkml/generators/owlgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,14 @@ def serialize(self, **kwargs) -> str:
:return:
"""
self.as_graph()
data = self.graph.serialize(format="turtle" if self.format in ["owl", "ttl"] else self.format)
fmt = "turtle" if self.format in ["owl", "ttl"] else self.format
if self.deterministic and fmt == "turtle":
# Deferred to avoid circular import (generator.py imports from this package)
from linkml.utils.generator import deterministic_turtle

data = deterministic_turtle(self.graph)
else:
data = self.graph.serialize(format=fmt)
return data

def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None:
Expand Down Expand Up @@ -998,7 +1005,10 @@ def add_enum(self, e: EnumDefinition) -> None:
owl_types = []
enum_owl_type = self._get_metatype(e, self.default_permissible_value_type)

for pv in e.permissible_values.values():
pvs = e.permissible_values.values()
if self.deterministic:
pvs = sorted(pvs, key=lambda x: x.text)
for pv in pvs:
pv_owl_type = self._get_metatype(pv, enum_owl_type)
owl_types.append(pv_owl_type)
if pv_owl_type == RDFS.Literal:
Expand Down
21 changes: 15 additions & 6 deletions packages/linkml/src/linkml/generators/shaclgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,13 @@ def generate_header(self) -> str:

def serialize(self, **args) -> str:
    """Serialize the SHACL shapes graph to the configured output format.

    ``owl`` and ``ttl`` format names are both mapped to rdflib's
    ``turtle`` serializer. When ``self.deterministic`` is set and the
    target is Turtle, the graph is canonicalized via
    ``deterministic_turtle`` so repeated runs yield identical bytes.
    """
    graph = self.as_graph()
    if self.format in ["owl", "ttl"]:
        fmt = "turtle"
    else:
        fmt = self.format
    if fmt == "turtle" and self.deterministic:
        # Deferred import avoids a circular dependency with linkml.utils.generator.
        from linkml.utils.generator import deterministic_turtle

        return deterministic_turtle(graph)
    return graph.serialize(format=fmt)

def as_graph(self) -> Graph:
Expand Down Expand Up @@ -309,13 +315,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None:
sv = self.schemaview
enum = sv.get_enum(r)
pv_node = BNode()
pv_items = list(enum.permissible_values.items())
if self.deterministic:
pv_items = sorted(pv_items)
Collection(
g,
pv_node,
[
URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name)
for pv_name, pv in enum.permissible_values.items()
],
[URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items],
)
func(SH["in"], pv_node)

Expand Down Expand Up @@ -469,7 +475,10 @@ def collect_child_properties(class_name: str, output: set) -> None:

list_node = BNode()
ignored_properties.add(RDF.type)
Collection(g, list_node, list(ignored_properties))
props = list(ignored_properties)
if self.deterministic:
props = sorted(props, key=str)
Collection(g, list_node, props)

return list_node

Expand Down
124 changes: 123 additions & 1 deletion packages/linkml/src/linkml/utils/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import ClassVar, TextIO, Union, cast
from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast

import click
from click import Argument, Command, Option
Expand All @@ -37,6 +37,10 @@
from linkml.utils.schemaloader import SchemaLoader
from linkml.utils.typereferences import References
from linkml_runtime import SchemaView

if TYPE_CHECKING:
from rdflib import Graph as RdfGraph

from linkml_runtime.linkml_model.meta import (
ClassDefinition,
ClassDefinitionName,
Expand Down Expand Up @@ -78,6 +82,111 @@ def _resolved_metamodel(mergeimports):
return metamodel


def deterministic_turtle(graph: "RdfGraph") -> str:
    """Serialize an RDF graph to Turtle with deterministic output ordering.

    Applies W3C RDFC-1.0 [1]_ (RDF Dataset Canonicalization) via
    ``pyoxigraph`` so blank nodes receive stable identifiers, then emits
    Turtle with pyoxigraph's deterministic serializer.

    Pipeline: rdflib N-Triples → pyoxigraph RDFC-1.0 canonicalization →
    pyoxigraph Turtle serialization, with namespace prefixes taken from
    the source graph.

    Prefix resolution is *rdflib-default-first*: a namespace already
    bound by rdflib's curated defaults (e.g. ``schema:
    https://schema.org/``) keeps that binding even if the model uses an
    alias (e.g. ``sdo:``).

    This guarantees both **serialisation determinism** (same input →
    same output) and **graph isomorphism preservation** (the canonical
    output is isomorphic to the input).

    Parameters
    ----------
    graph : rdflib.Graph
        An rdflib Graph to serialize.

    Returns
    -------
    str
        Deterministic Turtle string in standard ``@prefix`` format.

    References
    ----------
    .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)."
       W3C Recommendation. https://www.w3.org/TR/rdf-canon/
    """
    import pyoxigraph
    from rdflib import Graph as RdfGraph

    # Round-trip through N-Triples so pyoxigraph can ingest the rdflib graph.
    dataset = pyoxigraph.Dataset(
        pyoxigraph.parse(graph.serialize(format="nt"), format=pyoxigraph.RdfFormat.N_TRIPLES)
    )
    # Canonicalization assigns stable blank-node labels in place.
    dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0)

    # Prefix map construction: rdflib's curated defaults first, so
    # well-known namespaces keep their standard prefix names; the source
    # graph's custom bindings are layered on afterwards, skipping any
    # namespace that a default already covers.
    bound_namespaces: dict[str, str] = {}
    prefixes: dict[str, str] = {}
    for prefix, namespace in RdfGraph().namespaces():
        prefix_text, namespace_text = str(prefix), str(namespace)
        if prefix_text:
            prefixes[prefix_text] = namespace_text
            bound_namespaces[namespace_text] = prefix_text
    for prefix, namespace in graph.namespaces():
        prefix_text, namespace_text = str(prefix), str(namespace)
        if prefix_text and namespace_text not in bound_namespaces:
            prefixes[prefix_text] = namespace_text
            bound_namespaces[namespace_text] = prefix_text

    # Sort canonical triples by their N-Triples text so the Turtle
    # serializer receives a stable input order — the Dataset's internal
    # BTreeSet order depends on interned IDs, which can differ between
    # independently-canonicalized datasets.
    ordered_triples = sorted((quad.triple for quad in dataset), key=str)
    serialized = pyoxigraph.serialize(
        ordered_triples,
        format=pyoxigraph.RdfFormat.TURTLE,
        prefixes=prefixes,
    )
    return serialized.decode("utf-8")


def deterministic_json(obj: object, indent: int = 3) -> str:
    """Serialize a JSON-compatible object with deterministic ordering.

    Dict keys are sorted recursively, and list elements are ordered by
    their canonical JSON text (``json.dumps(item, sort_keys=True)``),
    which gives a total order over lists of dicts, strings, and mixed
    types. Lists containing values that cannot be JSON-encoded keep
    their original element order.

    :param obj: A JSON-serializable object (typically parsed from ``as_json``).
    :param indent: Number of spaces for indentation.
    :returns: Deterministic JSON string.
    """
    import json

    def _canonical_text(item: object) -> str:
        # Canonical JSON form doubles as a sort key across heterogeneous elements.
        return json.dumps(item, sort_keys=True, ensure_ascii=False)

    def _normalize(node: object) -> object:
        if isinstance(node, dict):
            return {key: _normalize(node[key]) for key in sorted(node)}
        if isinstance(node, list):
            normalized = [_normalize(element) for element in node]
            try:
                return sorted(normalized, key=_canonical_text)
            except TypeError:
                # Element not JSON-encodable — fall back to original order.
                return normalized
        return node

    return json.dumps(_normalize(obj), indent=indent, ensure_ascii=False)


@dataclass
class Generator(metaclass=abc.ABCMeta):
"""
Expand Down Expand Up @@ -139,6 +248,9 @@ class Generator(metaclass=abc.ABCMeta):
mergeimports: bool | None = True
"""True means merge non-linkml sources into importing package. False means separate packages"""

deterministic: bool = False
"""True means produce stable, reproducible output with sorted keys and canonical blank-node ordering"""

source_file_date: str | None = None
"""Modification date of input source file"""

Expand Down Expand Up @@ -986,6 +1098,16 @@ def decorator(f: Command) -> Command:
callback=stacktrace_callback,
)
)
f.params.append(
Option(
("--deterministic/--no-deterministic",),
default=False,
show_default=True,
help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. "
"Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. "
"Useful when generated artifacts are stored in version control.",
)
)

return f

Expand Down
Loading
Loading