SchemaValidator canonical cache key

p1c2u · p1c2u · commit 98fdb1ef8f88 · 2026-05-29T21:39:43.000+01:00
diff --git a/openapi_core/validation/schemas/validators.py b/openapi_core/validation/schemas/validators.py
@@ -10,6 +10,7 @@
 from jsonschema.exceptions import FormatError
 from jsonschema.protocols import Validator
 from jsonschema_path import SchemaPath
+from referencing.exceptions import Unresolvable
 
 from openapi_core.validation.schemas.datatypes import (
     _EMPTY_STATE_TUPLE as _EMPTY_STATES_TUPLE,
@@ -27,6 +28,11 @@
 
 log = logging.getLogger(__name__)
 
+# Feature-detect jsonschema-path SchemaPath.canonical (PR #263). When present
+# the needs_state cache keys on the resolved (canonical) location; otherwise it
+# degrades to per-route keying. Also imported by the tests to gate an xfail.
+_HAS_CANONICAL = hasattr(SchemaPath, "canonical")
+
 
 class SchemaValidator:
     def __init__(
@@ -48,17 +54,51 @@ def validate(self, value: Any) -> None:
             raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
 
     # Cache the recursive "does this schema benefit from a ValidationState?"
-    # check, keyed on the SchemaPath. Under jsonschema-path 0.5 (pathable
-    # 0.6) SchemaPath is an AccessorPath whose identity is
-    # (parts, accessor), and SchemaAccessor in turn hashes/compares on
-    # id(node) and id(path_resolver). The key is therefore effectively
-    # per-resolver: two SchemaPaths share a cache slot only when they
-    # address the same location *within the same loaded spec*, never
-    # across distinct specs that merely share a JSON-pointer path.
-    # Entries are bounded by the number of distinct schema shapes per
-    # spec and become collectable once the owning resolver is GC'd.
+    # check, keyed on the schema's *canonical* SchemaPath -- the location the
+    # $ref chain ultimately resolves to (jsonschema-path SchemaPath.canonical).
+    # Canonical keying buys two properties at once:
+    #
+    #   * Cross-route dedup -- every $ref alias of one target collapses to a
+    #     single slot, so the recursive walk runs once per distinct resolved
+    #     shape instead of once per navigation path.
+    #   * Cross-spec safety -- SchemaPath identity is (parts, accessor), and
+    #     canonical() hands back a shared accessor per resolved document, so
+    #     two independently loaded specs that merely share a JSON pointer
+    #     never collide.
+    #
+    # Entries are bounded by the number of distinct resolved schema shapes
+    # across loaded specs. On jsonschema-path builds without canonical(), or
+    # for a node whose $ref is unresolvable, keying degrades to the navigation
+    # path. A $dynamicRef-only node yields no cross-target dedup either, since
+    # canonical() returns it as-is rather than following the dynamic ref. None
+    # of these cases raise.
     _needs_state_cache: dict[SchemaPath, bool] = {}
 
+    # Memoise the navigation-path -> canonical-path mapping so a warm
+    # needs_state lookup pays one cheap SchemaPath hash instead of re-walking
+    # the $ref chain on every call, which would regress the hot path. The map
+    # holds one entry per navigation path, not per resolved schema shape.
+    _canonical_key_cache: dict[SchemaPath, SchemaPath] = {}
+
+    @classmethod
+    def _needs_state_key(cls, schema: SchemaPath) -> SchemaPath:
+        """Cache key for ``schema``: its canonical SchemaPath when available
+        (collapsing $ref aliases, correct across specs), else the navigation
+        path. Only ``Unresolvable`` is caught -- such a node, like a
+        $dynamicRef-only one, keys on ``schema``. Memoised on the path key.
+        """
+        if not _HAS_CANONICAL:
+            return schema
+        cached = cls._canonical_key_cache.get(schema)
+        if cached is not None:
+            return cached
+        try:
+            canonical = schema.canonical()
+        except Unresolvable:
+            canonical = schema
+        cls._canonical_key_cache[schema] = canonical
+        return canonical
+
     @classmethod
     def _schema_needs_state(cls, schema: SchemaPath) -> bool:
         """True if building a ValidationState for ``schema`` carries
@@ -70,42 +110,46 @@ def _schema_needs_state(cls, schema: SchemaPath) -> bool:
         sentinel once the recursion completes.
         """
         cache = cls._needs_state_cache
-        cached = cache.get(schema)
+        key = cls._needs_state_key(schema)
+        cached = cache.get(key)
         if cached is not None:
             return cached
+        # Walk the canonical target so composition behind a $ref is seen even
+        # when the source node only carries "$ref"; identical for non-$ref.
+        node = key
         # Self-composition is the strongest signal; check it first to
         # short-circuit the cheap case.
-        if "oneOf" in schema or "anyOf" in schema or "allOf" in schema:
-            cache[schema] = True
+        if "oneOf" in node or "anyOf" in node or "allOf" in node:
+            cache[key] = True
             return True
         # Seed the in-progress sentinel for cycle protection.
-        cache[schema] = False
+        cache[key] = False
         # Recurse into children. We only need to find one descendant
         # that needs state to flip our own answer.
         result = False
-        if "properties" in schema:
-            prop_iter = (schema / "properties").items()
+        if "properties" in node:
+            prop_iter = (node / "properties").items()
             for prop_name, prop_schema in prop_iter:
                 if not isinstance(prop_name, str):
                     continue
                 if cls._schema_needs_state(prop_schema):
                     result = True
                     break
-        if not result and "additionalProperties" in schema:
+        if not result and "additionalProperties" in node:
             try:
-                ap = schema / "additionalProperties"
+                ap = node / "additionalProperties"
             except Exception:
                 ap = None
             if ap is not None and cls._schema_needs_state(ap):
                 result = True
-        if not result and "items" in schema:
+        if not result and "items" in node:
             try:
-                items = schema / "items"
+                items = node / "items"
             except Exception:
                 items = None
             if items is not None and cls._schema_needs_state(items):
                 result = True
-        cache[schema] = result
+        cache[key] = result
         return result
 
     def validate_state(self, value: Any) -> ValidationState:
diff --git a/tests/benchmarks/bench_canonical_key.py b/tests/benchmarks/bench_canonical_key.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""Benchmark: canonical-path keying vs id(content) keying vs per-route
+(navigation-path) keying for the SchemaValidator static-analysis caches.
+
+Reports, per strategy:
+  * cold_us_per_node -- cost of deriving the cache key for every schema
+    node (the once-per-construction work).
+  * warm_us_per_node -- steady-state derive+lookup cost (the hot path).
+  * distinct_keys / dedup_ratio -- how many cache slots the strategy
+    produces; higher dedup = fewer recomputations of needs_state.
+
+"per_route" is the master/baseline behaviour (key on the navigation
+SchemaPath, no $ref collapsing). "canonical" requires jsonschema-path
+PR #263. "id_content" mirrors docs/plans/v3-cache-refactor/_caches.py.
+"""
+from __future__ import annotations
+
+import argparse
+import gc
+import json
+import statistics
+import time
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+from jsonschema_path import SchemaPath
+
+HAS_CANONICAL = hasattr(SchemaPath, "canonical")
+
+
+def build_spec(schemas: int, depth: int, shared_targets: int) -> SchemaPath:
+    defs: Dict[str, Any] = {
+        f"Leaf{t}": {"type": "string", "format": "uuid"}
+        for t in range(shared_targets)
+    }
+    components: Dict[str, Any] = {}
+    for s in range(schemas):
+        node: Dict[str, Any] = {"type": "object", "properties": {}}
+        cursor = node["properties"]
+        for d in range(depth):
+            child: Dict[str, Any] = {"type": "object", "properties": {}}
+            cursor[f"level{d}"] = child
+            cursor = child["properties"]
+        for k in range(4):
+            target = f"Leaf{(s + k) % shared_targets}"
+            cursor[f"leaf{k}"] = {"$ref": f"#/$defs/{target}"}
+        components[f"Schema{s}"] = node
+    spec_dict = {
+        "openapi": "3.1.0",
+        "info": {"title": "bench-canonical", "version": "0"},
+        "$defs": defs,
+        "components": {"schemas": components},
+    }
+    return SchemaPath.from_dict(spec_dict)
+
+
+def collect_schema_paths(spec: SchemaPath) -> List[SchemaPath]:
+    paths: List[SchemaPath] = []
+
+    def walk(node: SchemaPath) -> None:
+        paths.append(node)
+        if "properties" in node:
+            for name, sub in (node / "properties").items():
+                if isinstance(name, str):
+                    walk(sub)
+
+    for name, schema in (spec / "components" / "schemas").items():
+        if isinstance(name, str):
+            walk(schema)
+    return paths
+
+
+def key_per_route(path: SchemaPath) -> Any:
+    return path  # master/baseline: navigation path identity
+
+
+def key_canonical(path: SchemaPath) -> Optional[Tuple[int, Tuple[Any, ...]]]:
+    from referencing.exceptions import Unresolvable
+
+    try:
+        canon = path.canonical()
+    except Unresolvable:
+        return None
+    return (id(canon.accessor), tuple(canon.parts))
+
+
+def key_id_content(path: SchemaPath) -> Optional[int]:
+    try:
+        with path.resolve() as resolved:
+            return id(resolved.contents)
+    except Exception:
+        return None
+
+
+@dataclass
+class StrategyResult:
+    name: str
+    nodes: int
+    distinct_keys: int
+    cold_seconds: List[float] = field(default_factory=list)
+    warm_seconds: List[float] = field(default_factory=list)
+
+    def as_dict(self) -> Dict[str, Any]:
+        cold = statistics.median(self.cold_seconds)
+        warm = statistics.median(self.warm_seconds)
+        return {
+            "name": self.name,
+            "nodes": self.nodes,
+            "distinct_keys": self.distinct_keys,
+            "dedup_ratio": round(self.nodes / self.distinct_keys, 2),
+            "cold_us_per_node": round(cold / self.nodes * 1e6, 3),
+            "warm_us_per_node": round(warm / self.nodes * 1e6, 3),
+        }
+
+
+def measure(name, paths, keyfn, repeats, warmup) -> StrategyResult:
+    cold: List[float] = []
+    distinct = 0
+    for _ in range(repeats):
+        seen = set()
+        t0 = time.perf_counter()
+        for p in paths:
+            k = keyfn(p)
+            if k is not None:
+                seen.add(k)
+        cold.append(time.perf_counter() - t0)
+        distinct = len(seen)
+    cache: Dict[Any, bool] = {}
+    for p in paths:
+        k = keyfn(p)
+        if k is not None:
+            cache[k] = True
+    for _ in range(warmup):
+        for p in paths:
+            cache.get(keyfn(p))
+    warm: List[float] = []
+    for _ in range(repeats):
+        t0 = time.perf_counter()
+        for p in paths:
+            cache.get(keyfn(p))
+        warm.append(time.perf_counter() - t0)
+    return StrategyResult(name, len(paths), distinct, cold, warm)
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--schemas", type=int, default=500)
+    ap.add_argument("--depth", type=int, default=3)
+    ap.add_argument("--shared-targets", type=int, default=16)
+    ap.add_argument("--repeats", type=int, default=7)
+    ap.add_argument("--warmup", type=int, default=2)
+    ap.add_argument("--output", type=str, default="")
+    ap.add_argument("--no-gc", action="store_true")
+    args = ap.parse_args()
+
+    spec = build_spec(args.schemas, args.depth, args.shared_targets)
+    paths = collect_schema_paths(spec)
+    if args.no_gc:
+        gc.disable()
+    results = [
+        measure("per_route", paths, key_per_route, args.repeats, args.warmup),
+        measure(
+            "id_content", paths, key_id_content, args.repeats, args.warmup
+        ),
+    ]
+    if HAS_CANONICAL:
+        results.append(
+            measure(
+                "canonical", paths, key_canonical, args.repeats, args.warmup
+            )
+        )
+    if args.no_gc:
+        gc.enable()
+    payload = {
+        "config": {
+            "schemas": args.schemas,
+            "depth": args.depth,
+            "shared_targets": args.shared_targets,
+            "nodes": len(paths),
+            "has_canonical": HAS_CANONICAL,
+        },
+        "strategies": [r.as_dict() for r in results],
+    }
+    print(json.dumps(payload, indent=2, sort_keys=True))
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(payload, f, indent=2, sort_keys=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/unit/validation/test_schema_validators.py b/tests/unit/validation/test_schema_validators.py
@@ -5,6 +5,7 @@
     oas30_write_schema_validators_factory,
 )
 from openapi_core.validation.schemas.exceptions import InvalidSchemaValue
+from openapi_core.validation.schemas.validators import _HAS_CANONICAL
 from openapi_core.validation.schemas.validators import SchemaValidator
 
 
@@ -478,11 +479,12 @@ def _build(schema_dict):
         return _build
 
     @pytest.mark.xfail(
+        condition=not _HAS_CANONICAL,
         strict=True,
         reason=(
-            "The cache keys on the navigation path, so each $ref "
-            "alias gets its own slot. Once the cache keys on canonical "
-            "the aliases collapse to a single entry."
+            "Without SchemaPath.canonical the cache keys on the navigation "
+            "path, so each $ref alias gets its own slot. With canonical "
+            "keying the aliases collapse to a single entry."
         ),
     )
     def test_aliases_to_same_node_share_one_cache_slot(
@@ -497,4 +499,4 @@ def test_aliases_to_same_node_share_one_cache_slot(
         assert len(cache) == 1
         assert prop_a not in cache
         assert prop_b not in cache
-        assert cache[canonical] is True
+        assert cache[canonical] is True