Skip to content

Commit 98fdb1e

Browse files
committed
SchemaValidator canonical cache key
1 parent 0337b43 commit 98fdb1e

3 files changed

Lines changed: 261 additions & 24 deletions

File tree

openapi_core/validation/schemas/validators.py

Lines changed: 64 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from jsonschema.exceptions import FormatError
1111
from jsonschema.protocols import Validator
1212
from jsonschema_path import SchemaPath
13+
from referencing.exceptions import Unresolvable
1314

1415
from openapi_core.validation.schemas.datatypes import (
1516
_EMPTY_STATE_TUPLE as _EMPTY_STATES_TUPLE,
@@ -27,6 +28,11 @@
2728

2829
log = logging.getLogger(__name__)
2930

31+
# Feature-detect jsonschema-path SchemaPath.canonical (PR #263). When present
32+
# the needs_state cache keys on the resolved (canonical) location; otherwise it
33+
# degrades to per-route keying. The unit tests import this flag to gate the
# alias-dedup expectation; the benchmark script feature-detects on its own.
34+
_HAS_CANONICAL = hasattr(SchemaPath, "canonical")
35+
3036

3137
class SchemaValidator:
3238
def __init__(
@@ -48,17 +54,51 @@ def validate(self, value: Any) -> None:
4854
raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
4955

5056
# Cache the recursive "does this schema benefit from a ValidationState?"
51-
# check, keyed on the SchemaPath. Under jsonschema-path 0.5 (pathable
52-
# 0.6) SchemaPath is an AccessorPath whose identity is
53-
# (parts, accessor), and SchemaAccessor in turn hashes/compares on
54-
# id(node) and id(path_resolver). The key is therefore effectively
55-
# per-resolver: two SchemaPaths share a cache slot only when they
56-
# address the same location *within the same loaded spec*, never
57-
# across distinct specs that merely share a JSON-pointer path.
58-
# Entries are bounded by the number of distinct schema shapes per
59-
# spec and become collectable once the owning resolver is GC'd.
57+
# check, keyed on the schema's *canonical* SchemaPath -- the location the
58+
# $ref chain ultimately resolves to (jsonschema-path SchemaPath.canonical).
59+
# Canonical keying buys two properties at once:
60+
#
61+
# * Cross-route dedup -- every $ref alias of one target collapses to a
62+
# single slot, so the recursive walk runs once per distinct resolved
63+
# shape instead of once per navigation path.
64+
# * Cross-spec safety -- SchemaPath identity is (parts, accessor), and
65+
# canonical() hands back a shared accessor per resolved document, so
66+
# two independently loaded specs that merely share a JSON pointer
67+
# never collide.
68+
#
69+
# Entries are bounded by the number of distinct resolved schema shapes
70+
# across loaded specs. On jsonschema-path builds without canonical(), or
71+
# for a node whose $ref is unresolvable, keying degrades to the navigation
72+
# path. A $dynamicRef-only node yields no cross-target dedup either, since
73+
# canonical() returns it as-is rather than following the dynamic ref. None
74+
# of these cases raise.
6075
_needs_state_cache: dict[SchemaPath, bool] = {}
6176

77+
# Memoise the (cheap-keyed) navigation-path -> canonical-path mapping so a
78+
# warm needs_state lookup pays one cheap SchemaPath hash instead of
79+
# re-walking the $ref chain every call. Without this, canonicalising on
80+
# every _schema_needs_state invocation regresses the hot path.
81+
_canonical_key_cache: dict[SchemaPath, SchemaPath] = {}
82+
83+
@classmethod
def _needs_state_key(cls, schema: SchemaPath) -> SchemaPath:
    """Return the key under which ``schema`` is stored in the needs_state
    cache.

    When ``SchemaPath.canonical`` is unavailable the navigation path
    itself is the key. Otherwise the canonical path is used, looked up
    first in ``_canonical_key_cache`` and computed via
    ``schema.canonical()`` on a miss. An ``Unresolvable`` raised by that
    call is absorbed and ``schema`` is used as its own key. The outcome
    of a miss is always memoised before being returned.
    """
    if _HAS_CANONICAL:
        memo = cls._canonical_key_cache
        resolved = memo.get(schema)
        if resolved is None:
            try:
                resolved = schema.canonical()
            except Unresolvable:
                # Fall back to the navigation path rather than fail.
                resolved = schema
            memo[schema] = resolved
        return resolved
    return schema
101+
62102
@classmethod
63103
def _schema_needs_state(cls, schema: SchemaPath) -> bool:
64104
"""True if building a ValidationState for ``schema`` carries
@@ -70,42 +110,46 @@ def _schema_needs_state(cls, schema: SchemaPath) -> bool:
70110
sentinel once the recursion completes.
71111
"""
72112
cache = cls._needs_state_cache
73-
cached = cache.get(schema)
113+
key = cls._needs_state_key(schema)
114+
cached = cache.get(key)
74115
if cached is not None:
75116
return cached
117+
# Walk the canonical target so composition behind a $ref is seen even
118+
# when the source node only carries "$ref"; identical for non-$ref.
119+
node = key
76120
# Self-composition is the strongest signal; check it first to
77121
# short-circuit the cheap case.
78-
if "oneOf" in schema or "anyOf" in schema or "allOf" in schema:
79-
cache[schema] = True
122+
if "oneOf" in node or "anyOf" in node or "allOf" in node:
123+
cache[key] = True
80124
return True
81125
# Seed the in-progress sentinel for cycle protection.
82-
cache[schema] = False
126+
cache[key] = False
83127
# Recurse into children. We only need to find one descendant
84128
# that needs state to flip our own answer.
85129
result = False
86-
if "properties" in schema:
87-
prop_iter = (schema / "properties").items()
130+
if "properties" in node:
131+
prop_iter = (node / "properties").items()
88132
for prop_name, prop_schema in prop_iter:
89133
if not isinstance(prop_name, str):
90134
continue
91135
if cls._schema_needs_state(prop_schema):
92136
result = True
93137
break
94-
if not result and "additionalProperties" in schema:
138+
if not result and "additionalProperties" in node:
95139
try:
96-
ap = schema / "additionalProperties"
140+
ap = node / "additionalProperties"
97141
except Exception:
98142
ap = None
99143
if ap is not None and cls._schema_needs_state(ap):
100144
result = True
101-
if not result and "items" in schema:
145+
if not result and "items" in node:
102146
try:
103-
items = schema / "items"
147+
items = node / "items"
104148
except Exception:
105149
items = None
106150
if items is not None and cls._schema_needs_state(items):
107151
result = True
108-
cache[schema] = result
152+
cache[key] = result
109153
return result
110154

111155
def validate_state(self, value: Any) -> ValidationState:
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#!/usr/bin/env python3
2+
"""Benchmark: canonical-path keying vs id(content) keying vs per-route
3+
(navigation-path) keying for the SchemaValidator static-analysis caches.
4+
5+
Reports, per strategy:
6+
* cold_us_per_node -- cost of deriving the cache key for every schema
7+
node (the once-per-construction work).
8+
* warm_us_per_node -- steady-state derive+lookup cost (the hot path).
9+
* distinct_keys / dedup_ratio -- how many cache slots the strategy
10+
produces; higher dedup = fewer recomputations of needs_state.
11+
12+
"per_route" is the master/baseline behaviour (key on the navigation
13+
SchemaPath, no $ref collapsing). "canonical" requires jsonschema-path
14+
PR #263. "id_content" mirrors docs/plans/v3-cache-refactor/_caches.py.
15+
"""
16+
from __future__ import annotations
17+
18+
import argparse
19+
import gc
20+
import json
21+
import statistics
22+
import time
23+
from dataclasses import dataclass, field
24+
from typing import Any, Callable, Dict, List, Optional, Tuple
25+
26+
from jsonschema_path import SchemaPath
27+
28+
HAS_CANONICAL = hasattr(SchemaPath, "canonical")
29+
30+
31+
def build_spec(schemas: int, depth: int, shared_targets: int) -> SchemaPath:
    """Build a synthetic OpenAPI 3.1 document and wrap it in a SchemaPath.

    ``$defs`` receives ``shared_targets`` entries named ``Leaf<i>``, each a
    uuid-formatted string schema. ``components/schemas`` receives
    ``schemas`` entries named ``Schema<i>``: an object whose ``properties``
    nest ``depth`` further objects (``level0`` .. ``level<depth-1>``), with
    the innermost ``properties`` holding four ``leaf<k>`` entries that
    ``$ref`` into ``$defs`` round-robin.
    """
    leaf_defs: Dict[str, Any] = {}
    for index in range(shared_targets):
        leaf_defs[f"Leaf{index}"] = {"type": "string", "format": "uuid"}

    component_schemas: Dict[str, Any] = {}
    for schema_index in range(schemas):
        root: Dict[str, Any] = {"type": "object", "properties": {}}
        innermost = root["properties"]
        # Descend one object level at a time, always tracking the deepest
        # "properties" mapping so the leaves land at the bottom.
        for level in range(depth):
            nested: Dict[str, Any] = {"type": "object", "properties": {}}
            innermost[f"level{level}"] = nested
            innermost = nested["properties"]
        for offset in range(4):
            leaf_name = f"Leaf{(schema_index + offset) % shared_targets}"
            innermost[f"leaf{offset}"] = {"$ref": f"#/$defs/{leaf_name}"}
        component_schemas[f"Schema{schema_index}"] = root

    return SchemaPath.from_dict(
        {
            "openapi": "3.1.0",
            "info": {"title": "bench-canonical", "version": "0"},
            "$defs": leaf_defs,
            "components": {"schemas": component_schemas},
        }
    )
55+
56+
57+
def collect_schema_paths(spec: SchemaPath) -> List[SchemaPath]:
    """List every schema node reachable through ``properties``.

    Starts from each entry of ``components/schemas`` and descends through
    ``properties`` only. Nodes are returned in pre-order (a node before
    its property sub-schemas). Entries whose name is not a ``str`` are
    skipped at every level.
    """

    def iter_nodes(node: SchemaPath):
        yield node
        if "properties" not in node:
            return
        for prop_name, child in (node / "properties").items():
            if isinstance(prop_name, str):
                yield from iter_nodes(child)

    collected: List[SchemaPath] = []
    for schema_name, top in (spec / "components" / "schemas").items():
        if isinstance(schema_name, str):
            collected.extend(iter_nodes(top))
    return collected
71+
72+
73+
def key_per_route(path: SchemaPath) -> Any:
    """Baseline (master) strategy: key on the navigation path itself.

    The given path is handed back unchanged, so no ``$ref`` collapsing
    takes place.
    """
    return path
75+
76+
77+
def key_canonical(path: SchemaPath) -> Optional[Tuple[int, Tuple[Any, ...]]]:
    """Return a hashable key for the canonical location of ``path``.

    The key pairs ``id()`` of the canonical path's accessor with the
    canonical path's parts as a tuple. ``None`` is returned when
    ``path.canonical()`` raises ``referencing.exceptions.Unresolvable``;
    any other exception propagates unchanged.
    """
    try:
        canon = path.canonical()
    except Exception as exc:
        # Import on the failure path only. A function-level import ahead of
        # the ``try`` would execute on every call and be charged to this
        # strategy's cold/warm timings, skewing the comparison against the
        # other key functions.
        from referencing.exceptions import Unresolvable

        if not isinstance(exc, Unresolvable):
            raise
        return None
    return (id(canon.accessor), tuple(canon.parts))
85+
86+
87+
def key_id_content(path: SchemaPath) -> Optional[int]:
    """Return ``id()`` of the contents ``path`` resolves to, or ``None``.

    Any ``Exception`` raised while resolving is swallowed and reported as
    ``None``, which the caller treats as "no key".
    """
    # NOTE(review): the broad ``except`` looks like deliberate best-effort
    # for the benchmark -- confirm before narrowing it.
    content_id: Optional[int] = None
    try:
        with path.resolve() as resolved:
            content_id = id(resolved.contents)
    except Exception:
        return None
    return content_id
93+
94+
95+
@dataclass
class StrategyResult:
    """Timing samples and key statistics for one cache-keying strategy.

    Attributes:
        name: Label of the strategy.
        nodes: Number of schema paths the strategy was run over.
        distinct_keys: Number of distinct non-``None`` keys produced.
        cold_seconds: Wall-clock seconds of each cold pass.
        warm_seconds: Wall-clock seconds of each warm pass.
    """

    name: str
    nodes: int
    distinct_keys: int
    cold_seconds: List[float] = field(default_factory=list)
    warm_seconds: List[float] = field(default_factory=list)

    def _us_per_node(self, samples: List[float]) -> float:
        """Median of ``samples`` as microseconds per node, 3 decimals.

        Returns ``0.0`` when there are no samples or no nodes, instead of
        raising ``StatisticsError`` / ``ZeroDivisionError``.
        """
        if not samples or not self.nodes:
            return 0.0
        return round(statistics.median(samples) / self.nodes * 1e6, 3)

    def as_dict(self) -> Dict[str, Any]:
        """Summarise the result as a JSON-serialisable mapping.

        ``dedup_ratio`` is ``nodes / distinct_keys`` rounded to 2 decimals,
        or ``0.0`` when the strategy produced no keys at all. The per-node
        costs are medians over the recorded passes (see ``_us_per_node``).
        """
        if self.distinct_keys:
            dedup_ratio = round(self.nodes / self.distinct_keys, 2)
        else:
            # No keys (e.g. zero repeats, or every key was None): report a
            # neutral ratio rather than dividing by zero.
            dedup_ratio = 0.0
        return {
            "name": self.name,
            "nodes": self.nodes,
            "distinct_keys": self.distinct_keys,
            "dedup_ratio": dedup_ratio,
            "cold_us_per_node": self._us_per_node(self.cold_seconds),
            "warm_us_per_node": self._us_per_node(self.warm_seconds),
        }
114+
115+
116+
def measure(name, paths, keyfn, repeats, warmup) -> StrategyResult:
    """Time ``keyfn`` over ``paths`` and return the collected samples.

    Three phases run in order:

    1. Cold -- ``repeats`` timed passes, each deriving a key for every
       path and collecting the non-``None`` ones in a fresh set. The size
       of the last pass's set is reported as ``distinct_keys``.
    2. Priming -- a dict is filled with every non-``None`` key, then
       ``warmup`` untimed derive+lookup passes are executed.
    3. Warm -- ``repeats`` timed derive+lookup passes against that dict.
    """
    cold_samples: List[float] = []
    distinct_count = 0
    for _ in range(repeats):
        unique_keys = set()
        started = time.perf_counter()
        for path in paths:
            key = keyfn(path)
            if key is not None:
                unique_keys.add(key)
        elapsed = time.perf_counter() - started
        cold_samples.append(elapsed)
        distinct_count = len(unique_keys)

    primed: Dict[Any, bool] = {
        key: True for key in map(keyfn, paths) if key is not None
    }
    for _ in range(warmup):
        for path in paths:
            primed.get(keyfn(path))

    warm_samples: List[float] = []
    for _ in range(repeats):
        started = time.perf_counter()
        for path in paths:
            primed.get(keyfn(path))
        warm_samples.append(time.perf_counter() - started)

    return StrategyResult(
        name=name,
        nodes=len(paths),
        distinct_keys=distinct_count,
        cold_seconds=cold_samples,
        warm_seconds=warm_samples,
    )
143+
144+
145+
def main() -> None:
    """Run the benchmark from the command line.

    Parses the CLI options, builds the synthetic spec, measures the
    ``per_route`` and ``id_content`` strategies (plus ``canonical`` when
    ``SchemaPath.canonical`` exists), prints the JSON report to stdout and,
    when ``--output`` is given, also writes it to that file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--schemas", type=int, default=500)
    parser.add_argument("--depth", type=int, default=3)
    parser.add_argument("--shared-targets", type=int, default=16)
    parser.add_argument("--repeats", type=int, default=7)
    parser.add_argument("--warmup", type=int, default=2)
    parser.add_argument("--output", type=str, default="")
    parser.add_argument("--no-gc", action="store_true")
    args = parser.parse_args()

    spec = build_spec(args.schemas, args.depth, args.shared_targets)
    paths = collect_schema_paths(spec)

    strategies = [
        ("per_route", key_per_route),
        ("id_content", key_id_content),
    ]
    if HAS_CANONICAL:
        strategies.append(("canonical", key_canonical))

    # Remember the collector's state so it is restored exactly, even when a
    # measurement raises, and is never switched on if it was off on entry.
    gc_was_enabled = gc.isenabled()
    if args.no_gc:
        gc.disable()
    try:
        results = [
            measure(label, paths, keyfn, args.repeats, args.warmup)
            for label, keyfn in strategies
        ]
    finally:
        if args.no_gc and gc_was_enabled:
            gc.enable()

    payload = {
        "config": {
            "schemas": args.schemas,
            "depth": args.depth,
            "shared_targets": args.shared_targets,
            "nodes": len(paths),
            "has_canonical": HAS_CANONICAL,
        },
        "strategies": [r.as_dict() for r in results],
    }
    print(json.dumps(payload, indent=2, sort_keys=True))
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, sort_keys=True)
188+
189+
190+
if __name__ == "__main__":
191+
main()

tests/unit/validation/test_schema_validators.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
oas30_write_schema_validators_factory,
66
)
77
from openapi_core.validation.schemas.exceptions import InvalidSchemaValue
8+
from openapi_core.validation.schemas.validators import _HAS_CANONICAL
89
from openapi_core.validation.schemas.validators import SchemaValidator
910

1011

@@ -478,11 +479,12 @@ def _build(schema_dict):
478479
return _build
479480

480481
@pytest.mark.xfail(
482+
condition=not _HAS_CANONICAL,
481483
strict=True,
482484
reason=(
483-
"The cache keys on the navigation path, so each $ref "
484-
"alias gets its own slot. Once the cache keys on canonical "
485-
"the aliases collapse to a single entry."
485+
"Without SchemaPath.canonical the cache keys on the navigation "
486+
"path, so each $ref alias gets its own slot. With canonical "
487+
"keying the aliases collapse to a single entry."
486488
),
487489
)
488490
def test_aliases_to_same_node_share_one_cache_slot(
@@ -497,4 +499,4 @@ def test_aliases_to_same_node_share_one_cache_slot(
497499
assert len(cache) == 1
498500
assert prop_a not in cache
499501
assert prop_b not in cache
500-
assert cache[canonical] is True
502+
assert cache[canonical] is True

0 commit comments

Comments
 (0)