Skip to content

Commit 002e237

Browse files
committed
SchemaValidator canonical cache key
1 parent 0337b43 commit 002e237

3 files changed

Lines changed: 266 additions & 23 deletions

File tree

openapi_core/validation/schemas/validators.py

Lines changed: 63 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from jsonschema.exceptions import FormatError
1111
from jsonschema.protocols import Validator
1212
from jsonschema_path import SchemaPath
13+
from referencing.exceptions import Unresolvable
1314

1415
from openapi_core.validation.schemas.datatypes import (
1516
_EMPTY_STATE_TUPLE as _EMPTY_STATES_TUPLE,
@@ -27,6 +28,11 @@
2728

2829
log = logging.getLogger(__name__)
2930

31+
# Feature-detect jsonschema-path SchemaPath.canonical(). When present
32+
# the needs_state cache keys on the resolved (canonical) location; otherwise it
33+
# degrades to per-route keying.
34+
_HAS_CANONICAL = hasattr(SchemaPath, "canonical")
35+
3036

3137
class SchemaValidator:
3238
def __init__(
@@ -48,17 +54,50 @@ def validate(self, value: Any) -> None:
4854
raise InvalidSchemaValue(value, schema_type, schema_errors=errors)
4955

5056
# Cache the recursive "does this schema benefit from a ValidationState?"
51-
# check, keyed on the SchemaPath. Under jsonschema-path 0.5 (pathable
52-
# 0.6) SchemaPath is an AccessorPath whose identity is
53-
# (parts, accessor), and SchemaAccessor in turn hashes/compares on
54-
# id(node) and id(path_resolver). The key is therefore effectively
55-
# per-resolver: two SchemaPaths share a cache slot only when they
56-
# address the same location *within the same loaded spec*, never
57-
# across distinct specs that merely share a JSON-pointer path.
58-
# Entries are bounded by the number of distinct schema shapes per
59-
# spec and become collectable once the owning resolver is GC'd.
57+
# check, keyed on the schema's *canonical* SchemaPath -- the location the
58+
# $ref chain ultimately resolves to (jsonschema-path SchemaPath.canonical).
59+
# Canonical keying buys two properties at once:
60+
#
61+
# * Cross-route dedup -- every $ref alias of one target collapses to a
62+
# single slot, so the recursive walk runs once per distinct resolved
63+
# shape instead of once per navigation path.
64+
# * Cross-spec safety -- SchemaPath identity is (parts, accessor), and
65+
# canonical() hands back a shared accessor per resolved document, so
66+
# two independently loaded specs that merely share a JSON pointer
67+
# never collide.
68+
#
69+
# Entries are bounded by the number of distinct resolved schema shapes
70+
# across loaded specs. On jsonschema-path builds without canonical(), or
71+
# for a node whose $ref is unresolvable, keying degrades to the navigation
72+
# path. A $dynamicRef-only node yields no cross-target dedup either, since
73+
# canonical() returns it as-is rather than following the dynamic ref. None
74+
# of these cases raise.
6075
_needs_state_cache: dict[SchemaPath, bool] = {}
6176

77+
# Memoise the (cheap-keyed) navigation-path -> canonical-path mapping so a
78+
# warm needs_state lookup pays one cheap SchemaPath hash instead of
79+
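# re-walking the $ref chain every call. Unlike _needs_state_cache, this map
# is keyed on the navigation path itself, so it holds one entry per distinct
# route rather than one per resolved schema shape.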
80+
_canonical_key_cache: dict[SchemaPath, SchemaPath] = {}
81+
82+
@classmethod
def _needs_state_key(cls, schema: SchemaPath) -> SchemaPath:
    """Return the key under which ``schema`` is stored in the needs-state
    cache.

    When SchemaPath exposes ``canonical()`` the key is the canonical path
    returned for ``schema``; the navigation-path -> canonical-path mapping
    is memoised in ``_canonical_key_cache`` so repeat lookups skip the
    ``canonical()`` call. Without ``canonical()``, or when resolution
    raises ``Unresolvable``, ``schema`` itself is the key.
    """
    # Older jsonschema-path builds: nothing to resolve, key on the route.
    if not _HAS_CANONICAL:
        return schema
    memo = cls._canonical_key_cache
    known = memo.get(schema)
    if known is not None:
        return known
    try:
        resolved = schema.canonical()
    except Unresolvable:
        # Broken $ref: fall back to the navigation path (and memoise that
        # too, so the failing resolution is not retried on every call).
        resolved = schema
    memo[schema] = resolved
    return resolved
100+
62101
@classmethod
63102
def _schema_needs_state(cls, schema: SchemaPath) -> bool:
64103
"""True if building a ValidationState for ``schema`` carries
@@ -70,42 +109,46 @@ def _schema_needs_state(cls, schema: SchemaPath) -> bool:
70109
sentinel once the recursion completes.
71110
"""
72111
cache = cls._needs_state_cache
73-
cached = cache.get(schema)
112+
key = cls._needs_state_key(schema)
113+
cached = cache.get(key)
74114
if cached is not None:
75115
return cached
116+
# Walk the canonical target so composition behind a $ref is seen even
117+
# when the source node only carries "$ref"; identical for non-$ref.
118+
node = key
76119
# Self-composition is the strongest signal; check it first to
77120
# short-circuit the cheap case.
78-
if "oneOf" in schema or "anyOf" in schema or "allOf" in schema:
79-
cache[schema] = True
121+
if "oneOf" in node or "anyOf" in node or "allOf" in node:
122+
cache[key] = True
80123
return True
81124
# Seed the in-progress sentinel for cycle protection.
82-
cache[schema] = False
125+
cache[key] = False
83126
# Recurse into children. We only need to find one descendant
84127
# that needs state to flip our own answer.
85128
result = False
86-
if "properties" in schema:
87-
prop_iter = (schema / "properties").items()
129+
if "properties" in node:
130+
prop_iter = (node / "properties").items()
88131
for prop_name, prop_schema in prop_iter:
89132
if not isinstance(prop_name, str):
90133
continue
91134
if cls._schema_needs_state(prop_schema):
92135
result = True
93136
break
94-
if not result and "additionalProperties" in schema:
137+
if not result and "additionalProperties" in node:
95138
try:
96-
ap = schema / "additionalProperties"
139+
ap = node / "additionalProperties"
97140
except Exception:
98141
ap = None
99142
if ap is not None and cls._schema_needs_state(ap):
100143
result = True
101-
if not result and "items" in schema:
144+
if not result and "items" in node:
102145
try:
103-
items = schema / "items"
146+
items = node / "items"
104147
except Exception:
105148
items = None
106149
if items is not None and cls._schema_needs_state(items):
107150
result = True
108-
cache[schema] = result
151+
cache[key] = result
109152
return result
110153

111154
def validate_state(self, value: Any) -> ValidationState:
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
#!/usr/bin/env python3
2+
"""Benchmark: canonical-path keying vs id(content) keying vs per-route
3+
(navigation-path) keying for the SchemaValidator static-analysis caches.
4+
5+
Reports, per strategy:
6+
* cold_us_per_node -- cost of deriving the cache key for every schema
7+
node (the once-per-construction work).
8+
* warm_us_per_node -- steady-state derive+lookup cost (the hot path).
9+
* distinct_keys / dedup_ratio -- how many cache slots the strategy
10+
produces; higher dedup = fewer recomputations of needs_state.
11+
12+
"per_route" is the master/baseline behaviour (key on the navigation
13+
SchemaPath, no $ref collapsing). "canonical" requires jsonschema-path
14+
PR #263. "id_content" mirrors docs/plans/v3-cache-refactor/_caches.py.
15+
"""
16+
17+
from __future__ import annotations

import argparse
import gc
import json
import statistics
import time
from dataclasses import dataclass
from dataclasses import field
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

from jsonschema_path import SchemaPath
from referencing.exceptions import Unresolvable
34+
35+
HAS_CANONICAL = hasattr(SchemaPath, "canonical")
36+
37+
38+
def build_spec(schemas: int, depth: int, shared_targets: int) -> SchemaPath:
    """Build the synthetic OpenAPI 3.1 spec the benchmark walks.

    The spec has ``shared_targets`` leaf schemas under ``$defs``
    (``Leaf0`` .. ``Leaf<n-1>``, each a uuid-formatted string) and
    ``schemas`` entries under ``components/schemas``. Each entry is an
    object nested ``depth`` levels deep through ``level<d>`` properties;
    its innermost object carries four ``leaf<k>`` properties, each a
    ``$ref`` to ``#/$defs/Leaf<(s + k) % shared_targets>``.

    Args:
        schemas: number of ``Schema<s>`` component schemas to generate.
        depth: number of nested ``level<d>`` objects per component schema.
        shared_targets: number of distinct ``$defs`` leaves the ``$ref``
            properties are spread over.

    Returns:
        A SchemaPath rooted at the generated spec dict.

    Raises:
        ValueError: if component schemas are requested but
            ``shared_targets`` is below 1, since every leaf ``$ref`` would
            then have no valid target.
    """
    # The modulo below needs a positive divisor: 0 would raise a bare
    # ZeroDivisionError and a negative value would point the $refs at
    # Leaf names that were never defined.
    if schemas > 0 and shared_targets < 1:
        raise ValueError(
            f"shared_targets must be >= 1, got {shared_targets}"
        )
    defs: dict[str, Any] = {
        f"Leaf{t}": {"type": "string", "format": "uuid"}
        for t in range(shared_targets)
    }
    components: dict[str, Any] = {}
    for s in range(schemas):
        node: dict[str, Any] = {"type": "object", "properties": {}}
        # ``cursor`` always points at the "properties" dict of the deepest
        # object built so far.
        cursor = node["properties"]
        for d in range(depth):
            child: dict[str, Any] = {"type": "object", "properties": {}}
            cursor[f"level{d}"] = child
            cursor = child["properties"]
        for k in range(4):
            target = f"Leaf{(s + k) % shared_targets}"
            cursor[f"leaf{k}"] = {"$ref": f"#/$defs/{target}"}
        components[f"Schema{s}"] = node
    spec_dict = {
        "openapi": "3.1.0",
        "info": {"title": "bench-canonical", "version": "0"},
        "$defs": defs,
        "components": {"schemas": components},
    }
    return SchemaPath.from_dict(spec_dict)
62+
63+
64+
def collect_schema_paths(spec: SchemaPath) -> list[SchemaPath]:
    """Return every schema node reachable under ``components/schemas``.

    Nodes are listed in pre-order: each component schema, followed by the
    nodes found by descending through its ``properties``. Entries whose
    key is not a string are skipped.
    """

    def descend(node: SchemaPath):
        # Pre-order: the node itself first, then each property subtree.
        yield node
        if "properties" not in node:
            return
        for key, child in (node / "properties").items():
            if isinstance(key, str):
                yield from descend(child)

    collected: list[SchemaPath] = []
    for key, top_level in (spec / "components" / "schemas").items():
        if isinstance(key, str):
            collected.extend(descend(top_level))
    return collected
78+
79+
80+
def key_per_route(path: SchemaPath) -> Any:
    """Baseline strategy: the navigation path is its own cache key."""
    # master/baseline: navigation path identity, no $ref collapsing
    navigation_key = path
    return navigation_key
82+
83+
84+
def key_canonical(path: SchemaPath) -> tuple[int, tuple[Any, ...]] | None:
    """Canonical strategy: key on the path returned by ``path.canonical()``.

    Returns:
        ``(id(canon.accessor), tuple(canon.parts))`` for the canonical
        path, or ``None`` when ``canonical()`` raises ``Unresolvable``.

    ``Unresolvable`` is imported at module level rather than inside this
    function: the function runs once per node inside the timed loops, and
    a function-scope import statement would add its own per-call cost to
    this strategy's figures only.
    """
    try:
        canon = path.canonical()
    except Unresolvable:
        return None
    return (id(canon.accessor), tuple(canon.parts))
92+
93+
94+
def key_id_content(path: SchemaPath) -> int | None:
    """id(content) strategy: key on the identity of the resolved contents.

    Returns ``id(resolved.contents)`` for the object yielded by
    ``path.resolve()``, or ``None`` if resolving (or reading the contents)
    raises any ``Exception``.
    """
    try:
        with path.resolve() as target:
            content_id = id(target.contents)
    except Exception:
        # Best-effort by design: an unresolvable node simply gets no key.
        return None
    return content_id
100+
101+
102+
@dataclass
class StrategyResult:
    """Raw measurements for one cache-keying strategy."""

    # Strategy label used in the report.
    name: str
    # Number of schema nodes the key function was applied to.
    nodes: int
    # Number of distinct non-None keys the strategy produced.
    distinct_keys: int
    # Wall-clock seconds per repeat of the key-derivation pass.
    cold_seconds: list[float] = field(default_factory=list)
    # Wall-clock seconds per repeat of the derive+lookup pass.
    warm_seconds: list[float] = field(default_factory=list)

    def _us_per_node(self, samples: list[float]) -> float | None:
        """Median of ``samples`` in microseconds per node, or ``None`` when
        there are no samples or no nodes to divide by."""
        if not samples or not self.nodes:
            return None
        return round(statistics.median(samples) / self.nodes * 1e6, 3)

    def as_dict(self) -> dict[str, Any]:
        """Summarise the measurements as a JSON-serialisable dict.

        ``dedup_ratio`` is ``nodes / distinct_keys`` rounded to two places;
        the ``*_us_per_node`` values are the median sample converted to
        microseconds per node, rounded to three places. Any figure that
        cannot be computed (no distinct keys, no nodes, or no timing
        samples) is reported as ``None`` instead of raising.
        """
        if self.distinct_keys:
            dedup_ratio = round(self.nodes / self.distinct_keys, 2)
        else:
            dedup_ratio = None
        return {
            "name": self.name,
            "nodes": self.nodes,
            "distinct_keys": self.distinct_keys,
            "dedup_ratio": dedup_ratio,
            "cold_us_per_node": self._us_per_node(self.cold_seconds),
            "warm_us_per_node": self._us_per_node(self.warm_seconds),
        }
121+
122+
123+
def measure(
    name: str,
    paths: list[SchemaPath],
    keyfn: Callable[[SchemaPath], Any],
    repeats: int,
    warmup: int,
) -> StrategyResult:
    """Time one keying strategy over ``paths``.

    Args:
        name: label stored on the returned result.
        paths: schema nodes to derive keys for.
        keyfn: strategy under test; maps a path to a hashable key, or
            ``None`` when the node yields no key.
        repeats: number of timed passes for both the cold and warm phase.
        warmup: number of untimed derive+lookup passes run before the
            warm phase.

    Returns:
        A StrategyResult holding the per-repeat cold timings (derive a key
        for every path and collect it in a set), the per-repeat warm
        timings (derive a key and look it up in a pre-populated dict), and
        the number of distinct non-None keys.
    """
    cold: list[float] = []
    for _ in range(repeats):
        seen = set()
        t0 = time.perf_counter()
        for p in paths:
            k = keyfn(p)
            if k is not None:
                seen.add(k)
        cold.append(time.perf_counter() - t0)
    # Populate the lookup table the warm phase reads from.
    cache: dict[Any, bool] = {}
    for p in paths:
        k = keyfn(p)
        if k is not None:
            cache[k] = True
    # Count distinct keys from the populated table rather than from the
    # timed loop, so the figure is still correct when ``repeats`` is 0.
    distinct = len(cache)
    for _ in range(warmup):
        for p in paths:
            cache.get(keyfn(p))
    warm: list[float] = []
    for _ in range(repeats):
        t0 = time.perf_counter()
        for p in paths:
            cache.get(keyfn(p))
        warm.append(time.perf_counter() - t0)
    return StrategyResult(name, len(paths), distinct, cold, warm)
150+
151+
152+
def main() -> None:
    """Parse CLI options, run every available strategy, and emit the report.

    The report is printed to stdout as indented, key-sorted JSON and, when
    ``--output`` is given, also written to that file. The "canonical"
    strategy is only measured when SchemaPath provides ``canonical()``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--schemas", type=int, default=500)
    parser.add_argument("--depth", type=int, default=3)
    parser.add_argument("--shared-targets", type=int, default=16)
    parser.add_argument("--repeats", type=int, default=7)
    parser.add_argument("--warmup", type=int, default=2)
    parser.add_argument("--output", type=str, default="")
    parser.add_argument("--no-gc", action="store_true")
    args = parser.parse_args()

    spec = build_spec(args.schemas, args.depth, args.shared_targets)
    paths = collect_schema_paths(spec)

    strategies = [
        ("per_route", key_per_route),
        ("id_content", key_id_content),
    ]
    if HAS_CANONICAL:
        strategies.append(("canonical", key_canonical))

    # Remember the collector's prior state so --no-gc never switches on a
    # collector that was already off, and always undo the disable even if
    # a measurement raises.
    gc_was_enabled = gc.isenabled()
    if args.no_gc:
        gc.disable()
    try:
        results = [
            measure(label, paths, keyfn, args.repeats, args.warmup)
            for label, keyfn in strategies
        ]
    finally:
        if args.no_gc and gc_was_enabled:
            gc.enable()

    payload = {
        "config": {
            "schemas": args.schemas,
            "depth": args.depth,
            "shared_targets": args.shared_targets,
            "nodes": len(paths),
            "has_canonical": HAS_CANONICAL,
        },
        "strategies": [r.as_dict() for r in results],
    }
    print(json.dumps(payload, indent=2, sort_keys=True))
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, sort_keys=True)
195+
196+
197+
if __name__ == "__main__":
198+
main()

tests/unit/validation/test_schema_validators.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
oas30_write_schema_validators_factory,
66
)
77
from openapi_core.validation.schemas.exceptions import InvalidSchemaValue
8+
from openapi_core.validation.schemas.validators import _HAS_CANONICAL
89
from openapi_core.validation.schemas.validators import SchemaValidator
910

1011

@@ -478,11 +479,12 @@ def _build(schema_dict):
478479
return _build
479480

480481
@pytest.mark.xfail(
482+
condition=not _HAS_CANONICAL,
481483
strict=True,
482484
reason=(
483-
"The cache keys on the navigation path, so each $ref "
484-
"alias gets its own slot. Once the cache keys on canonical "
485-
"the aliases collapse to a single entry."
485+
"Without SchemaPath.canonical the cache keys on the navigation "
486+
"path, so each $ref alias gets its own slot. With canonical "
487+
"keying the aliases collapse to a single entry."
486488
),
487489
)
488490
def test_aliases_to_same_node_share_one_cache_slot(

0 commit comments

Comments
 (0)