Skip to content

Commit f4d7b26

Browse files
committed
Treat binary string schemas as opaque bytes
1 parent 0337b43 commit f4d7b26

11 files changed

Lines changed: 399 additions & 23 deletions

File tree

openapi_core/deserializing/media_types/deserializers.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from openapi_core.deserializing.styles.factories import (
1919
StyleDeserializersFactory,
2020
)
21+
from openapi_core.schema.binary import is_binary_schema
2122
from openapi_core.schema.encodings import get_content_type
2223
from openapi_core.schema.parameters import get_style_and_explode
2324
from openapi_core.schema.protocols import SuportsGetAll
@@ -294,12 +295,15 @@ def should_decode_multipart_form_value(
294295
prop_schema: SchemaPath,
295296
) -> bool:
296297
schema_type = (prop_schema / "type").read_str(None)
297-
schema_format = (prop_schema / "format").read_str(None)
298298

299299
if schema_type in ["integer", "number", "boolean"]:
300300
return True
301301

302-
return schema_type == "string" and schema_format != "binary"
302+
# A string part is decoded to text unless it is an opaque binary
303+
# payload. Routing through the canonical predicate means OAS 3.1
304+
# ``contentMediaType`` binary parts are left as raw bytes too --
305+
# keying on ``format != "binary"`` alone would wrongly decode them.
306+
return schema_type == "string" and not is_binary_schema(prop_schema)
303307

304308
def decode_multipart_form_value(self, value: bytes) -> str:
305309
try:

openapi_core/schema/binary.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""Canonical detection of "binary" (opaque, non-text) string schemas.
2+
3+
OpenAPI expresses "this string carries raw, opaque bytes" in two
4+
different ways depending on the version:
5+
6+
* OpenAPI 3.0 uses ``type: string`` together with ``format: binary``
7+
(raw octets) or ``format: byte`` (base64-encoded *text*).
8+
* OpenAPI 3.1+ aligns with JSON Schema 2020-12, where ``binary``/``byte``
9+
are no longer defined formats. Raw bytes are described with
10+
``contentMediaType`` (e.g. ``application/octet-stream``) and base64
11+
payloads with ``contentEncoding: base64``. Per JSON Schema Validation
12+
§8 these content keywords are *annotations, not assertions*, so a
13+
conforming validator does not reject a value for carrying raw bytes.
14+
15+
This module is the single source of truth for that distinction so the
16+
deserializer, validator, unmarshaller and encoding helpers all agree on
17+
what counts as binary. ``byte``/``base64`` are deliberately *excluded*
18+
from the binary set: those are text (base64) and must keep flowing
19+
through the normal string/text code paths.
20+
21+
Predicates accept either a ``jsonschema_path.SchemaPath`` (used through
22+
most of openapi-core) or a plain ``Mapping`` (the shape jsonschema hands
23+
a custom keyword validator).
24+
"""
25+
26+
from typing import Any
27+
from typing import Mapping
28+
from typing import Optional
29+
from typing import Union
30+
31+
from jsonschema_path import SchemaPath
32+
33+
# ``format`` values denoting base64-encoded *text* (NOT opaque bytes).
34+
# ``base64`` is an accepted alternate spelling for ``byte``.
35+
_BASE64_FORMATS = frozenset({"byte", "base64"})
36+
37+
# JSON Schema 2020-12 ``contentEncoding`` values denoting base64 text.
38+
_BASE64_ENCODINGS = frozenset({"base64", "base64url"})
39+
40+
SchemaLike = Union[SchemaPath, Mapping[str, Any]]
41+
42+
43+
def _read_str(schema: SchemaLike, key: str) -> Optional[str]:
44+
if isinstance(schema, SchemaPath):
45+
return (schema / key).read_str(None)
46+
value = schema.get(key)
47+
return value if isinstance(value, str) else None
48+
49+
50+
def _read_type(schema: SchemaLike) -> Union[None, str, list[str]]:
51+
if isinstance(schema, SchemaPath):
52+
return (schema / "type").read_str_or_list(None)
53+
value = schema.get("type")
54+
if value is None or isinstance(value, (str, list)):
55+
return value
56+
return None
57+
58+
59+
def type_allows_string(schema: SchemaLike) -> bool:
60+
"""True if a ``string`` instance is permitted at this schema node.
61+
62+
A missing ``type`` is treated as permissive (OAS 3.1 / JSON Schema
63+
leaves any value allowed), so the binary/content keywords remain
64+
authoritative.
65+
"""
66+
types = _read_type(schema)
67+
if types is None:
68+
return True
69+
if isinstance(types, str):
70+
return types == "string"
71+
return "string" in types
72+
73+
74+
def is_base64_schema(schema: SchemaLike) -> bool:
75+
"""True when the schema describes base64-encoded *text*."""
76+
if _read_str(schema, "format") in _BASE64_FORMATS:
77+
return True
78+
return _read_str(schema, "contentEncoding") in _BASE64_ENCODINGS
79+
80+
81+
def is_binary_schema(schema: SchemaLike) -> bool:
82+
"""True when the schema describes an opaque, non-text byte payload.
83+
84+
Covers OAS 3.0 ``format: binary`` and OAS 3.1
85+
``contentMediaType`` of a non-``text/*`` media type. Base64 text
86+
(``format: byte``/``base64`` or ``contentEncoding``) is explicitly
87+
excluded -- it stays on the normal text path.
88+
"""
89+
if not isinstance(schema, (SchemaPath, Mapping)):
90+
return False
91+
if not type_allows_string(schema):
92+
return False
93+
if is_base64_schema(schema):
94+
return False
95+
if _read_str(schema, "format") == "binary":
96+
return True
97+
content_media_type = _read_str(schema, "contentMediaType")
98+
if content_media_type is not None and not content_media_type.startswith(
99+
"text/"
100+
):
101+
return True
102+
return False

openapi_core/schema/encodings.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from jsonschema_path import SchemaPath
55

6+
from openapi_core.schema.binary import is_binary_schema
7+
68

79
def get_content_type(
810
prop_schema: SchemaPath, encoding: Optional[SchemaPath]
@@ -26,8 +28,11 @@ def get_default_content_type(
2628
if prop_type is None:
2729
return "text/plain" if encoding else "application/octet-stream"
2830

29-
prop_format = (prop_schema / "format").read_str(None)
30-
if prop_type == "string" and prop_format in ["binary", "base64"]:
31+
if prop_type == "string" and is_binary_schema(prop_schema):
32+
# Opaque binary (OAS 3.0 ``format: binary`` or OAS 3.1
33+
# ``contentMediaType``) defaults to octet-stream. base64 text
34+
# (``byte``/``base64``/``contentEncoding``) is NOT binary and
35+
# falls through to ``text/plain`` below.
3136
return "application/octet-stream"
3237

3338
if prop_type == "object":

openapi_core/templating/media_types/finders.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,10 @@ def _parse_parameter(self, parameter: str) -> Tuple[str, str]:
6161
except "charset" which is case-insensitive
6262
https://www.rfc-editor.org/rfc/rfc2046#section-4.1.2
6363
"""
64-
name, value = parameter.split("=")
64+
# Split on the first "=" only: a parameter value may itself
65+
# contain "=" (e.g. a multipart ``boundary`` like
66+
# ``===============123==``), which RFC 2046 §5.1.1 allows.
67+
name, value = parameter.split("=", 1)
6568
name = name.lower().lstrip()
6669
# remove surrounding quotes from value
6770
value = re.sub('^"(.*)"$', r"\1", value, count=1)

openapi_core/validation/schemas/_validators.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from typing import Any
2+
from typing import Callable
23
from typing import Iterator
34
from typing import Mapping
5+
from typing import Optional
46
from typing import cast
57

68
from jsonschema._utils import extras_msg
@@ -9,6 +11,10 @@
911
from jsonschema.protocols import Validator
1012
from jsonschema.validators import extend
1113

14+
from openapi_core.schema.binary import is_binary_schema
15+
16+
_KeywordValidator = Callable[[Any, Any, Any, Mapping[str, Any]], Iterator[Any]]
17+
1218

1319
def build_forbid_unspecified_additional_properties_validator(
1420
validator_class: type[Validator],
@@ -83,6 +89,82 @@ def iter_missing_additional_properties_errors(
8389
yield ValidationError(error % extras_msg(extras))
8490

8591

92+
def build_binary_aware_validator(
93+
validator_class: type[Validator],
94+
) -> type[Validator]:
95+
"""Extend ``validator_class`` so raw ``bytes`` validate against
96+
"binary" string schemas.
97+
98+
OpenAPI lets a ``bytes`` payload flow through a ``type: string``
99+
schema whose ``format``/``contentMediaType`` marks it as opaque
100+
binary (file uploads, ``application/octet-stream`` bodies). Plain
101+
jsonschema rejects ``bytes`` as a non-``string`` and then crashes or
102+
misfires on ``pattern``/length/``enum``/``format`` keywords.
103+
104+
We treat the byte payload as *opaque* -- consistent with JSON Schema
105+
2020-12 where ``contentMediaType``/``contentEncoding`` are
106+
annotations, not assertions. The ``type`` keyword accepts the bytes,
107+
and the string-only assertion keywords are skipped *only* at a
108+
binary node holding bytes. Every other instance/keyword combination
109+
is delegated unchanged to the wrapped validator, so behaviour for
110+
ordinary strings (including the invariant that ``bytes`` is still
111+
rejected against a plain ``type: string``) is preserved. Because the
112+
binary branch now validates, jsonschema's own ``oneOf``/``anyOf``/
113+
``allOf`` selection picks it without any parallel value walk.
114+
"""
115+
type_validator = validator_class.VALIDATORS.get("type")
116+
117+
def _is_opaque_binary(instance: Any, schema: Mapping[str, Any]) -> bool:
118+
return isinstance(instance, bytes) and is_binary_schema(schema)
119+
120+
def binary_aware_type(
121+
validator: Any,
122+
data_type: Any,
123+
instance: Any,
124+
schema: Mapping[str, Any],
125+
) -> Iterator[Any]:
126+
if _is_opaque_binary(instance, schema):
127+
return
128+
if type_validator is not None:
129+
yield from type_validator(validator, data_type, instance, schema)
130+
131+
def _skip_binary(
132+
original: Optional[_KeywordValidator],
133+
) -> _KeywordValidator:
134+
def keyword(
135+
validator: Any,
136+
keyword_value: Any,
137+
instance: Any,
138+
schema: Mapping[str, Any],
139+
) -> Iterator[Any]:
140+
if _is_opaque_binary(instance, schema):
141+
return
142+
if original is not None:
143+
yield from original(validator, keyword_value, instance, schema)
144+
145+
return keyword
146+
147+
validators: dict[str, _KeywordValidator] = {"type": binary_aware_type}
148+
# String-only assertion keywords: harmless to skip on an opaque byte
149+
# payload, and ``pattern`` in particular raises ``TypeError`` when
150+
# applied to ``bytes``.
151+
for keyword_name in (
152+
"pattern",
153+
"minLength",
154+
"maxLength",
155+
"enum",
156+
"format",
157+
):
158+
validators[keyword_name] = _skip_binary(
159+
validator_class.VALIDATORS.get(keyword_name)
160+
)
161+
162+
return cast(
163+
type[Validator],
164+
extend(validator_class, validators=validators),
165+
)
166+
167+
86168
def build_enforce_properties_required_validator(
87169
validator_class: type[Validator],
88170
) -> type[Validator]:

openapi_core/validation/schemas/factories.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
from jsonschema.validators import validator_for
1010
from jsonschema_path import SchemaPath
1111

12+
from openapi_core.validation.schemas._validators import (
13+
build_binary_aware_validator,
14+
)
1215
from openapi_core.validation.schemas._validators import (
1316
build_enforce_properties_required_validator,
1417
)
@@ -74,6 +77,11 @@ def create(
7477
enforce_properties_required: bool = False,
7578
) -> SchemaValidator:
7679
validator_cls: type[Validator] = self.get_validator_cls(spec, schema)
80+
# Always binary-aware: a ``bytes`` payload must validate against a
81+
# binary string schema (file uploads, octet-stream bodies). Applied
82+
# first so the conditional extensions below chain their own ``type``
83+
# override on top of the binary-aware one.
84+
validator_cls = build_binary_aware_validator(validator_cls)
7785
if enforce_properties_required:
7886
validator_cls = build_enforce_properties_required_validator(
7987
validator_cls

openapi_core/validation/schemas/validators.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from jsonschema.protocols import Validator
1212
from jsonschema_path import SchemaPath
1313

14+
from openapi_core.schema.binary import is_binary_schema
1415
from openapi_core.validation.schemas.datatypes import (
1516
_EMPTY_STATE_TUPLE as _EMPTY_STATES_TUPLE,
1617
)
@@ -272,6 +273,14 @@ def get_primitive_type(self, value: Any) -> Optional[str]:
272273
schema_types = sorted(self.validator.TYPE_CHECKER._type_checkers)
273274
assert isinstance(schema_types, list)
274275
for schema_type in schema_types:
276+
if schema_type == "string" and isinstance(value, bytes):
277+
# OpenAPI lets raw ``bytes`` satisfy a binary string
278+
# schema. jsonschema's type checker doesn't know that
279+
# convention, so resolve it here to keep primitive-type
280+
# detection (and downstream format discovery) consistent
281+
# with the binary-aware validator.
282+
if is_binary_schema(self.schema):
283+
return "string"
275284
result = self.type_validator(value, type_override=schema_type)
276285
if not result:
277286
continue

tests/integration/unmarshalling/test_request_unmarshaller.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -469,12 +469,6 @@ def test_request_body_with_object_default(self):
469469
assert result.errors == []
470470
assert result.body == {"tags": []}
471471

472-
@pytest.mark.xfail(
473-
reason=(
474-
"multipart composed-schema branch selection is not binary-aware"
475-
),
476-
strict=True,
477-
)
478472
def test_request_body_multipart_oneof_binary_field(self):
479473
from openapi_core import OpenAPI
480474

tests/unit/deserializing/test_media_types_deserializers.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -657,12 +657,6 @@ def test_urlencoded_form_with_array_default(self, deserializer_factory):
657657

658658
assert result == {"tags": []}
659659

660-
@pytest.mark.xfail(
661-
reason=(
662-
"multipart composed-schema branch selection is not binary-aware"
663-
),
664-
strict=True,
665-
)
666660
def test_multipart_oneof_binary_field(self, spec, deserializer_factory):
667661
mimetype = "multipart/form-data"
668662
schema_dict = {
@@ -757,12 +751,6 @@ def test_multipart_oneof_string_field(self, spec, deserializer_factory):
757751
"fieldA": "value",
758752
}
759753

760-
@pytest.mark.xfail(
761-
reason=(
762-
"multipart composed-schema branch selection is not binary-aware"
763-
),
764-
strict=True,
765-
)
766754
def test_multipart_anyof_binary_field(self, spec, deserializer_factory):
767755
mimetype = "multipart/form-data"
768756
schema_dict = {

0 commit comments

Comments
 (0)