|
| 1 | +"""Canonical detection of "binary" (opaque, non-text) string schemas. |
| 2 | +
|
| 3 | +OpenAPI expresses "this string carries raw, opaque bytes" in two |
| 4 | +different ways depending on the version: |
| 5 | +
|
| 6 | +* OpenAPI 3.0 uses ``type: string`` together with ``format: binary`` |
| 7 | + (raw octets) or ``format: byte`` (base64-encoded *text*). |
| 8 | +* OpenAPI 3.1+ aligns with JSON Schema 2020-12, where ``binary``/``byte`` |
| 9 | + are no longer defined formats. Raw bytes are described with |
| 10 | + ``contentMediaType`` (e.g. ``application/octet-stream``) and base64 |
| 11 | + payloads with ``contentEncoding: base64``. Per JSON Schema Validation |
| 12 | + §8 these content keywords are *annotations, not assertions*, so a |
| 13 | + conforming validator does not reject a value for carrying raw bytes. |
| 14 | +
|
| 15 | +This module is the single source of truth for that distinction so the |
| 16 | +deserializer, validator, unmarshaller and encoding helpers all agree on |
| 17 | +what counts as binary. ``byte``/``base64`` are deliberately *excluded* |
| 18 | +from the binary set: those are text (base64) and must keep flowing |
| 19 | +through the normal string/text code paths. |
| 20 | +
|
| 21 | +Predicates accept either a ``jsonschema_path.SchemaPath`` (used through |
| 22 | +most of openapi-core) or a plain ``Mapping`` (the shape jsonschema hands |
| 23 | +a custom keyword validator). |
| 24 | +""" |
| 25 | + |
| 26 | +from typing import Any |
| 27 | +from typing import Mapping |
| 28 | +from typing import Optional |
| 29 | +from typing import Union |
| 30 | + |
| 31 | +from jsonschema_path import SchemaPath |
| 32 | + |
# Schema ``format`` names that mark a string as base64-encoded *text*
# rather than opaque bytes. ``base64`` is tolerated as a synonym of
# ``byte``.
_BASE64_FORMATS = frozenset(("byte", "base64"))

# ``contentEncoding`` names (JSON Schema 2020-12) that mark base64 text.
_BASE64_ENCODINGS = frozenset(("base64", "base64url"))

# Either schema representation the predicates in this module accept.
SchemaLike = Union[SchemaPath, Mapping[str, Any]]
| 41 | + |
| 42 | + |
def _read_str(schema: SchemaLike, key: str) -> Optional[str]:
    """Return the string stored under ``key`` in ``schema``, else ``None``.

    For a ``SchemaPath`` the child node ``schema / key`` is asked via
    ``read_str(None)``. For a plain mapping, a missing key or any
    non-``str`` value is reported as ``None``.
    """
    if not isinstance(schema, SchemaPath):
        candidate = schema.get(key)
        if isinstance(candidate, str):
            return candidate
        return None
    child = schema / key
    return child.read_str(None)
| 48 | + |
| 49 | + |
def _read_type(schema: SchemaLike) -> Union[None, str, list[str]]:
    """Return the ``type`` keyword of ``schema`` as a string or a list.

    For a ``SchemaPath`` the ``type`` child node is asked via
    ``read_str_or_list(None)``. For a plain mapping the raw value is
    returned only when it is a ``str`` or a ``list``; a missing key or
    any other value yields ``None``.
    """
    if isinstance(schema, SchemaPath):
        type_node = schema / "type"
        return type_node.read_str_or_list(None)
    declared = schema.get("type")
    if isinstance(declared, (str, list)):
        return declared
    # Covers both an absent ``type`` and an unsupported value shape.
    return None
| 57 | + |
| 58 | + |
def type_allows_string(schema: SchemaLike) -> bool:
    """Report whether a ``string`` instance is permitted by ``schema``.

    When no ``type`` can be read the schema is considered permissive
    (OAS 3.1 / JSON Schema place no restriction on the value in that
    case), which keeps the binary/content keywords authoritative.
    A single type name must be exactly ``"string"``; a list of type
    names must contain ``"string"``.
    """
    declared = _read_type(schema)
    if declared is None:
        return True
    if not isinstance(declared, str):
        # List form, e.g. ``["string", "null"]``.
        return "string" in declared
    return declared == "string"
| 72 | + |
| 73 | + |
def is_base64_schema(schema: SchemaLike) -> bool:
    """Report whether ``schema`` describes base64-encoded *text*.

    True when ``format`` is one of the base64 format names or, failing
    that, when ``contentEncoding`` is one of the base64 encoding names.
    """
    return (
        _read_str(schema, "format") in _BASE64_FORMATS
        or _read_str(schema, "contentEncoding") in _BASE64_ENCODINGS
    )
| 79 | + |
| 80 | + |
def is_binary_schema(schema: SchemaLike) -> bool:
    """Report whether ``schema`` describes an opaque, non-text byte payload.

    Covers OAS 3.0 ``format: binary`` and OAS 3.1 ``contentMediaType``
    of a non-``text/*`` media type. Base64 text (``format: byte`` /
    ``base64`` or a base64 ``contentEncoding``) is explicitly excluded
    -- it stays on the normal text path.

    Args:
        schema: A ``SchemaPath`` or a plain mapping. Any other object is
            tolerated and simply reported as not binary.

    Returns:
        ``True`` only when the node permits strings, is not base64 text,
        and carries either ``format: binary`` or a ``contentMediaType``
        outside the ``text/`` top-level type.
    """
    if not isinstance(schema, (SchemaPath, Mapping)):
        return False
    if not type_allows_string(schema):
        return False
    if is_base64_schema(schema):
        return False
    if _read_str(schema, "format") == "binary":
        return True
    content_media_type = _read_str(schema, "contentMediaType")
    if content_media_type is None:
        return False
    # Media type names are case-insensitive (RFC 2045 section 5.1), so
    # ``Text/Plain`` must be recognised as text just like ``text/plain``.
    return not content_media_type.lower().startswith("text/")
0 commit comments