From 795c8c8814cae22fc099ccde6f04d0cbdd974441 Mon Sep 17 00:00:00 2001 From: Zhengguo Yang Date: Fri, 6 Feb 2026 15:57:02 +0800 Subject: [PATCH 1/6] feat: introduce JSONPath module and batch query APIs * Add new JSONPath headers (dom/dump/ondemand) and GetByJsonPaths to query multiple paths with a single parse * Refactor GetByJsonPath by extracting shared logic into GetByJsonPathInternal to avoid redundant DOM parsing * Expand SIMD scanning/skip/quote/unicode support across x86 and Arm (including SVE2-128) for JSONPath workloads * Add comprehensive jsonpath and json_tuple tests (including batch and invalid-path coverage) * Update build and tooling: Bazel/bzlmod files, CMake scripts, CI workflows, and pre-commit configuration --- .github/workflows/clang-format-check.yml | 5 + .github/workflows/test_arm.yml | 8 +- .github/workflows/test_coverage.yml | 4 + .github/workflows/test_x86.yml | 4 + benchmark/main.cpp | 67 +- benchmark/ondemand.hpp | 3 +- include/sonic/dom/dynamicnode.h | 163 +++- include/sonic/dom/flags.h | 86 +- include/sonic/dom/generic_document.h | 35 +- include/sonic/dom/genericnode.h | 87 +- include/sonic/dom/handler.h | 19 + include/sonic/dom/parser.h | 225 ++++- include/sonic/dom/schema_handler.h | 24 +- include/sonic/dom/serialize.h | 56 +- include/sonic/dom/type.h | 17 +- include/sonic/error.h | 30 +- include/sonic/experiment/lazy_update.h | 27 +- include/sonic/internal/arch/avx2/unicode.h | 13 +- .../internal/arch/common/arm_common/quote.h | 34 +- .../sonic/internal/arch/common/quote_common.h | 169 +++- .../sonic/internal/arch/common/quote_tables.h | 106 ++- .../sonic/internal/arch/common/skip_common.h | 31 +- .../internal/arch/common/unicode_common.h | 23 + .../arch/common/x86_common/quote.inc.h | 54 +- .../arch/common/x86_common/skip.inc.h | 2 + include/sonic/internal/arch/neon/quote.h | 29 +- include/sonic/internal/arch/neon/unicode.h | 19 +- include/sonic/internal/arch/simd_skip.h | 892 +++++++++++++++++- 
include/sonic/internal/arch/sse/unicode.h | 12 +- include/sonic/internal/arch/sve2-128/quote.h | 32 +- .../sonic/internal/arch/sve2-128/unicode.h | 20 +- .../sonic/internal/arch/x86_ifuncs/quote.h | 102 +- include/sonic/internal/ftoa.h | 72 +- include/sonic/jsonpath/dom.h | 90 ++ include/sonic/jsonpath/dump.h | 61 ++ include/sonic/jsonpath/jsonpath.h | 411 ++++++++ include/sonic/jsonpath/ondemand.h | 196 ++++ include/sonic/sonic.h | 2 + tests/allocator_test.cpp | 46 + tests/document_test.cpp | 34 +- tests/exp_update_test.cpp | 34 +- tests/ftoa_test.cpp | 2 +- tests/json_tuple_test.cpp | 202 ++++ tests/jsonpath_test.cpp | 791 ++++++++++++++++ tests/node_test.cpp | 66 ++ tests/parsenumber_test.cpp | 127 ++- tests/quote_test.cpp | 3 +- 47 files changed, 4267 insertions(+), 268 deletions(-) create mode 100644 include/sonic/jsonpath/dom.h create mode 100644 include/sonic/jsonpath/dump.h create mode 100644 include/sonic/jsonpath/jsonpath.h create mode 100644 include/sonic/jsonpath/ondemand.h create mode 100644 tests/json_tuple_test.cpp create mode 100644 tests/jsonpath_test.cpp diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 65dc7464..c004f856 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,4 +1,9 @@ name: clang-format Check + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + on: [push, pull_request] jobs: formatting-check: diff --git a/.github/workflows/test_arm.yml b/.github/workflows/test_arm.yml index 1df03614..b8640cbf 100644 --- a/.github/workflows/test_arm.yml +++ b/.github/workflows/test_arm.yml @@ -1,5 +1,9 @@ name: Test ARM +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + on: push: branches: [ master ] @@ -34,6 +38,6 @@ jobs: - name: Build and test SVE2 if: steps.sve_check.outputs.supported == 'true' run: | - cmake -S . 
-B build-sve -G Ninja -DENABLE_SVE2_128=ON -DENABLE_ASAN=OFF + cmake -S . -B build-sve -G Ninja -DENABLE_SVE2_128=ON cmake --build build-sve - ./build-sve/tests/unittest + ASAN_OPTIONS=detect_stack_use_after_return=0 ./build-sve/tests/unittest diff --git a/.github/workflows/test_coverage.yml b/.github/workflows/test_coverage.yml index ef391515..5971a39a 100644 --- a/.github/workflows/test_coverage.yml +++ b/.github/workflows/test_coverage.yml @@ -1,6 +1,10 @@ # yaml-language-server: $schema=https://json-schema.org/draft-07/schema# name: Test Coverage +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + on: push: pull_request: diff --git a/.github/workflows/test_x86.yml b/.github/workflows/test_x86.yml index 66835811..1d349925 100644 --- a/.github/workflows/test_x86.yml +++ b/.github/workflows/test_x86.yml @@ -1,4 +1,8 @@ name: Test +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + on: [push, pull_request] jobs: diff --git a/benchmark/main.cpp b/benchmark/main.cpp index 70761063..70159963 100644 --- a/benchmark/main.cpp +++ b/benchmark/main.cpp @@ -16,11 +16,14 @@ #include +#include #include #include #include #include #include +#include +#include #include "cjson.hpp" #include "jsoncpp.hpp" @@ -30,14 +33,50 @@ #include "sonic.hpp" #include "yyjson.hpp" -static std::string get_json(const std::string_view file) { - std::ifstream ifs; +static std::string get_json(const std::filesystem::path &file) { + std::ifstream ifs(file, std::ios::in | std::ios::binary); + if (!ifs.is_open()) return {}; + std::stringstream ss; - ifs.open(file.data()); ss << ifs.rdbuf(); return ss.str(); } +static void add_testdata_candidates(std::vector &out, + const std::filesystem::path &root) { + if (root.empty()) return; + + out.push_back(root / "testdata"); + out.push_back(root / "_main" / "testdata"); + + std::error_code ec; + if (!std::filesystem::is_directory(root, 
ec)) return; + for (const auto &e : std::filesystem::directory_iterator(root, ec)) { + if (e.is_directory(ec)) out.push_back(e.path() / "testdata"); + } +} + +static std::filesystem::path find_testdata_dir(const char *argv0) { + std::vector candidates; + candidates.emplace_back("testdata"); + + if (const char *p = std::getenv("RUNFILES_DIR")) + add_testdata_candidates(candidates, std::filesystem::path(p)); + if (const char *p = std::getenv("TEST_SRCDIR")) + add_testdata_candidates(candidates, std::filesystem::path(p)); + + if (argv0 && *argv0) { + add_testdata_candidates( + candidates, std::filesystem::path(std::string(argv0) + ".runfiles")); + } + + std::error_code ec; + for (const auto &c : candidates) { + if (std::filesystem::is_directory(c, ec)) return c; + } + return {}; +} + template static void BM_Encode(benchmark::State &state, std::string_view filename, std::string_view data) { @@ -173,7 +212,7 @@ static void BM_Decode(benchmark::State &state, std::string filename, state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(data.size())); } -static void regitser_OnDemand() { +static void regitser_OnDemand(const std::filesystem::path &testdata_dir) { std::vector tests = { {"twitter", "Normal", {"search_metadata", "count"}, 100, true}, {"citm_catalog", @@ -185,7 +224,7 @@ static void regitser_OnDemand() { }; for (auto &t : tests) { - auto file_path = std::string("testdata/") + t.file + ".json"; + auto file_path = testdata_dir / (t.file + ".json"); t.json = get_json(file_path); #define REG_ONDEMAND(JSON) \ @@ -203,14 +242,24 @@ static void regitser_OnDemand() { int main(int argc, char **argv) { benchmark::Initialize(&argc, argv); + auto testdata_dir = find_testdata_dir(argv[0]); + if (testdata_dir.empty()) { + std::cerr << "Cannot locate 'testdata' directory. Try running with Bazel " + "runfiles." 
+ << std::endl; + return 1; + } + // Read the data from json files std::vector> jsons; - for (const auto &entry : std::filesystem::directory_iterator("testdata")) + std::error_code ec; + for (const auto &entry : + std::filesystem::directory_iterator(testdata_dir, ec)) { if (entry.path().extension() == ".json") - jsons.push_back( - std::make_pair(entry.path(), get_json(entry.path().string()))); + jsons.push_back(std::make_pair(entry.path(), get_json(entry.path()))); + } - regitser_OnDemand(); + regitser_OnDemand(testdata_dir); #define ADD_JSON_BMK(JSON, ACT) \ do { \ benchmark::RegisterBenchmark( \ diff --git a/benchmark/ondemand.hpp b/benchmark/ondemand.hpp index befff8c9..af5b8ce7 100644 --- a/benchmark/ondemand.hpp +++ b/benchmark/ondemand.hpp @@ -41,7 +41,8 @@ static void BM_SonicOnDemand(benchmark::State& state, const OnDemand& data) { sonic_json::GenericJsonPointer path(data.path); lite.ParseOnDemand(data.json.data(), data.json.size(), path); bool existed = !lite.HasParseError(); - bool ok = existed == data.existed && lite.GetUint64() == data.value; + bool ok = existed == data.existed && + (!existed || (lite.IsUint64() && lite.GetUint64() == data.value)); if (!ok) { state.SkipWithError("Verify failed"); return; diff --git a/include/sonic/dom/dynamicnode.h b/include/sonic/dom/dynamicnode.h index a5918e8a..3439513d 100644 --- a/include/sonic/dom/dynamicnode.h +++ b/include/sonic/dom/dynamicnode.h @@ -51,7 +51,7 @@ class DNode : public GenericNode> { friend BaseNode; template friend class DNode; - template + template friend SonicError internal::SerializeImpl(const NodeType*, WriteBuffer&); // constructor @@ -120,6 +120,24 @@ class DNode : public GenericNode> { } break; } + case kNumber: { + if (rhs.GetType() != kNumStr) { + std::memcpy(&(this->data), &rhs, sizeof(this->data)); + break; + } + [[fallthrough]]; + } + case kRaw: { + size_t len = rhs.Size(); + // Mark buffer as owned so destroy() will free it for kNeedFree alloc. 
+ this->sv.len = rhs.getTypeAndLen() | kOwnedStringMask; + this->sv.p = (char*)(alloc.Malloc(len + 1)); + sonic_assert(this->sv.p != nullptr); + std::memcpy(const_cast(this->sv.p), rhs.GetStringView().data(), + len); + const_cast(this->sv.p)[len] = '\0'; + break; + } default: std::memcpy(&(this->data), &rhs, sizeof(this->data)); break; @@ -146,8 +164,8 @@ class DNode : public GenericNode> { DNode& operator=(DNode&& rhs) { if (sonic_likely(this != &rhs)) { // Can't destroy "this" before assigning "rhs", otherwise "rhs" - // could be used after free if it's an sub-node of "this", - // hence the temporary danse. + // could be used after free if it's a sub-node of "this", + // hence the temporary dance. // Copied from RapidJSON. DNode temp; temp.rawAssign(rhs); @@ -241,7 +259,7 @@ class DNode : public GenericNode> { * @retval MemberEnd() not found * @retval others iterator for found member * @note If target name is a literal string, string_view can be optimized by - * compiler. This function will provide a better memcmp implemention than + * compiler. This function will provide a better memcmp implementation than * std::memcmp while length is not too large. */ sonic_force_inline MemberIterator FindMember(const char* key, @@ -257,7 +275,7 @@ class DNode : public GenericNode> { * @retval MemberEnd() not found * @retval others iterator for found member * @note If target name is a literal string, string_view can be optimized by - * compiler. This function will provide a better memcmp implemention than + * compiler. This function will provide a better memcmp implementation than * std::memcmp while length is not too large. 
*/ sonic_force_inline ConstMemberIterator FindMember(const char* key, @@ -293,8 +311,18 @@ class DNode : public GenericNode> { return true; } + bool atJsonPathImpl(const internal::JsonPath& path, size_t index, + std::vector& res) { + return atJsonPathImplCommon(this, path, index, res); + } + + bool atJsonPathImpl(const internal::JsonPath& path, size_t index, + std::vector& res) const { + return atJsonPathImplCommon(this, path, index, res); + } + /** - * @brief Destory the created map. This means that you don't want maintain the + * @brief Destroy the created map. This means that you don't want maintain the * map anymore. */ void DestroyMap() { @@ -432,10 +460,42 @@ class DNode : public GenericNode> { return *this; } - DNode& setRawImpl(const char* s, size_t len) { + DNode& setRawImpl(StringView s) { return setRawLikeImpl(s, kRaw); } + + DNode& setRawImpl(StringView s, Allocator& alloc) { + return setRawLikeImpl(s, kRaw, alloc); + } + + DNode& setStringNumberImpl(StringView s) { + return setRawLikeImpl(s, kNumStr); + } + + DNode& setStringNumberImpl(StringView s, Allocator& alloc) { + return setRawLikeImpl(s, kNumStr, alloc); + } + + DNode& setRawLikeImpl(StringView s, TypeFlag typ) { + this->destroy(); + this->raw.p = s.data(); + this->setLength(s.size(), typ); + return *this; + } + + DNode& setRawLikeImpl(StringView s, TypeFlag typ, Allocator& alloc) { this->destroy(); - this->raw.p = s; - this->setLength(len, kRaw); + size_t len = s.size(); + char* p = static_cast(alloc.Malloc(len + 1)); + if (p) { + std::memcpy(p, s.data(), len); + p[len] = '\0'; + this->raw.p = p; + // Mark buffer as owned so destroy() will free it for kNeedFree alloc. + this->setLength(len, static_cast(static_cast(typ) | + kOwnedStringMask)); + } else { + this->raw.p = ""; + this->setLength(0, typ); + } return *this; } @@ -477,6 +537,69 @@ class DNode : public GenericNode> { return children() != nullptr ? 
meta()->cap : 0; } + template + static bool atJsonPathImplCommon(SelfPtr self, const internal::JsonPath& path, + size_t index, std::vector& res) { + static_assert(std::is_pointer::value, + "ResPtr must be a pointer type"); + if (index >= path.size()) { + res.push_back(reinterpret_cast(self)); + return true; + } + + if (path[index].is_wildcard()) { + // select nothing from the primitive JSON value + if (!self->IsObject() && !self->IsArray()) { + return true; + } + using CurPtr = std::conditional_t< + std::is_const>::value, const DNode*, + DNode*>; + CurPtr n = reinterpret_cast(self->getChildrenFirstUnsafe()) + + (self->IsObject() ? 1 : 0); + size_t step = self->IsObject() ? 2 : 1; + for (size_t i = 0; i < self->Size(); ++i) { + CurPtr cur = (n + i * step); + atJsonPathImplCommon(cur, path, index + 1, res); + } + return true; + } + + if (path[index].is_key()) { + if (!self->IsObject()) { + return false; + } + auto m = self->FindMember(path[index].key()); + if (m != self->MemberEnd()) { + auto* child = + reinterpret_cast*>(&m->value); + return atJsonPathImplCommon(child, path, index + 1, res); + } + return false; + } + + if (path[index].is_index()) { + if (!self->IsArray()) { + return false; + } + + // index maybe negative + int64_t idx = path[index].index(); + if (idx < 0) { + idx = self->Size() + idx; + } + + if (idx >= int64_t(self->Size()) || idx < 0) { + return false; + } + auto& child_ref = self->findValueImpl(size_t(idx)); + auto* child = + reinterpret_cast*>(&child_ref); + return atJsonPathImplCommon(child, path, index + 1, res); + } + return false; + } + DNode& memberReserveImpl(size_t new_cap, Allocator& alloc) { if (new_cap > this->Capacity()) { void* old_ptr = children(); @@ -565,6 +688,11 @@ class DNode : public GenericNode> { sizeof(MetaNode) / sizeof(char)); } + sonic_force_inline DNode* getChildrenFirstUnsafe() const { + return (DNode*)((char*)this->a.next.children + + sizeof(MetaNode) / sizeof(char)); + } + sonic_force_inline DNode* 
getObjChildrenFirst() const { sonic_assert(this->IsObject()); if (nullptr == children()) { @@ -787,7 +915,7 @@ class DNode : public GenericNode> { DNode& pushBackImpl(DNode& value, Allocator& alloc) { constexpr size_t k_default_array_cap = 16; sonic_assert(this->IsArray()); - // reseve capacity + // reserve capacity size_t cap = this->Capacity(); if (this->Size() >= cap) { size_t new_cap = cap ? cap + (cap + 1) / 2 : k_default_array_cap; @@ -817,7 +945,7 @@ class DNode : public GenericNode> { return start; } - template + template SonicError serializeImpl(WriteBuffer& wb) const { return internal::SerializeImpl(this, wb); } @@ -831,6 +959,19 @@ class DNode : public GenericNode> { if (!Allocator::kNeedFree) { return; } + + // Free owned string buffers for Raw / NumStr. + // Note: We use the extra ownership bit in the 8-bit type info + // (kOwnedStringMask) while keeping GetType() stable (it masks by + // kSubTypeMask). + const uint8_t info = static_cast(this->sv.len & kInfoMask); + if ((info & kOwnedStringMask) != 0) { + if (this->getBasicType() == kRaw || this->GetType() == kNumStr) { + Allocator::Free((void*)this->sv.p); + return; + } + } + switch (this->GetType()) { case kObject: { if (children()) { diff --git a/include/sonic/dom/flags.h b/include/sonic/dom/flags.h index 070bba60..b9279343 100644 --- a/include/sonic/dom/flags.h +++ b/include/sonic/dom/flags.h @@ -16,14 +16,92 @@ #pragma once +#include + // ParseFlag is one-hot encoded for different parsing option. -// User can define customed flags through combinations. -enum ParseFlag { +// User can define customized flags through combinations. +enum class ParseFlags : uint32_t { kParseDefault = 0, + kParseAllowUnescapedControlChars = 1 << 1, + // parse all integer as raw number + kParseIntegerAsRaw = 1 << 2, + // Parse numbers as number strings (NumStr) when needed. 
+ // When enabled, floating-point numbers are stored as NumStr; integers are + // still stored as int64/uint64 when in range, otherwise stored as NumStr. + kParseOverflowNumAsNumStr = 1 << 3, }; +// Compatibility layer for downstream users. +// Note: ParseFlag was the previous unscoped enum type. +using ParseFlag [[deprecated("Use ParseFlags instead")]] = ParseFlags; +[[deprecated("Use ParseFlags::kParseDefault instead")]] constexpr ParseFlags + kParseDefault = ParseFlags::kParseDefault; +[[deprecated( + "Use ParseFlags::kParseAllowUnescapedControlChars " + "instead")]] constexpr ParseFlags kParseAllowUnescapedControlChars = + ParseFlags::kParseAllowUnescapedControlChars; +[[deprecated( + "Use ParseFlags::kParseIntegerAsRaw instead")]] constexpr ParseFlags + kParseIntegerAsRaw = ParseFlags::kParseIntegerAsRaw; +[[deprecated( + "Use ParseFlags::kParseOverflowNumAsNumStr instead")]] constexpr ParseFlags + kParseOverflowNumAsNumStr = ParseFlags::kParseOverflowNumAsNumStr; + +constexpr ParseFlags operator|(ParseFlags lhs, ParseFlags rhs) { + return static_cast(static_cast(lhs) | + static_cast(rhs)); +} + +constexpr bool operator&(ParseFlags lhs, ParseFlags rhs) { + return (static_cast(lhs) & static_cast(rhs)) != 0; +} + // SerializeFlags is one-hot encoded for different serializing option. -// User can define customed flags through combinations. -enum SerializeFlags { +// User can define customized flags through combinations. +enum class SerializeFlags : uint32_t { kSerializeDefault = 0, + kSerializeAppendBuffer = 1 << 1, + kSerializeEscapeEmoji = 1 << 2, + kSerializeInfNan = 1 << 3, + kSerializeUnicodeEscapeUppercase = 1 << 4, + kSerializeFloatFormatJava = 1 << 5, }; + +// Compatibility layer for downstream users. +// Note: SerializeFlag was the previous unscoped enum type. 
+using SerializeFlag [[deprecated("Use SerializeFlags instead")]] = + SerializeFlags; +[[deprecated( + "Use SerializeFlags::kSerializeDefault instead")]] constexpr SerializeFlags + kSerializeDefault = SerializeFlags::kSerializeDefault; +[[deprecated( + "Use SerializeFlags::kSerializeAppendBuffer " + "instead")]] constexpr SerializeFlags kSerializeAppendBuffer = + SerializeFlags::kSerializeAppendBuffer; +[[deprecated( + "Use SerializeFlags::kSerializeEscapeEmoji " + "instead")]] constexpr SerializeFlags kSerializeEscapeEmoji = + SerializeFlags::kSerializeEscapeEmoji; +[[deprecated( + "Use SerializeFlags::kSerializeInfNan instead")]] constexpr SerializeFlags + kSerializeInfNan = SerializeFlags::kSerializeInfNan; +[[deprecated( + "Use SerializeFlags::kSerializeUnicodeEscapeUppercase " + "instead")]] constexpr SerializeFlags kSerializeUnicodeEscapeUppercase = + SerializeFlags::kSerializeUnicodeEscapeUppercase; +[[deprecated( + "Use SerializeFlags::kSerializeFloatFormatJava " + "instead")]] constexpr SerializeFlags kSerializeFloatFormatJava = + SerializeFlags::kSerializeFloatFormatJava; + +constexpr SerializeFlags operator|(SerializeFlags lhs, SerializeFlags rhs) { + return static_cast(static_cast(lhs) | + static_cast(rhs)); +} +constexpr bool operator&(SerializeFlags lhs, SerializeFlags rhs) { + return (static_cast(lhs) & static_cast(rhs)) != 0; +} + +constexpr static auto kSerializeJavaStyleFlag = + SerializeFlags::kSerializeFloatFormatJava | + SerializeFlags::kSerializeUnicodeEscapeUppercase; diff --git a/include/sonic/dom/generic_document.h b/include/sonic/dom/generic_document.h index 2f8ed7f3..2794344c 100644 --- a/include/sonic/dom/generic_document.h +++ b/include/sonic/dom/generic_document.h @@ -16,11 +16,16 @@ #pragma once +#include + #include "sonic/dom/dynamicnode.h" #include "sonic/dom/json_pointer.h" #include "sonic/dom/parser.h" namespace sonic_json { + +template +class Parser; template class GenericDocument : public NodeType { public: @@ -57,7 +62,7 @@ 
class GenericDocument : public NodeType { } /** - * @brief Move assignement + * @brief Move assignment */ GenericDocument& operator=(GenericDocument&& rhs) { // Step1: clear self memory @@ -117,23 +122,23 @@ class GenericDocument : public NodeType { * @note If using memorypool allocator, memory will be cleared every time * before parsing to avoid memory overuse. */ - template + template GenericDocument& Parse(StringView json) { return Parse(json.data(), json.size()); } - template + template GenericDocument& Parse(const char* data, size_t len) { destroyDom(); return parseImpl(data, len); } - template + template GenericDocument& ParseSchema(StringView json) { return ParseSchema(json.data(), json.size()); } - template + template GenericDocument& ParseSchema(const char* data, size_t len) { return parseSchemaImpl(data, len); } @@ -147,14 +152,14 @@ class GenericDocument : public NodeType { * @note If using memorypool allocator, memory will be cleared every time * before parsing to avoid memory overuse. 
*/ - template GenericDocument& ParseOnDemand(StringView json, const GenericJsonPointer& path) { return ParseOnDemand(json.data(), json.size(), path); } - template GenericDocument& ParseOnDemand(const char* data, size_t len, const GenericJsonPointer& path) { @@ -204,9 +209,9 @@ class GenericDocument : public NodeType { this->setType(kNull); } - template + template GenericDocument& parseImpl(const char* json, size_t len) { - Parser p; + Parser p; SAXHandler sax(*alloc_); parse_result_ = allocateStringBuffer(json, len); if (sonic_unlikely(HasParseError())) { @@ -216,7 +221,7 @@ class GenericDocument : public NodeType { parse_result_ = kErrorNoMem; return *this; } - parse_result_ = p.template Parse(str_, len, sax); + parse_result_ = p.Parse(str_, len, sax); if (sonic_unlikely(HasParseError())) { return *this; } @@ -224,9 +229,9 @@ class GenericDocument : public NodeType { return *this; } - template + template GenericDocument& parseSchemaImpl(const char* json, size_t len) { - Parser p; + Parser p; SchemaHandler sax(this, *alloc_); parse_result_ = allocateSchemaStringBuffer(json, len); if (sonic_unlikely(HasParseError())) { @@ -236,11 +241,11 @@ class GenericDocument : public NodeType { parse_result_ = kErrorNoMem; return *this; } - parse_result_ = p.template Parse(schema_str_, len, sax); + parse_result_ = p.Parse(schema_str_, len, sax); return *this; } - template + template GenericDocument& parseOnDemandImpl( const char* json, size_t len, const GenericJsonPointer& path) { @@ -281,7 +286,7 @@ class GenericDocument : public NodeType { schema_str_[len + 2] = 'x'; return kErrorNone; } - + template friend class Parser; // Note: it is a callback function in parse.parse_impl diff --git a/include/sonic/dom/genericnode.h b/include/sonic/dom/genericnode.h index 96616e07..33a0570b 100644 --- a/include/sonic/dom/genericnode.h +++ b/include/sonic/dom/genericnode.h @@ -19,6 +19,7 @@ #include #include #include +#include #include "sonic/dom/handler.h" #include 
"sonic/dom/json_pointer.h" @@ -26,6 +27,7 @@ #include "sonic/dom/serialize.h" #include "sonic/dom/type.h" #include "sonic/error.h" +#include "sonic/jsonpath/jsonpath.h" #include "sonic/string_view.h" #include "sonic/writebuffer.h" @@ -42,6 +44,12 @@ class MemberNodeT { template struct NodeTraits; +template +struct JsonPathResult { + std::vector nodes; + SonicError error; +}; + /** * @brief Basic class represents a JSON value. * @tparam NodeType: the Derived class. @@ -275,6 +283,10 @@ class GenericNode { sonic_force_inline bool IsDouble() const noexcept { return GetType() == kReal; } + + sonic_force_inline bool IsStringNumber() const noexcept { + return GetType() == kNumStr; + } /** * @brief Check this node is in the range of int64. * @return true if it is int64. @@ -321,7 +333,7 @@ class GenericNode { * @return std::string */ sonic_force_inline std::string GetString() const { - sonic_assert(IsString()); + sonic_assert(IsString() || IsStringNumber()); return std::string(sv.p, Size()); } @@ -330,7 +342,12 @@ class GenericNode { * @return StringView */ sonic_force_inline StringView GetStringView() const noexcept { - sonic_assert(IsString()); + sonic_assert(IsString() || IsStringNumber() || IsRaw()); + return StringView(sv.p, Size()); + } + + sonic_force_inline StringView GetStringNumber() const noexcept { + sonic_assert(IsStringNumber()); return StringView(sv.p, Size()); } @@ -372,7 +389,7 @@ class GenericNode { * @return double */ sonic_force_inline double GetDouble() const noexcept { - sonic_assert(IsNumber()); + sonic_assert(IsNumber() && !IsStringNumber()); if (IsDouble()) return n.f64; if (IsUint64()) return static_cast( @@ -433,6 +450,13 @@ class GenericNode { return downCast()->setDoubleImpl(d); } + NodeType& SetStringNumber(StringView s) { + return downCast()->setStringNumberImpl(s); + } + + NodeType& SetStringNumber(StringView s, alloc_type& alloc) { + return downCast()->setStringNumberImpl(s, alloc); + } /** * @brief Set this node as a copied string through 
the allocator alloc. * allocator. @@ -553,7 +577,8 @@ class GenericNode { * @return size_t */ size_t Size() const noexcept { - sonic_assert(this->IsContainer() || this->IsString() || this->IsRaw()); + sonic_assert(this->IsContainer() || this->IsString() || this->IsRaw() || + this->IsStringNumber()); return sv.len >> kInfoBits; } @@ -698,6 +723,49 @@ class GenericNode { return atPointerImpl(pointer); } + /** + * @brief get specific nodes by json path + * @param path json pointer + * @retval nullptr get node failed + * @retval others success + */ + JsonPathResult AtJsonPath(const StringView jsonpath) { + return AtJsonPathCommon(downCast(), jsonpath); + } + + JsonPathResult AtJsonPath(const StringView jsonpath) const { + return AtJsonPathCommon(downCast(), jsonpath); + } + + private: + template + static sonic_force_inline JsonPathResult> + AtJsonPathCommon(DerivedPtr self, const StringView jsonpath) { + using ResultNodeType = std::remove_pointer_t; + JsonPathResult ret = {}; + ret.error = kErrorNone; + internal::JsonPath path; + + // padding some buffers + std::string pathpadd = internal::paddingJsonPath(jsonpath); + // Only parse the logical jsonpath length; the extra '\0' bytes are for + // safe lookahead during unescaping. + if (!path.ParsePadded(StringView(pathpadd.data(), pathpadd.size()), + jsonpath.size())) { + ret.error = kUnsupportedJsonPath; + return ret; + } + + if (path[0].is_root() && path.size() == 1) { + ret.nodes.push_back(self); + } else if (!self->atJsonPathImpl(path, 1, ret.nodes)) { + ret.error = kNotFoundByJsonPath; + ret.nodes.clear(); + } + return ret; + } + + public: /** * @brief get specific node by json pointer. This is implemented by variable * argument. @@ -760,7 +828,7 @@ class GenericNode { /** * @brief get specific node by json pointer(RFC 6901) - * @tparam StringType json pointer string type, can use StringView to aovoid + * @tparam StringType json pointer string type, can use StringView to avoid * copying string. 
* @param pointer json pointer * @retval nullptr get node failed @@ -997,7 +1065,7 @@ class GenericNode { * @param wb write buffer where you want to store json string. * @return EndcodeError */ - template + template SonicError Serialize(WriteBuffer& wb) const { return downCast()->template serializeImpl(wb); } @@ -1007,7 +1075,7 @@ class GenericNode { * @param serializeFlags combination of different SerializeFlag. * @return empty string if there are errors when serializing. */ - template + template std::string Dump() const { WriteBuffer wb; SonicError err = Serialize(wb); @@ -1046,8 +1114,9 @@ class GenericNode { setEmptyString(); } } - NodeType& setRaw(StringView s) { - return downCast()->setRawImpl(s.data(), s.size()); + NodeType& setRaw(StringView s) { return downCast()->setRawImpl(s); } + NodeType& setRaw(StringView s, alloc_type& alloc) { + return downCast()->setRawImpl(s, alloc); } void setEmptyString() noexcept { sv.p = ""; diff --git a/include/sonic/dom/handler.h b/include/sonic/dom/handler.h index fcd47509..c7475d37 100644 --- a/include/sonic/dom/handler.h +++ b/include/sonic/dom/handler.h @@ -130,8 +130,25 @@ class SAXHandler { sonic_force_inline bool String(StringView s) { return stringImpl(s); } + sonic_force_inline bool NumStr(StringView s) { + SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); + st_[np_ - 1].setLength(s.size(), kNumStr); + st_[np_ - 1].sv.p = s.data(); + return true; + } + + sonic_force_inline bool Raw(const char *data, size_t len) { + SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); + auto raw = StringView(data, len); + st_[np_ - 1].setRaw(raw); + return true; + } + sonic_force_inline bool StartObject() noexcept { SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); NodeType *cur = &st_[np_ - 1]; cur->o.next.ofs = parent_; parent_ = np_ - 1; @@ -140,6 +157,7 @@ class SAXHandler { sonic_force_inline bool StartArray() noexcept { SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); NodeType *cur = &st_[np_ - 1]; cur->o.next.ofs = 
parent_; parent_ = np_ - 1; @@ -184,6 +202,7 @@ class SAXHandler { sonic_force_inline bool stringImpl(StringView s) { SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); st_[np_ - 1].setLength(s.size(), kStringCopy); st_[np_ - 1].sv.p = s.data(); return true; diff --git a/include/sonic/dom/parser.h b/include/sonic/dom/parser.h index dfbc3e56..aab1e304 100644 --- a/include/sonic/dom/parser.h +++ b/include/sonic/dom/parser.h @@ -54,6 +54,7 @@ ParseResult GetOnDemand(StringView json, return ParseResult(kErrorNone, pos); } +template class Parser { public: explicit Parser() noexcept = default; @@ -64,12 +65,12 @@ class Parser { sonic_force_inline Parser &operator=(Parser &&other) noexcept = default; ~Parser() noexcept = default; - template + template sonic_force_inline ParseResult Parse(char *data, size_t len, SAX &sax) { reset(); json_buf_ = reinterpret_cast(data); len_ = len; - parseImpl(sax); + parseImpl(sax); if (!err_ && hasTrailingChars()) { err_ = kParseErrorInvalidChar; } @@ -132,7 +133,7 @@ class Parser { sonic_force_inline StringView parseStringHelper() { uint8_t *src = json_buf_ + pos_; uint8_t *sdst = src; - size_t n = internal::parseStringInplace(src, err_); + size_t n = internal::parseStringInplace(src, err_); pos_ = src - json_buf_; return StringView(reinterpret_cast(sdst), n); } @@ -167,12 +168,12 @@ class Parser { sonic_force_inline bool parseFloatingFast(double &d, int exp10, uint64_t man) const { d = (double)man; - // if man is small, but exp is large, also can parse excactly + // if man is small, but exp is large, also can parse exactly if (exp10 > 0) { if (exp10 > 22) { d *= internal::kPow10Tab[exp10 - 22]; if (d > 1e15 || d < -1e15) { - // the exponent is tooo large + // the exponent is too large return false; } d *= internal::kPow10Tab[22]; @@ -218,6 +219,21 @@ class Parser { template sonic_force_inline bool parseNumber(SAX &sax) { + // check flags + if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { + return 
parseNumberAsString(sax); + } + +// These helper macros are used only within this function. +// Define/undefine them locally to avoid leaking into includers. +#undef FLOATING_LONGEST_DIGITS +#undef RETURN_SET_ERROR_CODE +#undef CHECK_DIGIT +#undef SET_INT_AND_RETURN +#undef SET_UINT_AND_RETURN +#undef SET_DOUBLE_AND_RETURN +#undef SET_U64_AS_DOUBLE_AND_RETURN + #define FLOATING_LONGEST_DIGITS 17 #define RETURN_SET_ERROR_CODE(error_code) \ @@ -266,10 +282,11 @@ class Parser { static constexpr uint64_t kUint64Max = 0xFFFFFFFFFFFFFFFF; int sgn = -1; int man_nd = 0; // # digits of mantissa, 10 ^ 19 fits uint64_t - int exp10 = 0; // 10-based exponet of float point number + int exp10 = 0; // 10-based exponent of float point number int trunc = 0; uint64_t man = 0; // mantissa of float point number size_t i = pos_ - 1; + size_t start_idx = pos_ - 1; size_t exp10_s = i; const char *s = reinterpret_cast(json_buf_); using internal::is_digit; @@ -311,6 +328,12 @@ class Parser { SET_DOUBLE_AND_RETURN(0.0 * sgn); } + // Zero Integer + if constexpr (parseFlags & ParseFlags::kParseIntegerAsRaw) { + if (!sax.Raw(s + start_idx, i - start_idx)) + RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); + RETURN_SET_ERROR_CODE(kErrorNone); + } SET_UINT_AND_RETURN(0); } @@ -347,6 +370,12 @@ class Parser { if (sonic_unlikely(s[i] == 'e' || s[i] == 'E')) goto double_exp; // Integer + if constexpr (parseFlags & ParseFlags::kParseIntegerAsRaw) { + if (!sax.Raw(s + start_idx, i - start_idx)) + RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); + RETURN_SET_ERROR_CODE(kErrorNone); + } + if (exp10 == 0) { // less than or equal to 19 digits if (sgn == -1) { @@ -380,8 +409,8 @@ class Parser { goto double_fast; } - // Is error when run here - // TODO: Assert + // Should be unreachable. + // TODO: add assertion. 
double_fract : { int fract_len = FLOATING_LONGEST_DIGITS - man_nd; @@ -475,9 +504,183 @@ class Parser { RETURN_SET_ERROR_CODE(error_code); } +#undef SET_U64_AS_DOUBLE_AND_RETURN +#undef SET_DOUBLE_AND_RETURN +#undef SET_UINT_AND_RETURN +#undef SET_INT_AND_RETURN +#undef CHECK_DIGIT +#undef RETURN_SET_ERROR_CODE +#undef FLOATING_LONGEST_DIGITS + return true; + } + template + sonic_force_inline bool parseNumberAsString(SAX &sax) { +// These helper macros are used only within this function. +// Define/undefine them locally to avoid hidden coupling with parseNumber(). +#undef RETURN_SET_ERROR_CODE #undef CHECK_DIGIT +#undef SET_INT_AND_RETURN +#undef SET_UINT_AND_RETURN +#undef SET_DOUBLE_AND_RETURN + +#define RETURN_SET_ERROR_CODE(error_code) \ + do { \ + pos_ = i; \ + err_ = error_code; \ + return true; \ + } while (0) + +#define CHECK_DIGIT() \ + do { \ + if (sonic_unlikely(s[i] < '0' || s[i] > '9')) { \ + RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); \ + } \ + } while (0) + +#define SET_INT_AND_RETURN(int_val) \ + do { \ + if (!sax.Int(int_val)) RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); \ + RETURN_SET_ERROR_CODE(kErrorNone); \ + } while (0) + +#define SET_UINT_AND_RETURN(int_val) \ + do { \ + if (!sax.Uint(int_val)) RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); \ + RETURN_SET_ERROR_CODE(kErrorNone); \ + } while (0) + +#define SET_DOUBLE_AND_RETURN(dbl) \ + do { \ + if (!sax.Double(dbl)) RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); \ + RETURN_SET_ERROR_CODE(kErrorNone); \ + } while (0) + + size_t i = pos_ - 1; + size_t start = i; + uint64_t man = 0; + int man_nd = 0; + const char *s = reinterpret_cast(json_buf_); + size_t digit_start = 0; + using internal::is_digit; + static constexpr uint64_t kUint64Max = 0xFFFFFFFFFFFFFFFF; + + bool neg = (s[i] == '-'); + i += uint8_t(neg); + int sgn = neg ? 
-1 : 1; + if (s[i] == '0') { + i++; + if (sonic_likely(s[i] == '.')) { // floating number, parse as string + i++; + CHECK_DIGIT(); + + while (s[i] == '0') { + i++; + } + if (sonic_unlikely(s[i] == 'e' || s[i] == 'E')) { + i++; + if (s[i] == '-' || s[i] == '+') i++; + CHECK_DIGIT(); + while (is_digit(s[i])) { + i++; + } + SET_DOUBLE_AND_RETURN(0.0 * sgn); + } + goto double_string_fract; + // parse floating number as json string value + // if (!sax.String(StringView(reinterpret_cast(s + start), i - + // start))) { + // RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); + // } + // RETURN_SET_ERROR_CODE(kErrorNone); + } else if (sonic_unlikely(s[i] == 'e' || s[i] == 'E')) { + i++; + if (s[i] == '-' || s[i] == '+') i++; + CHECK_DIGIT(); + while (is_digit(s[i])) i++; + // parse as +/- 0.0 + SET_DOUBLE_AND_RETURN(0.0 * sgn); + } + SET_UINT_AND_RETURN(0); + } + + digit_start = i; + man = str2int(s, i); + man_nd = i - digit_start; + + if (man_nd == 0) { + RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); + } + + // if man_nd > 19, the number should store as string, no need to + // calculate correct man + + if (sonic_likely(s[i] == '.')) { + i++; + goto double_string_fract; + } + if (sonic_unlikely(s[i] == 'e' || s[i] == 'E')) goto double_string_exp; + + // Integer + if (man_nd <= 19) { + if (neg) { + if (man > ((uint64_t)1 << 63)) { + goto double_string_fast; + } else { + SET_INT_AND_RETURN(-man); + } + } else { + SET_UINT_AND_RETURN(man); + } + } else if (man_nd == 20) { + // now we get 20 digits, it maybe overflow for uint64 + man = 0; + for (int ii = 0; ii < 19; ++ii) { + man = man * 10 + (s[ii + digit_start] - '0'); + } + unsigned num = s[i - 1] - '0'; + if (man < kUint64Max / 10 || + (man == kUint64Max / 10 && num <= UINT_MAX % 10)) { + man = man * 10 + num; + if (sgn == -1) { + goto double_string_fast; + } else { + SET_UINT_AND_RETURN(man); + } + } else { + goto double_string_fast; + } + } else { + goto double_string_fast; + } + + double_string_fract: + while 
(is_digit(s[i])) i++; + if (sonic_likely(s[i] != 'e' && s[i] != 'E')) { + goto double_string_fast; + } + double_string_exp: + i++; + if (s[i] == '-' || s[i] == '+') { + i++; + } + CHECK_DIGIT(); + + while (is_digit(s[i])) i++; + + double_string_fast: + // parse floating number as json string value + if (!sax.NumStr(StringView(const_cast(s + start), i - start))) { + RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); + } + RETURN_SET_ERROR_CODE(kErrorNone); + +#undef SET_DOUBLE_AND_RETURN +#undef SET_UINT_AND_RETURN +#undef SET_INT_AND_RETURN +#undef CHECK_DIGIT +#undef RETURN_SET_ERROR_CODE } template @@ -525,7 +728,7 @@ class Parser { struct CheckKeyReturn : std::true_type {}; - template + template sonic_force_inline void parseImpl(SAX &sax) { #define sonic_check_err() \ do { \ @@ -801,7 +1004,7 @@ class Parser { if (sonic_unlikely(c != '"')) { goto err_invalid_char; } - // parse string in allocater if has esacped chars + // parse string in allocator if has escaped chars src = data + pos; sdst = src; skips = internal::SkipString(data, pos, len); @@ -815,7 +1018,7 @@ class Parser { uint8_t *dst = (uint8_t *)alloc.Malloc(sn + 32); sdst = dst; std::memcpy(dst, src, sn); - sn = internal::parseStringInplace(dst, err); + sn = internal::parseStringInplace(dst, err); if (err) { // update the error positions pos = (src - data) + (dst - sdst); diff --git a/include/sonic/dom/schema_handler.h b/include/sonic/dom/schema_handler.h index 0bf3f507..035a1ecc 100644 --- a/include/sonic/dom/schema_handler.h +++ b/include/sonic/dom/schema_handler.h @@ -163,6 +163,17 @@ class SchemaHandler { return true; } + sonic_force_inline bool Raw(const char *data, size_t len) { + if (cur_node_) { + cur_node_->setRaw(StringView(data, len)); + return true; + } + SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); + st_[np_ - 1].setRaw(StringView(data, len)); + return true; + } + sonic_force_inline bool Key(StringView s) { if (parent_node_ && parent_node_->IsObject()) { if (found_node_count_ >= 
parent_node_->Size()) { @@ -210,6 +221,7 @@ class SchemaHandler { return true; } SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); NodeType *cur = &st_[np_ - 1]; cur->o.next.ofs = parent_; parent_ = np_ - 1; @@ -223,12 +235,21 @@ class SchemaHandler { cur_node_ = nullptr; } SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); NodeType *cur = &st_[np_ - 1]; cur->o.next.ofs = parent_; parent_ = np_ - 1; return true; } + sonic_force_inline bool NumStr(StringView s) { + SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); + st_[np_ - 1].setLength(s.size(), kNumStr); + st_[np_ - 1].sv.p = s.data(); + return true; + } + sonic_force_inline bool EndObject(uint32_t pairs) { if (parent_node_ && parent_node_->IsObject()) { parent_node_ = parent_st_.back(); @@ -245,7 +266,7 @@ class SchemaHandler { obj_ptr = parent_st_.back(); obj_member_ptr = &st_[0]; parent_st_.pop_back(); - // resotre parent node ptr + // restore parent node ptr parent_node_ = parent_st_.back(); parent_st_.pop_back(); cur_node_ = nullptr; @@ -306,6 +327,7 @@ class SchemaHandler { sonic_force_inline bool stringImpl(StringView s) { SONIC_ADD_NODE(); + new (&st_[np_ - 1]) NodeType(); st_[np_ - 1].setLength(s.size(), kStringCopy); st_[np_ - 1].sv.p = s.data(); return true; diff --git a/include/sonic/dom/serialize.h b/include/sonic/dom/serialize.h index dfbe3768..d89327db 100644 --- a/include/sonic/dom/serialize.h +++ b/include/sonic/dom/serialize.h @@ -17,6 +17,9 @@ #pragma once +#include +#include + #include "sonic/dom/flags.h" #include "sonic/dom/type.h" #include "sonic/error.h" @@ -29,7 +32,7 @@ namespace sonic_json { namespace internal { -template +template sonic_force_inline SonicError SerializeImpl(const NodeType* node, WriteBuffer& wb) { struct ParentCtx { @@ -53,9 +56,13 @@ sonic_force_inline SonicError SerializeImpl(const NodeType* node, ssize_t rn = 0; internal::Stack stk; ParentCtx* parent; - - wb.Clear(); - wb.Reserve(estimate); + if constexpr ((serializeFlags & 
SerializeFlags::kSerializeAppendBuffer) == + 0) { + wb.Clear(); + wb.Reserve(estimate); + } else { + wb.Reserve(estimate + wb.Size()); + } bool is_single = (!node->IsContainer()) || node->Empty(); if (sonic_unlikely(is_single)) { @@ -75,7 +82,8 @@ sonic_force_inline SonicError SerializeImpl(const NodeType* node, inc_len = str_len * 6 + 32 + 3; wb.Grow(inc_len); str_ptr = node->GetStringView().data(); - rn = internal::Quote(str_ptr, str_len, wb.End()) - wb.End(); + rn = internal::Quote(str_ptr, str_len, wb.End()) - + wb.End(); wb.PushSizeUnsafe(rn); wb.PushUnsafe(is_key ? ':' : ','); member_cnt -= is_key; @@ -94,14 +102,42 @@ sonic_force_inline SonicError SerializeImpl(const NodeType* node, wb.End(); break; case kReal: { - rn = internal::F64toa(wb.End(), node->GetDouble()); - if (rn <= 0) goto inf_err; + const double d = node->GetDouble(); + rn = internal::F64toa(wb.End(), d); + // support Infinity/-Infinity or NaN/-NaN + + if (sonic_unlikely(rn <= 0)) { + if (serializeFlags & SerializeFlags::kSerializeInfNan) { + if (sonic_unlikely(std::isinf(d))) { + const bool neg_inf = std::signbit(d); + const char* s = neg_inf ? "\"-Infinity\"" : "\"Infinity\""; + rn = neg_inf ? 11 : 10; + std::memcpy(wb.End(), s, (size_t)rn); + } else if (sonic_unlikely(std::isnan(d))) { + const bool neg_nan = std::signbit(d); + const char* s = neg_nan ? "\"-NaN\"" : "\"NaN\""; + rn = neg_nan ? 
6 : 5; + std::memcpy(wb.End(), s, (size_t)rn); + } else { + goto inf_err; + } + } else { + goto inf_err; + } + } break; - default: - break; } + case kNumStr: { + rn = 0; + str_len = node->Size(); + wb.Grow(str_len + 1); + wb.PushUnsafe(node->GetStringNumber().data(), str_len); + break; + } + default: + break; } - sonic_assert(rn > 0 && rn <= 32); + sonic_assert(rn >= 0 && rn <= 32); wb.PushSizeUnsafe(rn); wb.PushUnsafe(','); break; diff --git a/include/sonic/dom/type.h b/include/sonic/dom/type.h index 6d75a2be..f98b7968 100644 --- a/include/sonic/dom/type.h +++ b/include/sonic/dom/type.h @@ -31,11 +31,12 @@ enum TypeFlag { kArray = 7, // xxxxx111 // SubType: 2 bits - kFalse = ((uint8_t)(0 << 3)) | kBool, // xxx00_010, 2 - kTrue = ((uint8_t)(1 << 3)) | kBool, // xxx01_010, 10 - kUint = ((uint8_t)(0 << 3)) | kNumber, // xxx00_011, 3 - kSint = ((uint8_t)(1 << 3)) | kNumber, // xxx01_011, 11 - kReal = ((uint8_t)(2 << 3)) | kNumber, // xxx10_011, 19 + kFalse = ((uint8_t)(0 << 3)) | kBool, // xxx00_010, 2 + kTrue = ((uint8_t)(1 << 3)) | kBool, // xxx01_010, 10 + kUint = ((uint8_t)(0 << 3)) | kNumber, // xxx00_011, 3 + kSint = ((uint8_t)(1 << 3)) | kNumber, // xxx01_011, 11 + kReal = ((uint8_t)(2 << 3)) | kNumber, // xxx10_011, 19 + kNumStr = ((uint8_t)(3 << 3)) | kNumber, // xxx11_011, 27 // kStringCopy: sv.p is copied, but not need free, e.g. node's string buffer // is dom str_ kStringCopy = kString, // xxx00_100, 4 @@ -59,6 +60,12 @@ enum TypeInfo { kSubTypeBits = 2, kSubTypeMask = 0x1F, + // Ownership bit inside the 8-bit type info.
+ // This bit is intentionally outside kSubTypeMask so that: + // - GetType() (masking by kSubTypeMask) keeps working unchanged + // - DNode can still know whether sv.p/raw.p needs Allocator::Free() + kOwnedStringMask = 1 << 5, + // Others kInfoBits = 8, kInfoMask = (1 << 8) - 1, diff --git a/include/sonic/error.h b/include/sonic/error.h index e9a46ef6..0ea14d86 100644 --- a/include/sonic/error.h +++ b/include/sonic/error.h @@ -45,12 +45,19 @@ enum SonicError { kParseErrorMismatchType = 10, ///< ParseOnDemand: the target type is not matched. kSerErrorUnsupportedType = 11, ///< Serialize: DOM has invalid node type. - kSerErrorInfinity = 12, ///< Serialize: DOM has inifinity number node. + kSerErrorInfinity = 12, ///< Serialize: DOM has infinity number node. kSerErrorInvalidObjKey = 13, ///< Serialize: The type of object's key is not ///< string. kErrorNoMem = 14, ///< Memory is not enough to allocate. kParseErrorUnexpect = 15, ///< Unexpected Errors + kSaxTermination = 16, ///< Parse: SAX handler return false to + ///< terminate parsing. + kUnsupportedJsonPath = 17, ///< JsonPath: Unsupported json path. + kNotFoundByJsonPath = 18, ///< JsonPath: Not found the target by json path. + kUnmatchedTypeInJsonPath = + 19, ///< JsonPath: The type of node is not matched. + kErrorNoneNoMatch = 20, ///< JsonPath: No node is matched by the json path. 
kErrorNums, }; @@ -79,13 +86,30 @@ inline const char* ErrorMsg(SonicError error) noexcept { {kParseErrorMismatchType, "ParseOnDemand: the target type is not matched."}, {kSerErrorUnsupportedType, "Serialize: DOM has invalid node type."}, - {kSerErrorInfinity, "Serialize: DOM has inifinity number node."}, + {kSerErrorInfinity, "Serialize: DOM has infinity number node."}, {kSerErrorInvalidObjKey, "Serialize: The type of object's key is not string."}, {kErrorNoMem, "Memory is not enough to allocate."}, {kParseErrorUnexpect, "Unexpected Errors"}, + {kSaxTermination, + "Parse: SAX handler return false to terminate parsing."}, + {kUnsupportedJsonPath, "JsonPath: Unsupported json path."}, + {kNotFoundByJsonPath, "JsonPath: Not found the target by json path."}, + {kUnmatchedTypeInJsonPath, "JsonPath: The type of node is not matched."}, + {kErrorNoneNoMatch, "JsonPath: no match."}, + }; - return kErrorMsg[error].msg; + + const int idx = static_cast(error); + if (idx < 0 || idx >= static_cast(kErrorNums)) { + return "Unknown error"; + } + + const SonicErrorInfo& info = kErrorMsg[idx]; + if (info.err != error || info.msg == nullptr) { + return "Unknown error"; + } + return info.msg; } struct ParseResult { diff --git a/include/sonic/experiment/lazy_update.h b/include/sonic/experiment/lazy_update.h index 78531cba..591b8fa4 100644 --- a/include/sonic/experiment/lazy_update.h +++ b/include/sonic/experiment/lazy_update.h @@ -25,11 +25,11 @@ namespace sonic_json { namespace internal { -template +template static inline ParseResult ParseLazy(NodeType &node, StringView json, Allocator &alloc) { LazySAXHandler sax(alloc); - Parser p; + Parser p; ParseResult ret = p.ParseLazy(reinterpret_cast(json.data()), json.size(), sax); if (ret.Error()) { @@ -40,17 +40,19 @@ static inline ParseResult ParseLazy(NodeType &node, StringView json, return ret; } -template +template static inline SonicError UpdateNodeLazy(NodeType &target, NodeType &source, Allocator &alloc) { ParseResult ret; SonicError 
err = kErrorNone; // check the raw type if (target.IsRaw() && *target.GetRaw().data() == '{') { - ret = ParseLazy(target, target.GetRaw(), alloc); + ret = ParseLazy(target, target.GetRaw(), + alloc); } if (source.IsRaw() && *source.GetRaw().data() == '{') { - ret = ParseLazy(source, source.GetRaw(), alloc); + ret = ParseLazy(source, source.GetRaw(), + alloc); } if (ret.Error()) { return ret.Error(); @@ -68,7 +70,8 @@ static inline SonicError UpdateNodeLazy(NodeType &target, NodeType &source, if (match == target.MemberEnd()) { target.AddMember(key, std::move(iter->value), alloc); } else { - err = UpdateNodeLazy(match->value, iter->value, alloc); + err = UpdateNodeLazy(match->value, + iter->value, alloc); if (err) return err; } } @@ -85,6 +88,7 @@ static inline SonicError UpdateNodeLazy(NodeType &target, NodeType &source, * @param target the target json * @param source the source json */ +template static inline std::string UpdateLazy(StringView target, StringView source) { using Allocator = Node::AllocatorType; Allocator alloc; @@ -93,15 +97,18 @@ static inline std::string UpdateLazy(StringView target, StringView source) { ParseResult ret1, ret2; Node ntarget, nsource; - ret1 = internal::ParseLazy(ntarget, target, alloc); - ret2 = internal::ParseLazy(nsource, source, alloc); + ret1 = + internal::ParseLazy(ntarget, target, alloc); + ret2 = + internal::ParseLazy(nsource, source, alloc); if (ret2.Error()) { return ret1.Error() ? 
"{}" : std::string(target.data(), target.size()); } if (ret1.Error()) { return std::string(source.data(), source.size()); } - err = internal::UpdateNodeLazy(ntarget, nsource, alloc); + err = internal::UpdateNodeLazy(ntarget, nsource, + alloc); if (err) { return "{}"; } @@ -112,4 +119,4 @@ static inline std::string UpdateLazy(StringView target, StringView source) { return std::string(wb.ToString(), wb.Size()); } -} // namespace sonic_json \ No newline at end of file +} // namespace sonic_json diff --git a/include/sonic/internal/arch/avx2/unicode.h b/include/sonic/internal/arch/avx2/unicode.h index 4d3afbc0..40c6b907 100644 --- a/include/sonic/internal/arch/avx2/unicode.h +++ b/include/sonic/internal/arch/avx2/unicode.h @@ -38,8 +38,15 @@ using sonic_json::internal::common::handle_unicode_codepoint; struct StringBlock { public: sonic_force_inline static StringBlock Find(const uint8_t *src); + template sonic_force_inline bool HasQuoteFirst() { - return (((bs_bits - 1) & quote_bits) != 0) && !HasUnescaped(); + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; + if constexpr (kAllowUnescapedControlChars) { + return (((bs_bits - 1) & quote_bits) != 0); + } else { + return (((bs_bits - 1) & quote_bits) != 0) && (!HasUnescaped()); + } } sonic_force_inline bool HasBackslash() { return ((quote_bits - 1) & bs_bits) != 0; @@ -47,11 +54,9 @@ struct StringBlock { sonic_force_inline bool HasUnescaped() { return ((quote_bits - 1) & unescaped_bits) != 0; } + sonic_force_inline int QuoteIndex() { return TrailingZeroes(quote_bits); } sonic_force_inline int BsIndex() { return TrailingZeroes(bs_bits); } - sonic_force_inline int UnescapedIndex() { - return TrailingZeroes(unescaped_bits); - } uint32_t bs_bits; uint32_t quote_bits; diff --git a/include/sonic/internal/arch/common/arm_common/quote.h b/include/sonic/internal/arch/common/arm_common/quote.h index 0227d6b2..d3925bfc 100644 --- 
a/include/sonic/internal/arch/common/arm_common/quote.h +++ b/include/sonic/internal/arch/common/arm_common/quote.h @@ -68,22 +68,30 @@ namespace sonic_json { namespace internal { namespace arm_common { -static sonic_force_inline uint64_t CopyAndGetEscapMask128(const char *src, - char *dst) { - uint8x16_t v = vld1q_u8(reinterpret_cast(src)); - vst1q_u8(reinterpret_cast(dst), v); +template +static sonic_force_inline uint64_t CopyAndGetEscapMask128(const char* src, + char* dst) { + uint8x16_t v = vld1q_u8(reinterpret_cast(src)); + vst1q_u8(reinterpret_cast(dst), v); uint8x16_t m1 = vceqq_u8(v, vdupq_n_u8('\\')); uint8x16_t m2 = vceqq_u8(v, vdupq_n_u8('"')); uint8x16_t m3 = vcltq_u8(v, vdupq_n_u8('\x20')); - uint8x16_t m4 = vorrq_u8(m1, m2); - uint8x16_t m5 = vorrq_u8(m3, m4); + uint8x16_t mask = vorrq_u8(m1, m2); + mask = vorrq_u8(mask, m3); + if constexpr (EscapeEmoji) { + uint8x16_t m_emoji = vcgeq_u8(v, vdupq_n_u8(0xF0)); + mask = vorrq_u8(mask, m_emoji); + } - return to_bitmask(m5); + return to_bitmask(mask); } -sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { +template +sonic_static_inline char* Quote(const char* src, size_t nb, char* dst) { + constexpr bool EscapeEmoji = + serializeFlags & SerializeFlags::kSerializeEscapeEmoji; *dst++ = '"'; sonic_assert(nb < (1ULL << 32)); uint64_t mm; @@ -93,11 +101,11 @@ sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { while (nb >= VEC_LEN) { /* check for matches */ // TODO: optimize: exploit the simd bitmask in the escape block. 
- if ((mm = CopyAndGetEscapMask128(src, dst)) != 0) { + if ((mm = CopyAndGetEscapMask128(src, dst)) != 0) { // cn = __builtin_ctz(mm); cn = TrailingZeroes(mm) >> 2; MOVE_N_CHARS(src, cn); - DoEscape(src, dst, nb); + DoEscape(src, dst, nb); } else { /* move to next block */ MOVE_N_CHARS(src, VEC_LEN); @@ -106,7 +114,7 @@ sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { if (nb > 0) { char tmp_src[64]; - const char *src_r; + const char* src_r; #ifdef SONIC_USE_SANITIZE if (0) { #else @@ -119,12 +127,12 @@ sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { src_r = tmp_src; } while (nb > 0) { - mm = CopyAndGetEscapMask128(src_r, dst) & + mm = CopyAndGetEscapMask128(src_r, dst) & (0xFFFFFFFFFFFFFFFF >> ((VEC_LEN - nb) << 2)); if (mm) { cn = TrailingZeroes(mm) >> 2; MOVE_N_CHARS(src_r, cn); - DoEscape(src_r, dst, nb); + DoEscape(src_r, dst, nb); } else { dst += nb; nb = 0; diff --git a/include/sonic/internal/arch/common/quote_common.h b/include/sonic/internal/arch/common/quote_common.h index 0e028e83..a3edd693 100644 --- a/include/sonic/internal/arch/common/quote_common.h +++ b/include/sonic/internal/arch/common/quote_common.h @@ -16,7 +16,13 @@ #pragma once +#include +#include + #include "quote_tables.h" +#include "sonic/dom/flags.h" +#include "sonic/error.h" +#include "unicode_common.h" // Not check the buffer size of dst, src must be a valid UTF-8 string with // null-terminator. @@ -24,25 +30,160 @@ namespace sonic_json { namespace internal { -static sonic_force_inline uint8_t GetEscapeMask4(const char *src) { - return kNeedEscaped[*(uint8_t *)(src)] | - (kNeedEscaped[*(uint8_t *)(src + 1)] << 1) | - (kNeedEscaped[*(uint8_t *)(src + 2)] << 2) | - (kNeedEscaped[*(uint8_t *)(src + 3)] << 3); +template +sonic_static_inline void DoEscape(const char*& src, char*& dst, size_t& nb); + +namespace common { + +// Scalar fallback implementations for x86 dynamic dispatch. 
+// These are used by x86_ifuncs when CPU lacks AVX2 / (SSE4.2 + PCLMUL). + +template +sonic_force_inline size_t parseStringInplace(uint8_t*& src, SonicError& err) { + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; + + err = kErrorNone; + uint8_t* dst = src; + uint8_t* sdst = src; + while (true) { + const uint8_t c = *src; + if (c == '"') { + *dst = '\0'; + ++src; + return static_cast(dst - sdst); + } + if (sonic_unlikely(!kAllowUnescapedControlChars && c <= 0x1f)) { + err = kParseErrorUnEscaped; + return 0; + } + if (sonic_likely(c != '\\')) { + *dst++ = *src++; + continue; + } + + // Escape sequence. + const uint8_t escape_char = src[1]; + if (sonic_unlikely(escape_char == 'u')) { + const uint8_t* src_ptr = src; + uint8_t* dst_ptr = dst; + if (!handle_unicode_codepoint(&src_ptr, &dst_ptr)) { + err = kParseErrorEscapedUnicode; + return 0; + } + src = const_cast(src_ptr); + dst = dst_ptr; + } else { + *dst = kEscapedMap[escape_char]; + if (sonic_unlikely(*dst == 0u)) { + err = kParseErrorEscapedFormat; + return 0; + } + src += 2; + dst += 1; + } + } +} + +template +sonic_static_inline char* Quote(const char* src, size_t nb, char* dst) { + constexpr bool EscapeEmoji = + (serializeFlags & SerializeFlags::kSerializeEscapeEmoji) != 0; + + *dst++ = '"'; + while (nb > 0) { + const uint8_t ch = static_cast(*src); + const bool need_escape = + (kNeedEscaped[ch] != 0) || (EscapeEmoji && ((ch & 0xF0) == 0xF0)); + if (sonic_likely(!need_escape)) { + *dst++ = *src++; + --nb; + continue; + } + DoEscape(src, dst, nb); + } + *dst++ = '"'; + return dst; +} + +} // namespace common + +static sonic_force_inline uint8_t GetEscapeMask4(const char* src) { + return kNeedEscaped[*(uint8_t*)(src)] | + (kNeedEscaped[*(uint8_t*)(src + 1)] << 1) | + (kNeedEscaped[*(uint8_t*)(src + 2)] << 2) | + (kNeedEscaped[*(uint8_t*)(src + 3)] << 3); } -sonic_static_inline void DoEscape(const char *&src, char *&dst, size_t &nb) { +static 
constexpr char kHexCharsUpper[16] = {'0', '1', '2', '3', '4', '5', + '6', '7', '8', '9', 'A', 'B', + 'C', 'D', 'E', 'F'}; +static constexpr char kHexCharsLower[16] = {'0', '1', '2', '3', '4', '5', + '6', '7', '8', '9', 'a', 'b', + 'c', 'd', 'e', 'f'}; + +template +sonic_static_inline void writeHex(uint32_t value, char*& dst) { + const char* hexChars = UpperCase ? kHexCharsUpper : kHexCharsLower; + *dst++ = '\\'; + *dst++ = 'u'; + *dst++ = hexChars[(value >> 12) & 0xf]; + *dst++ = hexChars[(value >> 8) & 0xf]; + *dst++ = hexChars[(value >> 4) & 0xf]; + *dst++ = hexChars[value & 0xf]; +} + +template +sonic_static_inline void DoEscape(const char*& src, char*& dst, size_t& nb) { + constexpr bool UnicodeEscapeUpperCase = + serializeFlags & SerializeFlags::kSerializeUnicodeEscapeUppercase; + constexpr bool EscapeEmoji = + serializeFlags & SerializeFlags::kSerializeEscapeEmoji; + + const auto& quote_tab = + UnicodeEscapeUpperCase ? kQuoteTabUpperCase : kQuoteTabLowerCase; /* get the escape entry, handle consecutive quotes */ do { - uint8_t ch = *(uint8_t *)src; - int nc = kQuoteTab[ch].n; - std::memcpy(dst, kQuoteTab[ch].s, 8); - src++; - nb--; - dst += nc; - if (nb <= 0) return; + uint8_t ch = *(uint8_t*)src; + int nc = quote_tab[ch].n; + if (nc != 0) { + std::memcpy(dst, quote_tab[ch].s, 6); + src++; + nb--; + dst += nc; + } else { + if constexpr (EscapeEmoji) { + if (nb < 4) { + // Not enough bytes for a 4-byte emoji, handle as raw char or error + *dst++ = *src++; + nb--; + continue; + } + // TODO: validate the utf8? 
+ uint32_t unicode = (src[0] & 0x07) << 18 | (src[1] & 0x3f) << 12 | + (src[2] & 0x3f) << 6 | (src[3] & 0x3f); + unicode -= 0x10000; + writeHex(0xD800 | ((unicode >> 10) & 0x3FF), + dst); + writeHex(0xDC00 | (unicode & 0x3FF), dst); + src += 4; + nb -= 4; + } + } + + if (nb <= 0) { + return; + } + + /* next char is emoji */ + if constexpr (EscapeEmoji) { + if ((*(uint8_t*)(src)&0xf0) == 0xf0) { + continue; + } + } + /* copy and find escape chars */ - if (kNeedEscaped[*(uint8_t *)(src)] == 0) { + if (kNeedEscaped[*(uint8_t*)(src)] == 0) { return; } } while (true); diff --git a/include/sonic/internal/arch/common/quote_tables.h b/include/sonic/internal/arch/common/quote_tables.h index 6d7543a4..7d396fc9 100644 --- a/include/sonic/internal/arch/common/quote_tables.h +++ b/include/sonic/internal/arch/common/quote_tables.h @@ -57,7 +57,7 @@ struct QuotedChar { const char *s; }; -static const struct QuotedChar kQuoteTab[256] = { +static const struct QuotedChar kQuoteTabLowerCase[256] = { // 0x00 ~ 0x1f {.n = 6, .s = "\\u0000\0\0"}, {.n = 6, .s = "\\u0001\0\0"}, @@ -161,6 +161,110 @@ static const struct QuotedChar kQuoteTab[256] = { // 0x60 ~ 0xff }; +static const struct QuotedChar kQuoteTabUpperCase[256] = { + // 0x00 ~ 0x1f + {.n = 6, .s = "\\u0000\0\0"}, + {.n = 6, .s = "\\u0001\0\0"}, + {.n = 6, .s = "\\u0002\0\0"}, + {.n = 6, .s = "\\u0003\0\0"}, + {.n = 6, .s = "\\u0004\0\0"}, + {.n = 6, .s = "\\u0005\0\0"}, + {.n = 6, .s = "\\u0006\0\0"}, + {.n = 6, .s = "\\u0007\0\0"}, + {.n = 2, .s = "\\b\0\0\0\0\0\0"}, + {.n = 2, .s = "\\t\0\0\0\0\0\0"}, + {.n = 2, .s = "\\n\0\0\0\0\0\0"}, + {.n = 6, .s = "\\u000B\0\0"}, + {.n = 2, .s = "\\f\0\0\0\0\0\0"}, + {.n = 2, .s = "\\r\0\0\0\0\0\0"}, + {.n = 6, .s = "\\u000E\0\0"}, + {.n = 6, .s = "\\u000F\0\0"}, + {.n = 6, .s = "\\u0010\0\0"}, + {.n = 6, .s = "\\u0011\0\0"}, + {.n = 6, .s = "\\u0012\0\0"}, + {.n = 6, .s = "\\u0013\0\0"}, + {.n = 6, .s = "\\u0014\0\0"}, + {.n = 6, .s = "\\u0015\0\0"}, + {.n = 6, .s = "\\u0016\0\0"}, + {.n 
= 6, .s = "\\u0017\0\0"}, + {.n = 6, .s = "\\u0018\0\0"}, + {.n = 6, .s = "\\u0019\0\0"}, + {.n = 6, .s = "\\u001A\0\0"}, + {.n = 6, .s = "\\u001B\0\0"}, + {.n = 6, .s = "\\u001C\0\0"}, + {.n = 6, .s = "\\u001D\0\0"}, + {.n = 6, .s = "\\u001E\0\0"}, + {.n = 6, .s = "\\u001F\0\0"}, + // 0x20 ~ 0x2f + {0, 0}, + {0, 0}, + {.n = 2, .s = "\\\"\0\0\0\0\0\0"}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + // 0x30 ~ 0x4f + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + // 0x50 ~ 0x5f + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {0, 0}, + {.n = 2, .s = "\\\\\0\0\0\0\0\0"}, + {0, 0}, + {0, 0}, + {0, 0}, + // 0x60 ~ 0xff +}; + static const bool kNeedEscaped[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/include/sonic/internal/arch/common/skip_common.h b/include/sonic/internal/arch/common/skip_common.h index b1ef09a4..58bfa456 100644 --- a/include/sonic/internal/arch/common/skip_common.h +++ b/include/sonic/internal/arch/common/skip_common.h @@ -30,32 +30,53 @@ static sonic_force_inline bool EqBytes4(const uint8_t *src, uint32_t target) { return val == target; } +static sonic_force_inline bool IsValidSeparator(uint8_t c) { + // return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == ',' || + // c == ']' || c == '}' || c == '\0'; + constexpr uint64_t mask = (1ULL << 0) | // '\0' + (1ULL << 9) | // '\t' + (1ULL << 10) | // '\n' + (1ULL << 13) | // '\r' + (1ULL << 32) | // ' ' + (1ULL << 44); // ',' + + return c < 64 ? 
(mask >> c) & 1 : (c == 93 || c == 125); +} + sonic_force_inline bool SkipLiteral(const uint8_t *data, size_t &pos, size_t len, uint8_t token) { + // the binary of 'null' (all 4 bytes) static constexpr uint32_t kNullBin = 0x6c6c756e; + // the binary of 'true' (all 4 bytes) static constexpr uint32_t kTrueBin = 0x65757274; - static constexpr uint32_t kFalseBin = - 0x65736c61; // the binary of 'alse' in false + // the binary of 'alse' in false + static constexpr uint32_t kFalseBin = 0x65736c61; auto start = data + pos - 1; auto end = data + len; switch (token) { case 't': - if (start + 4 <= end && EqBytes4(start, kTrueBin)) { + if (start + 4 <= end && EqBytes4(start, kTrueBin) && + (start + 4 == end || IsValidSeparator(start[4]))) { pos += 3; return true; } break; case 'n': - if (start + 4 <= end && EqBytes4(start, kNullBin)) { + if (start + 4 <= end && EqBytes4(start, kNullBin) && + (start + 4 == end || IsValidSeparator(start[4]))) { pos += 3; return true; } break; case 'f': - if (start + 5 <= end && EqBytes4(start + 1, kFalseBin)) { + if (start + 5 <= end && EqBytes4(start + 1, kFalseBin) && + (start + 5 == end || IsValidSeparator(start[5]))) { pos += 4; return true; } + break; + default: + return false; } return false; } diff --git a/include/sonic/internal/arch/common/unicode_common.h b/include/sonic/internal/arch/common/unicode_common.h index e037ac36..6c488fb7 100644 --- a/include/sonic/internal/arch/common/unicode_common.h +++ b/include/sonic/internal/arch/common/unicode_common.h @@ -17,6 +17,8 @@ #pragma once +#include "sonic/internal/arch/common/quote_tables.h" + namespace sonic_json { namespace internal { namespace common { @@ -281,6 +283,27 @@ sonic_force_inline uint64_t GetEscaped(uint64_t &prev_escaped, return escaped_with_prev; } +// unescape with padding buffer +sonic_force_inline size_t unescape_with_padding(const uint8_t **src_ptr, + uint8_t **dst_ptr) { + uint8_t escape_char = (*src_ptr)[1]; + if (sonic_unlikely(escape_char == 'u')) { + if
(!handle_unicode_codepoint(src_ptr, dst_ptr)) { + return 0; + } else { + return 1; + } + } else { + **dst_ptr = kEscapedMap[escape_char]; + if (sonic_unlikely(**dst_ptr == '\0')) { + return 0; + } + *src_ptr += 2; + *dst_ptr += 1; + return 1; + } +} + } // namespace common } // namespace internal } // namespace sonic_json diff --git a/include/sonic/internal/arch/common/x86_common/quote.inc.h b/include/sonic/internal/arch/common/x86_common/quote.inc.h index 58d7a828..1a84f2d6 100644 --- a/include/sonic/internal/arch/common/x86_common/quote.inc.h +++ b/include/sonic/internal/arch/common/x86_common/quote.inc.h @@ -61,23 +61,28 @@ using common::handle_unicode_codepoint; +template sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { -#define SONIC_REPEAT8(v) {v v v v v v v v} - +#define SONIC_REPEAT8(v) \ + { v v v v v v v v } + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; uint8_t *dst = src; uint8_t *sdst = src; while (1) { find: auto block = StringBlock::Find(src); - if (block.HasQuoteFirst()) { + if (block.HasQuoteFirst()) { int idx = block.QuoteIndex(); src += idx; *src++ = '\0'; return src - sdst - 1; } - if (block.HasUnescaped()) { - err = kParseErrorUnEscaped; - return 0; + if constexpr (!kAllowUnescapedControlChars) { + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } } if (!block.HasBackslash()) { src += VEC_LEN; @@ -104,7 +109,7 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { src += 2; dst += 1; } - // fast path for continous escaped chars + // fast path for continuous escaped chars if (*src == '\\') { bs_dist = 0; goto cont; @@ -115,11 +120,12 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { VecType v(src); block = StringBlock{ static_cast((v == '\\').to_bitmask()), // bs_bits - static_cast((v == '"').to_bitmask()), // quote_bits + + static_cast((v == '"').to_bitmask()), // 
quote_bits static_cast((v <= '\x1f').to_bitmask()), }; // If the next thing is the end quote, copy and return - if (block.HasQuoteFirst()) { + if (block.HasQuoteFirst()) { // we encountered quotes first. Move dst to point to quotes and exit while (1) { SONIC_REPEAT8(if (sonic_unlikely(*src == '"')) break; @@ -129,9 +135,11 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { src++; return dst - sdst; } - if (block.HasUnescaped()) { - err = kParseErrorUnEscaped; - return 0; + if constexpr (!kAllowUnescapedControlChars) { + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } } if (!block.HasBackslash()) { /* they are the same. Since they can't co-occur, it means we @@ -151,13 +159,22 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { #undef SONIC_REPEAT8 } +template static sonic_force_inline int CopyAndGetEscapMask(const char *src, char *dst) { VecType v(reinterpret_cast(src)); v.store(reinterpret_cast(dst)); - return ((v < '\x20') | (v == '\\') | (v == '"')).to_bitmask(); + if constexpr (EscapeEmoji) { + return ((v < '\x20') | (v == '\\') | (v == '"') | (v >= '\xF0')) + .to_bitmask(); + } else { + return ((v < '\x20') | (v == '\\') | (v == '"')).to_bitmask(); + } } +template sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { + constexpr bool EscapeEmoji = + serializeFlags & SerializeFlags::kSerializeEscapeEmoji; *dst++ = '"'; sonic_assert(nb < (1ULL << 32)); uint32_t mm; @@ -167,10 +184,10 @@ sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { while (nb >= VEC_LEN) { /* check for matches */ // TODO: optimize: exploit the simd bitmask in the escape block. 
- if ((mm = CopyAndGetEscapMask(src, dst)) != 0) { + if ((mm = CopyAndGetEscapMask(src, dst)) != 0) { cn = __builtin_ctz(mm); MOVE_N_CHARS(src, cn); - DoEscape(src, dst, nb); + DoEscape(src, dst, nb); } else { /* move to next block */ MOVE_N_CHARS(src, VEC_LEN); @@ -178,7 +195,7 @@ sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { } if (nb > 0) { - char tmp_src[VEC_LEN * 2]; + char tmp_src[VEC_LEN * 2] = {0}; const char *src_r; #ifdef SONIC_USE_SANITIZE if (0) { @@ -192,11 +209,12 @@ sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { src_r = tmp_src; } while (nb > 0) { - mm = CopyAndGetEscapMask(src_r, dst) & (VEC_FULL_MASK >> (VEC_LEN - nb)); + mm = CopyAndGetEscapMask(src_r, dst) & + (VEC_FULL_MASK >> (VEC_LEN - nb)); if (mm) { cn = __builtin_ctz(mm); MOVE_N_CHARS(src_r, cn); - DoEscape(src_r, dst, nb); + DoEscape(src_r, dst, nb); } else { dst += nb; nb = 0; diff --git a/include/sonic/internal/arch/common/x86_common/skip.inc.h b/include/sonic/internal/arch/common/x86_common/skip.inc.h index 4a716043..1cab276a 100644 --- a/include/sonic/internal/arch/common/x86_common/skip.inc.h +++ b/include/sonic/internal/arch/common/x86_common/skip.inc.h @@ -117,6 +117,8 @@ sonic_force_inline int SkipString(const uint8_t *data, size_t &pos, } // return true if container is closed. 
+// the implementation is inspired from JSONSki +// reference: https://dl.acm.org/doi/10.1145/3503222.3507719 sonic_force_inline bool SkipContainer(const uint8_t *data, size_t &pos, size_t len, uint8_t left, uint8_t right) { uint64_t prev_instring = 0, prev_escaped = 0, instring; diff --git a/include/sonic/internal/arch/neon/quote.h b/include/sonic/internal/arch/neon/quote.h index 19c5b0d8..3c8f382a 100644 --- a/include/sonic/internal/arch/neon/quote.h +++ b/include/sonic/internal/arch/neon/quote.h @@ -28,23 +28,28 @@ namespace neon { using sonic_json::internal::arm_common::Quote; +template sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { -#define SONIC_REPEAT8(v) {v v v v v v v v} - +#define SONIC_REPEAT8(v) \ + { v v v v v v v v } + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; uint8_t *dst = src; uint8_t *sdst = src; while (1) { find: auto block = StringBlock::Find(src); - if (block.HasQuoteFirst()) { + if (block.HasQuoteFirst()) { int idx = block.QuoteIndex(); src += idx; *src++ = '\0'; return src - sdst - 1; } - if (block.HasUnescaped()) { - err = kParseErrorUnEscaped; - return 0; + if constexpr (!kAllowUnescapedControlChars) { + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } } if (!block.HasBackslash()) { src += VEC_LEN; @@ -71,7 +76,7 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { src += 2; dst += 1; } - // fast path for continous escaped chars + // fast path for continuous escaped chars if (*src == '\\') { bs_dist = 0; goto cont; @@ -82,7 +87,7 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { uint8x16_t v = vld1q_u8(src); block = StringBlock::Find(v); // If the next thing is the end quote, copy and return - if (block.HasQuoteFirst()) { + if (block.HasQuoteFirst()) { // we encountered quotes first. 
Move dst to point to quotes and exit while (1) { SONIC_REPEAT8(if (sonic_unlikely(*src == '"')) break; @@ -92,9 +97,11 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { src++; return dst - sdst; } - if (block.HasUnescaped()) { - err = kParseErrorUnEscaped; - return 0; + if constexpr (!kAllowUnescapedControlChars) { + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } } if (!block.HasBackslash()) { /* they are the same. Since they can't co-occur, it means we diff --git a/include/sonic/internal/arch/neon/unicode.h b/include/sonic/internal/arch/neon/unicode.h index 933a0e82..3c6a9b26 100644 --- a/include/sonic/internal/arch/neon/unicode.h +++ b/include/sonic/internal/arch/neon/unicode.h @@ -36,8 +36,15 @@ struct StringBlock { public: sonic_force_inline static StringBlock Find(const uint8_t *src); sonic_force_inline static StringBlock Find(uint8x16_t &v); + template sonic_force_inline bool HasQuoteFirst() const { - return (((bs_bits - 1) & quote_bits) != 0) && !HasUnescaped(); + constexpr bool kAllowUnescapedControlChars = + parseFlags & ParseFlags::kParseAllowUnescapedControlChars; + if constexpr (kAllowUnescapedControlChars) { + return (((bs_bits - 1) & quote_bits) != 0); + } else { + return (((bs_bits - 1) & quote_bits) != 0) && (!HasUnescaped()); + } } sonic_force_inline bool HasBackslash() const { return ((quote_bits - 1) & bs_bits) != 0; @@ -53,10 +60,6 @@ struct StringBlock { // return TrailingZeroes(bs_bits); return TrailingZeroes(bs_bits) >> 2; } - sonic_force_inline int UnescapedIndex() const { - // return TrailingZeroes(unescaped_bits); - return TrailingZeroes(unescaped_bits) >> 2; - } uint64_t bs_bits; uint64_t quote_bits; @@ -65,11 +68,7 @@ struct StringBlock { sonic_force_inline StringBlock StringBlock::Find(const uint8_t *src) { uint8x16_t v = vld1q_u8(src); - return { - to_bitmask(vceqq_u8(v, vdupq_n_u8('\\'))), - to_bitmask(vceqq_u8(v, vdupq_n_u8('"'))), - to_bitmask(vcleq_u8(v, 
vdupq_n_u8('\x1f'))), - }; + return StringBlock::Find(v); } sonic_force_inline StringBlock StringBlock::Find(uint8x16_t &v) { diff --git a/include/sonic/internal/arch/simd_skip.h b/include/sonic/internal/arch/simd_skip.h index c3a25334..527bed48 100644 --- a/include/sonic/internal/arch/simd_skip.h +++ b/include/sonic/internal/arch/simd_skip.h @@ -16,10 +16,20 @@ #pragma once +#include +#include +#include +#include + #include "simd_dispatch.h" +#include "sonic/dom/flags.h" +#include "sonic/error.h" +#include "sonic/jsonpath/jsonpath.h" #include INCLUDE_ARCH_FILE(skip.h) +#include + namespace sonic_json { namespace internal { @@ -31,6 +41,14 @@ SONIC_USING_ARCH_FUNC(SkipContainer); SONIC_USING_ARCH_FUNC(skip_space); SONIC_USING_ARCH_FUNC(skip_space_safe); +#define RETURN_FALSE_IF_PARSE_ERROR(x) \ + do { \ + x; \ + if (error_ != kErrorNone) { \ + return false; \ + } \ + } while (0) + static bool SkipArray(const uint8_t *data, size_t &pos, size_t len) { return SkipContainer(data, pos, len, '[', ']'); } @@ -91,7 +109,7 @@ class SkipScanner { } // SkipOne skip one raw json value and return the start of value, return the - // negtive if errors. + // negative if errors. 
sonic_force_inline long SkipOne(const uint8_t *data, size_t &pos, size_t len) { uint8_t c = SkipSpaceSafe(data, pos, len); @@ -137,6 +155,78 @@ class SkipScanner { return start; } + sonic_force_inline bool matchKey(const uint8_t *data, size_t &pos, size_t len, + StringView key, std::vector &kbuf, + SonicError &err) { + auto start = data + pos; + auto status = SkipString(data, pos, len); + // has errors + if (!status) { + err = SonicError::kParseErrorInvalidChar; + return false; + } + + auto slen = data + pos - 1 - start; + // has escaped char + if (status == 2) { + // parse escaped key + kbuf.resize(slen + 32); + uint8_t *nsrc = &kbuf[0]; + + // parseStringInplace need `"` as the end + std::memcpy(nsrc, start, slen + 1); + slen = parseStringInplace(nsrc, err); + if (err) { + pos = (start - data) + (nsrc - &kbuf[0]); + return false; + } + start = &kbuf[0]; + } + + // compare the key + return slen == static_cast(key.size()) && + std::memcmp(start, key.data(), slen) == 0; + } + sonic_force_inline int matchKeys(const uint8_t *data, size_t &pos, size_t len, + const std::vector &keys, + std::vector &kbuf, + SonicError &err) { + auto start = data + pos; + auto status = SkipString(data, pos, len); + // has errors + if (!status) { + err = SonicError::kParseErrorInvalidChar; + return -1; + } + + auto slen = data + pos - 1 - start; + // has escaped char + if (status == 2) { + // parse escaped key + kbuf.resize(slen + 32); + uint8_t *nsrc = &kbuf[0]; + + // parseStringInplace need `"` as the end + std::memcpy(nsrc, start, slen + 1); + slen = parseStringInplace(nsrc, err); + if (err) { + pos = (start - data) + (nsrc - &kbuf[0]); + return -1; + } + start = &kbuf[0]; + } + + for (size_t i = 0; i < keys.size(); i++) { + const auto &key = keys[i]; + if (slen == static_cast(key.size()) && + std::memcmp(start, key.data(), slen) == 0) { + return i; + } + } + // compare the key + return -1; + } + // GetOnDemand get the target json field through the path, and update the // position. 
template @@ -144,16 +234,14 @@ class SkipScanner { const GenericJsonPointer &path) { using namespace sonic_json::internal; size_t i = 0; - const uint8_t *sp; - long sn = 0; uint8_t c; StringView key; - int skips; // TODO: use stack smallvector here. std::vector kbuf(32); // key buffer for parsed keys const uint8_t *data = reinterpret_cast(json.data()); size_t len = json.size(); SonicError err = kErrorNone; + bool matched = false; query: if (i++ != path.size()) { @@ -176,32 +264,19 @@ class SkipScanner { obj_key: // advance quote pos++; - sp = data + pos; - skips = SkipString(data, pos, len); - sn = data + pos - 1 - sp; - if (!skips) goto err_invalid_char; - if (skips == 2) { - // parse escaped key - kbuf.resize(sn + 32); - uint8_t *nsrc = &kbuf[0]; - // parseStringInplace need `"` as the end - std::memcpy(nsrc, sp, sn + 1); - sn = parseStringInplace(nsrc, err); - if (err) { - pos = (sp - data) + (nsrc - &kbuf[0]); - return err; - } - sp = &kbuf[0]; + matched = matchKey(data, pos, len, key, kbuf, err); + if (err != kErrorNone) { + return -err; } c = SkipSpaceSafe(data, pos, len); if (c != ':') { goto err_invalid_char; } + // match key and skip parsing unneeded fields - if (sn == static_cast(key.size()) && - std::memcmp(sp, key.data(), sn) == 0) { + if (matched) { goto query; } else { c = SkipSpaceSafe(data, pos, len); @@ -248,5 +323,778 @@ class SkipScanner { uint64_t nonspace_bits_{0}; }; +class SkipScanner2 { + public: + sonic_force_inline StringView getOne() { + long start = scanner_.SkipOne(data_, pos_, len_); + if (start < 0) { + setError(SonicError(-start)); + return ""; + } + return StringView(reinterpret_cast(data_) + start, + pos_ - start); + } + + sonic_force_inline SonicError skipOne() { + long start = scanner_.SkipOne(data_, pos_, len_); + if (start < 0) { + setError(SonicError(-start)); + return error_; + } + return SonicError::kErrorNone; + } + + sonic_force_inline uint8_t peek() { + if (sonic_unlikely(pos_ >= len_)) { + 
setError(SonicError::kParseErrorEof); + return 0; + } + + auto c = scanner_.SkipSpaceSafe(data_, pos_, len_); + + // If we reached the end while still seeing spaces, there is no token. + if (sonic_unlikely(pos_ >= len_) && + (c == ' ' || c == '\n' || c == '\r' || c == '\t')) { + setError(SonicError::kParseErrorEof); + return 0; + } + + pos_ -= 1; + return c; + } + + sonic_force_inline bool hasError() { + return error_ != SonicError::kErrorNone; + } + + sonic_force_inline void setIsFieldName() { this->isFieldName = true; } + sonic_force_inline bool getAndClearIsFieldName() { + auto ret = this->isFieldName; + this->isFieldName = false; + return ret; + } + sonic_force_inline uint8_t advance() { + return scanner_.SkipSpaceSafe(data_, pos_, len_); + } + + // + + sonic_force_inline void skipIfPresent(const uint8_t c) { + if (sonic_unlikely(pos_ == len_)) { + setError(SonicError::kParseErrorEof); + return; + } + if (peek() == c) { + advance(); + } + } + + sonic_force_inline bool consume(uint8_t c) { + if (sonic_unlikely(pos_ >= len_)) { + setError(SonicError::kParseErrorEof); + return false; + } + + auto got = scanner_.SkipSpaceSafe(data_, pos_, len_); + if (got != c) { + if (sonic_unlikely(pos_ >= len_) && + (got == ' ' || got == '\n' || got == '\r' || got == '\t')) { + setError(SonicError::kParseErrorEof); + } else { + setError(SonicError::kParseErrorInvalidChar); + } + return false; + } + return true; + } + + sonic_force_inline void setError(SonicError err) { error_ = err; } + + // Precondition: calling advance takes input the " of the first fieldname + // post condition: if found peek() returns first char of the found value + // if not found, peek() returns } + sonic_force_inline bool advanceKey(StringView key) { + auto c = advance(); + bool matched = false; + while (c != '}') { + if (c != '"') { + setError(SonicError::kParseErrorInvalidChar); + return false; + } + + // match the key + matched = scanner_.matchKey(data_, pos_, len_, key, kbuf_, error_); + if (error_ != 
SonicError::kErrorNone) { + return false; + } + + if (!consume(':')) { + setError(SonicError::kParseErrorInvalidChar); + return false; + } + + if (matched) { + break; + } + + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + + // get the next key + c = advance(); + if (c == ',') { + c = advance(); + } else if (c != '}') { + setError(SonicError::kParseErrorInvalidChar); + } + } + + // When no key matches, the loop above would consume all members *and* the + // closing '}'. However, getJsonPath() (including the Java/Spark-compatible + // template variant with `serializeFlags = kSerializeJavaStyleFlag`) expects + // '}' to be left unconsumed and handled by the caller that processes the + // object. + if (!matched && c == '}') { + pos_--; + } + return matched; + } + + // Precondition: the next token is a member key's opening '"' or a '}'. + // Returns the index in `keys` of the first key that matches, leaving peek() + // at its value; returns -1 if none matches ('}' unconsumed) or on error. + sonic_force_inline int advanceKeys(const std::vector &keys) { + auto c = advance(); + int matched = -1; + while (c != '}') { + if (c != '"') { + setError(SonicError::kParseErrorInvalidChar); + return -1; + } + + // match the key + matched = scanner_.matchKeys(data_, pos_, len_, keys, kbuf_, error_); + if (error_ != SonicError::kErrorNone) { + return -1; + } + + if (!consume(':')) { + setError(SonicError::kParseErrorInvalidChar); + return -1; + } + + if (matched != -1) { + break; + } + // Not RETURN_FALSE_IF_PARSE_ERROR: `false` converts to 0, a valid index. + if (skipOne() != SonicError::kErrorNone) return -1; + + // get the next key + c = advance(); + if (c == ',') { + c = advance(); + } else if (c != '}') { + setError(SonicError::kParseErrorInvalidChar); + } + } + + // When no key matches, the loop above would consume all members *and* the + // closing '}'. However, getJsonPath() (including the Java/Spark-compatible + // template variant with `serializeFlags = kSerializeJavaStyleFlag`) expects + // '}' to be left unconsumed and handled by the caller that processes the + // object.
+ if (matched == -1 && c == '}') { + pos_--; + } + return matched; + } + + sonic_force_inline SonicError traverseObject(const JsonPath &path, + size_t index, + std::vector &res) { + auto c = advance(); + while (c != '}') { + if (c != '"') { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + + // skip the key + if (!SkipString(data_, pos_, len_)) { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + + if (!consume(':')) { + return error_; + } + + // recursively parse the value + if (getJsonPath(path, index + 1, res, true) != SonicError::kErrorNone) { + return error_; + } + + // get the next key + c = advance(); + if (c == ',') { + c = advance(); + } else if (c != '}') { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + } + return kErrorNone; + } + + sonic_force_inline SonicError traverseArray(const JsonPath &path, + size_t index, + std::vector &res) { + auto c = advance(); + if (c != ']') pos_--; // re-read the first element; an empty array's ']' stays consumed + while (c != ']') { + // recursively parse the value + if (getJsonPath(path, index + 1, res, true) != SonicError::kErrorNone) { + return error_; + } + + // get the next elem + c = advance(); + if (c == ',') { + continue; + } else if (c != ']') { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + } + return kErrorNone; + } + + sonic_force_inline bool advanceIndex(size_t index) /* found */ { + auto c = advance(); + if (c == ']') { + return false; + } + + pos_--; // step back so skipOne() starts at the first element + while (c != ']' && index > 0) { + if (skipOne() != SonicError::kErrorNone) { + return false; + } + + // get the next key + c = advance(); + if (c == ',') { + index--; + } else if (c != ']') { + setError(SonicError::kParseErrorInvalidChar); + return false; + } + } + + return (index == 0); + } + + sonic_force_inline SonicError skipArrayRemain() { + auto c = advance(); + while (c != ']') { + if (c != ',') { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + + if (skipOne() != 
SonicError::kErrorNone) { + return error_; + } + + c = advance(); + } + return kErrorNone; + } + + sonic_force_inline SonicError skipObjectRemain() { + auto c = advance(); + while (c != '}') { + if (c != ',') { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + + c = advance(); + if (c != '"') { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + + // skip the key + if (!SkipString(data_, pos_, len_)) { + setError(SonicError::kParseErrorInvalidChar); + return error_; + } + + if (!consume(':')) { + return error_; + } + + if (skipOne() != SonicError::kErrorNone) { + return error_; + } + + // get the next key + c = advance(); + } + return kErrorNone; + } + + enum WriteStyle { RAW, FLATTEN, QUOTE }; + enum JsonValueType { STRING, OTHER }; + + template + class JsonGeneratorInterface { + public: + virtual bool writeRaw(StringView sv) = 0; + virtual bool copyCurrentStructure(StringView sv) = 0; + virtual bool copyCurrentStructureJsonTupleCodeGen( + StringView raw, size_t index, + std::vector> &result, + JsonValueType type) = 0; + virtual bool writeRawValue(StringView sv) = 0; + virtual bool writeStartArray() = 0; + virtual bool writeEndArray() = 0; + virtual bool writeComma() = 0; + virtual bool isEmpty() = 0; + virtual bool isBeginArray() = 0; + virtual ~JsonGeneratorInterface() {} + }; + template + using JsonGeneratorFactory = + std::function>( + WriteBuffer &)>; + + template + inline bool getJsonPathArrayIndex( + const JsonPath &path, size_t index, + JsonGeneratorInterface *jsonGenerator, + const JsonGeneratorFactory &jsonGeneratorFactory, + const int64_t idx) { + RETURN_FALSE_IF_PARSE_ERROR(consume('[')); + int64_t cur_idx = 0; + bool dirty = false; + while (peek() != ']') { + if (cur_idx == idx) { + dirty = getJsonPath( + path, index + 1, jsonGenerator, jsonGeneratorFactory); + while (peek() != ']') { + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + if (peek() == ']') { + break; + } + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); 
+ } + break; + } else { + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + } + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + cur_idx++; + } + RETURN_FALSE_IF_PARSE_ERROR(consume(']')); + return dirty; + } + template + inline bool jsonTupleWithCodeGenImpl( + const std::vector &keys, + JsonGeneratorInterface *jsonGenerator, + std::vector> &result) { + RETURN_FALSE_IF_PARSE_ERROR(consume('{')); + + int todo = keys.size(); + + while (peek() != '}' && todo > 0) { + int keyMatchIndex = advanceKeys(keys); + if (keyMatchIndex != -1) { + todo--; + JsonValueType type = + peek() == '"' ? JsonValueType::STRING : JsonValueType::OTHER; + if (peek() == 'n') { + // do not do anything for null + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + } else { + const auto sv = getOne(); + if (error_ != kErrorNone) { + return false; + } + const auto copy_success = + jsonGenerator->copyCurrentStructureJsonTupleCodeGen( + sv, keyMatchIndex, result, type); + if (!copy_success) { + error_ = kParseErrorUnexpect; + return false; + } + } + } + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + + return true; + } + template + inline std::vector> jsonTupleWithCodeGen( + const std::vector &keys, + JsonGeneratorInterface *jsonGenerator, bool legacy) { + std::vector> result(keys.size(), std::nullopt); + const auto success = jsonTupleWithCodeGenImpl(keys, jsonGenerator, result); + + if (!success && !legacy) { + std::vector> all_nulls(keys.size(), + std::nullopt); + return all_nulls; + } + + return result; + } + template + inline bool getJsonPath( + const JsonPath &path, size_t index, + JsonGeneratorInterface *jsonGenerator, + const JsonGeneratorFactory &jsonGeneratorFactory) { + const bool path_is_nil = index >= path.size(); + const auto c = peek(); + const bool is_field_name = getAndClearIsFieldName(); + const bool value_string = !is_field_name && c == '"'; + const bool field_name = c == '"' && is_field_name; + + if (is_field_name && !value_string && !field_name) { + setError(kParseErrorUnexpect); + return 
false; + } + // superhack to guarantee advancement + if (c == 'n' && !path_is_nil) { + // null cannot evaluate + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + return false; + } + if (value_string && !path_is_nil) { + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + return false; + } + + if (value_string && path_is_nil) { + if constexpr (style == RAW) { + const auto sv = getOne(); + if (error_ != kErrorNone) { + return false; + } + if (!jsonGenerator->writeRaw(sv)) { + setError(kParseErrorUnexpect); + return false; + } + return true; + } + } + + if (c == '[' && path_is_nil) { + if constexpr (style == FLATTEN) { + RETURN_FALSE_IF_PARSE_ERROR(consume('[')); + bool dirty = false; + + while (peek() != ']') { + dirty |= getJsonPath( + path, index + 1, jsonGenerator, jsonGeneratorFactory); + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + RETURN_FALSE_IF_PARSE_ERROR(consume(']')); + return dirty; + } + } + + if (path_is_nil) { + if (!jsonGenerator->isBeginArray() && !jsonGenerator->isEmpty()) { + jsonGenerator->writeComma(); + } + + const auto sv = getOne(); + if (error_ != kErrorNone) { + return false; + } + const auto copy_success = jsonGenerator->copyCurrentStructure(sv); + if (!copy_success) { + error_ = kParseErrorUnexpect; + return false; + } + + return true; + } + + if (c == '{' && path[index].is_key()) { + RETURN_FALSE_IF_PARSE_ERROR(consume('{')); + bool dirty = false; + while (peek() != '}') { + if (dirty) { + // Skip children + while (peek() != '}') { + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + RETURN_FALSE_IF_PARSE_ERROR(consume(':')); + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + } else { + // The next "string_value" is a key + setIsFieldName(); + dirty = getJsonPath(path, index, jsonGenerator, + jsonGeneratorFactory); + + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + } + RETURN_FALSE_IF_PARSE_ERROR(consume('}')); + return dirty; + } + + if (c == '[' && index + 1 < path.size() && 
path[index].is_wildcard() && + path[index + 1].is_wildcard()) { + RETURN_FALSE_IF_PARSE_ERROR(consume('[')); + bool dirty = false; + if (!jsonGenerator->isBeginArray() && !jsonGenerator->isEmpty()) { + jsonGenerator->writeComma(); + } + jsonGenerator->writeStartArray(); + while (peek() != ']') { + const auto index_plus_two = index + 2; + dirty |= getJsonPath( + path, index_plus_two, jsonGenerator, jsonGeneratorFactory); + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + jsonGenerator->writeEndArray(); + RETURN_FALSE_IF_PARSE_ERROR(consume(']')); + return dirty; + } + + if (c == '[' && path[index].is_wildcard()) { + if constexpr (style != QUOTE) { + int64_t dirty = 0; + auto constexpr nextStyle = style == RAW ? QUOTE : style; + WriteBuffer wb; + auto localJsonGenerator = jsonGeneratorFactory(wb); + + RETURN_FALSE_IF_PARSE_ERROR(consume('[')); + while ((pos_ < len_) && peek() != ']') { + size_t pos_before = pos_; + dirty += getJsonPath( + path, index + 1, localJsonGenerator.get(), + jsonGeneratorFactory) + ? 1 + : 0; + if (pos_ == pos_before) { + // getJsonPath() must consume at least one value on success/failure. + // If not, skip the current JSON value to avoid infinite loop and + // prevent desync by blindly advancing one byte. 
+ RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + } + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + if (sonic_unlikely(pos_ == len_)) { + setError(SonicError::kParseErrorEof); + return false; + } + RETURN_FALSE_IF_PARSE_ERROR(consume(']')); + if (dirty > 1) { + if (!jsonGenerator->isBeginArray() && !jsonGenerator->isEmpty()) { + jsonGenerator->writeComma(); + } + jsonGenerator->writeStartArray(); + // should always use explicit `Size`, because there maybe '\0' in the + // wb + jsonGenerator->writeRawValue(wb.ToStringView()); + jsonGenerator->writeEndArray(); + } else if (dirty == 1) { + jsonGenerator->writeRawValue(wb.ToStringView()); + } + + return dirty > 0; + } + } + + if (c == '[' && path[index].is_wildcard()) { + bool dirty = false; + if (!jsonGenerator->isBeginArray() && !jsonGenerator->isEmpty()) { + jsonGenerator->writeComma(); + } + jsonGenerator->writeStartArray(); + RETURN_FALSE_IF_PARSE_ERROR(consume('[')); + while (peek() != ']') { + const auto index_plus_one = index + 1; + + dirty |= getJsonPath( + path, index_plus_one, jsonGenerator, jsonGeneratorFactory); + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + RETURN_FALSE_IF_PARSE_ERROR(consume(']')); + jsonGenerator->writeEndArray(); + return dirty; + } + + if (c == '[' && path[index].is_index()) { + const auto array_index = path[index].index(); + + const bool path_has_two_more = index + 2 < path.size(); + if (path_has_two_more && path[index + 1].is_wildcard()) { + return getJsonPathArrayIndex( + path, index, jsonGenerator, jsonGeneratorFactory, array_index); + } + + return getJsonPathArrayIndex( + path, index, jsonGenerator, jsonGeneratorFactory, array_index); + } + + if (field_name && path[index].is_key()) { + const bool found = advanceKey(path[index].key()); + if (error_ != kErrorNone) { + return false; + } + + if (found) { + // if not null + if (peek() != 'n') { + return getJsonPath( + path, index + 1, jsonGenerator, jsonGeneratorFactory); + } else { + // skip null + 
RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + return false; + } + } + return false; + } + + if (field_name && path[index].is_wildcard()) { + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + RETURN_FALSE_IF_PARSE_ERROR(consume(':')); + return getJsonPath(path, index + 1, jsonGenerator, + jsonGeneratorFactory); + } + + if (c == '{' || c == '[') { + if (c == '{') { + RETURN_FALSE_IF_PARSE_ERROR(consume('{')); + while (peek() != '}') { + // SkipString returns a status (0/1/2) but doesn't set error_. + // Don't use RETURN_FALSE_IF_PARSE_ERROR here. + if (!SkipString(data_, pos_, len_)) { + setError(SonicError::kParseErrorInvalidChar); + return false; + } + RETURN_FALSE_IF_PARSE_ERROR(consume(':')); + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + } else if (c == '[') { + RETURN_FALSE_IF_PARSE_ERROR(consume('[')); + while (peek() != ']') { + RETURN_FALSE_IF_PARSE_ERROR(skipOne()); + RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); + } + } + } + return false; + } + // Appends to `res` the raw text of every value selected by path[index..]. + // With `complete`, mismatches are skipped instead of reported as errors. + inline SonicError getJsonPath(const JsonPath &path, size_t index, + std::vector &res, + bool complete = false) { + if (index >= path.size()) { + res.push_back(getOne()); + return error_; + } + + auto c = advance(); + if (path[index].is_wildcard()) { + if (c == '{') { + return traverseObject(path, index, res); + } else if (c == '[') { + return traverseArray(path, index, res); + } else { + // a wildcard does nothing when it meets a primitive value + return kErrorNone; + } + } + + if (path[index].is_key()) { + if (c != '{') { + if (complete) { + pos_--; + skipOne(); + } else { + setError(SonicError::kUnmatchedTypeInJsonPath); + } + return error_; + } + + bool found = advanceKey(path[index].key()); + if (hasError()) { + return error_; + } + + if (!found) { + return complete ? 
kErrorNone : kParseErrorUnknownObjKey; + } + + error_ = getJsonPath(path, index + 1, res, complete); + if (hasError()) { + return error_; + } + return complete ? skipObjectRemain() : kErrorNone; + } + + if (path[index].is_index()) { + if (c != '[') { + if (complete) { + pos_--; + skipOne(); + } else { + setError(SonicError::kUnmatchedTypeInJsonPath); + } + return error_; + } + + // index maybe negative + int64_t idx = path[index].index(); + if (idx < 0) { + setError(SonicError::kUnsupportedJsonPath); + return error_; + } + + bool found = advanceIndex(path[index].index()); + if (hasError()) { + return error_; + } + + if (found) { + error_ = getJsonPath(path, index + 1, res, complete); + if (!hasError() && complete) { + return skipArrayRemain(); + } + return error_; + } + + // not found the index + if (!complete) { + return kParseErrorArrIndexOutOfRange; + } + return kErrorNone; + } + return kUnsupportedJsonPath; + } + + public: + SkipScanner scanner_; + const uint8_t *data_ = nullptr; + size_t pos_ = 0; + size_t len_ = 0; + SonicError error_ = SonicError::kErrorNone; + std::vector kbuf_ = {}; + bool isFieldName = false; +}; } // namespace internal } // namespace sonic_json diff --git a/include/sonic/internal/arch/sse/unicode.h b/include/sonic/internal/arch/sse/unicode.h index 38f10d57..01bd923a 100644 --- a/include/sonic/internal/arch/sse/unicode.h +++ b/include/sonic/internal/arch/sse/unicode.h @@ -38,8 +38,15 @@ using sonic_json::internal::common::handle_unicode_codepoint; struct StringBlock { public: sonic_force_inline static StringBlock Find(const uint8_t *src); + template sonic_force_inline bool HasQuoteFirst() { - return (((bs_bits - 1) & quote_bits) != 0) && !HasUnescaped(); + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; + if constexpr (kAllowUnescapedControlChars) { + return (((bs_bits - 1) & quote_bits) != 0); + } else { + return (((bs_bits - 1) & quote_bits) != 0) && (!HasUnescaped()); + } 
} sonic_force_inline bool HasBackslash() { return ((quote_bits - 1) & bs_bits) != 0; @@ -49,9 +56,6 @@ struct StringBlock { } sonic_force_inline int QuoteIndex() { return TrailingZeroes(quote_bits); } sonic_force_inline int BsIndex() { return TrailingZeroes(bs_bits); } - sonic_force_inline int UnescapedIndex() { - return TrailingZeroes(unescaped_bits); - } uint32_t bs_bits; uint32_t quote_bits; diff --git a/include/sonic/internal/arch/sve2-128/quote.h b/include/sonic/internal/arch/sve2-128/quote.h index 3eae2c0f..807b69cc 100644 --- a/include/sonic/internal/arch/sve2-128/quote.h +++ b/include/sonic/internal/arch/sve2-128/quote.h @@ -28,23 +28,29 @@ namespace sve2_128 { using sonic_json::internal::arm_common::Quote; +template sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { -#define SONIC_REPEAT8(v) {v v v v v v v v} +#define SONIC_REPEAT8(v) \ + { v v v v v v v v } + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; uint8_t *dst = src; uint8_t *sdst = src; while (1) { find: auto block = StringBlock::Find(src); - if (block.HasQuoteFirst()) { + if (block.HasQuoteFirst()) { int idx = block.QuoteIndex(); src += idx; *src++ = '\0'; return src - sdst - 1; } - if (block.HasUnescaped()) { - err = kParseErrorUnEscaped; - return 0; + if constexpr (!kAllowUnescapedControlChars) { + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } } if (!block.HasBackslash()) { src += VEC_LEN; @@ -71,7 +77,7 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { src += 2; dst += 1; } - // fast path for continous escaped chars + // fast path for continuous escaped chars if (*src == '\\') { bs_dist = 0; goto cont; @@ -79,10 +85,10 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { find_and_move: // Copy the next n bytes, and find the backslash and quote in them. 
- uint8x16_t v = vld1q_u8(src); + svuint8x16_t v = svld1_u8(svptrue_b8(), src); block = StringBlock::Find(v); // If the next thing is the end quote, copy and return - if (block.HasQuoteFirst()) { + if (block.HasQuoteFirst()) { // we encountered quotes first. Move dst to point to quotes and exit while (1) { SONIC_REPEAT8(if (sonic_unlikely(*src == '"')) break; @@ -92,14 +98,16 @@ sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { src++; return dst - sdst; } - if (block.HasUnescaped()) { - err = kParseErrorUnEscaped; - return 0; + if constexpr (!kAllowUnescapedControlChars) { + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } } if (!block.HasBackslash()) { /* they are the same. Since they can't co-occur, it means we * encountered neither. */ - vst1q_u8(dst, v); + svst1_u8(svptrue_b8(), dst, v); src += VEC_LEN; dst += VEC_LEN; goto find_and_move; diff --git a/include/sonic/internal/arch/sve2-128/unicode.h b/include/sonic/internal/arch/sve2-128/unicode.h index 1196c834..6f1a6cc9 100644 --- a/include/sonic/internal/arch/sve2-128/unicode.h +++ b/include/sonic/internal/arch/sve2-128/unicode.h @@ -26,6 +26,7 @@ #include "../common/unicode_common.h" #include "base.h" #include "simd.h" +#include "sonic/dom/flags.h" namespace sonic_json { namespace internal { @@ -36,10 +37,17 @@ using sonic_json::internal::common::handle_unicode_codepoint; struct StringBlock { public: sonic_force_inline static StringBlock Find(const uint8_t* src); - sonic_force_inline static StringBlock Find(uint8x16_t& v); + sonic_force_inline static StringBlock Find(const svuint8x16_t& v); // has quote, and no backslash or unescaped before it + template sonic_force_inline bool HasQuoteFirst() const { - return (bs_index > quote_index) && !HasUnescaped(); + constexpr bool kAllowUnescapedControlChars = + (parseFlags & ParseFlags::kParseAllowUnescapedControlChars) != 0; + if constexpr (kAllowUnescapedControlChars) { + return (bs_index > quote_index); + } else 
{ + return (bs_index > quote_index) && (!HasUnescaped()); + } } // has backslash, and no quote before it sonic_force_inline bool HasBackslash() const { @@ -89,11 +97,7 @@ sonic_force_inline unsigned LocateTokenLe(const svuint8x16_t v, char token) { sonic_force_inline StringBlock StringBlock::Find(const uint8_t* src) { svuint8x16_t v = svld1(svptrue_b8(), src); - return { - LocateToken(v, '\\'), - LocateToken(v, '"'), - LocateTokenLe(v, '\x1f'), - }; + return Find(v); } sonic_force_inline unsigned FirstIndexFromToBitmask(uint64_t bits) { @@ -112,7 +116,7 @@ sonic_force_inline unsigned LocateTokenLe(const uint8x16_t v, char token) { to_bitmask(vcleq_u8(v, vdupq_n_u8(static_cast(token))))); } -sonic_force_inline StringBlock StringBlock::Find(uint8x16_t& v) { +sonic_force_inline StringBlock StringBlock::Find(const svuint8x16_t& v) { return { LocateToken(v, '\\'), LocateToken(v, '"'), diff --git a/include/sonic/internal/arch/x86_ifuncs/quote.h b/include/sonic/internal/arch/x86_ifuncs/quote.h index 03f213f5..5341c2dd 100644 --- a/include/sonic/internal/arch/x86_ifuncs/quote.h +++ b/include/sonic/internal/arch/x86_ifuncs/quote.h @@ -19,42 +19,96 @@ #include #include "../avx2/quote.h" +#include "../common/quote_common.h" #include "../sse/quote.h" namespace sonic_json { namespace internal { -__attribute__((target("default"))) inline size_t parseStringInplace( - uint8_t *&, SonicError &) { - // TODO static_assert(!!!"Not Implemented!"); - return 0; -} -__attribute__((target("default"))) inline char *Quote(const char *, size_t, - char *) { - // TODO static_assert(!!!"Not Implemented!"); - return 0; +inline bool CpuSupportsHaswell() { +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) +#if defined(__GNUC__) || defined(__clang__) + __builtin_cpu_init(); + return __builtin_cpu_supports("avx2"); +#else + // MSVC does not support __builtin_cpu_supports. 
+ return false; +#endif +#else + return false; +#endif } -__attribute__((target(SONIC_WESTMERE))) inline size_t parseStringInplace( - uint8_t *&src, SonicError &err) { - return sse::parseStringInplace(src, err); +inline bool CpuSupportsWestmere() { +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) +#if defined(__GNUC__) || defined(__clang__) + __builtin_cpu_init(); + return __builtin_cpu_supports("sse4.2") && __builtin_cpu_supports("pclmul"); +#else + // MSVC does not support __builtin_cpu_supports. + return false; +#endif +#else + return false; +#endif } -__attribute__((target(SONIC_WESTMERE))) inline char *Quote(const char *src, - size_t nb, - char *dst) { - return sse::Quote(src, nb, dst); -} +template +struct ParseStringDispatcher { + using FuncType = size_t (*)(uint8_t *&, SonicError &); + + static size_t FallbackImpl(uint8_t *&src, SonicError &err) { + return common::parseStringInplace(src, err); + } + + static FuncType Resolve() { + if (CpuSupportsHaswell()) { + return avx2::parseStringInplace; + } + if (CpuSupportsWestmere()) { + return sse::parseStringInplace; + } + return FallbackImpl; + } + + static FuncType &Func() { + static FuncType func = Resolve(); + return func; + } +}; + +template +struct QuoteDispatcher { + using FuncType = char *(*)(const char *src, size_t nb, char *dst); + + static char *FallbackImpl(const char *src, size_t nb, char *dst) { + return common::Quote(src, nb, dst); + } + + static FuncType Resolve() { + if (CpuSupportsHaswell()) { + return avx2::Quote; + } + if (CpuSupportsWestmere()) { + return sse::Quote; + } + return FallbackImpl; + } + + static FuncType &Func() { + static FuncType func = Resolve(); + return func; + } +}; -__attribute__((target(SONIC_HASWELL))) inline size_t parseStringInplace( - uint8_t *&src, SonicError &err) { - return avx2::parseStringInplace(src, err); +template +inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { + return ParseStringDispatcher::Func()(src, err); } 
-__attribute__((target(SONIC_HASWELL))) inline char *Quote(const char *src, - size_t nb, - char *dst) { - return avx2::Quote(src, nb, dst); +template +inline char *Quote(const char *src, size_t nb, char *dst) { + return QuoteDispatcher::Func()(src, nb, dst); } } // namespace internal diff --git a/include/sonic/internal/ftoa.h b/include/sonic/internal/ftoa.h index 90114d9c..8a30abf8 100644 --- a/include/sonic/internal/ftoa.h +++ b/include/sonic/internal/ftoa.h @@ -28,6 +28,7 @@ #include +#include "sonic/dom/flags.h" #include "sonic/internal/itoa.h" #include "sonic/macro.h" @@ -766,7 +767,7 @@ static sonic_force_inline unsigned Ctz10(const uint64_t v) { return 10; } -// FormatSignificand trimed the trailing zeros. +// Format Significand trimmed the trailing zeros. static sonic_force_inline char* FormatSignificand(uint64_t sig, char* out, int cnt) { char* p = out + cnt; @@ -824,28 +825,40 @@ static sonic_force_inline bool IsDivPow2(uint64_t val, int32_t e) { return (val & mask) == 0; } +template static sonic_force_inline char* FormatExponent(F64Decimal v, char* out, unsigned cnt) { char* p = out + 1; char* end = FormatSignificand(v.sig, p, cnt); while (*(end - 1) == '0') end--; - - /* print decimal point if needed */ *out = *p; - if (end - p > 1) { + + if constexpr (serializeFlags & SerializeFlags::kSerializeFloatFormatJava) { *p = '.'; + if ((end - p) <= 1) { + *(++p) = '0'; + end = p + 1; + } + *end++ = 'E'; } else { - end--; + /* print decimal point if needed */ + if (end - p > 1) { + *p = '.'; + } else { + end--; + } + *end++ = 'e'; } - /* print the exponent */ - *end++ = 'e'; int32_t exp = v.exp + (int32_t)cnt - 1; if (exp < 0) { *end++ = '-'; exp = -exp; } else { - *end++ = '+'; + if constexpr (!(serializeFlags & + SerializeFlags::kSerializeFloatFormatJava)) { + *end++ = '+'; + } } if (exp >= 100) { @@ -984,6 +997,7 @@ static sonic_force_inline F64Decimal F64ToDecimal(uint64_t rsig, int32_t rexp, return dec; } +template sonic_static_noinline int F64toa(char* 
out, double fp) { char* p = out; uint64_t raw = F64ToRaw(fp); @@ -1017,14 +1031,16 @@ sonic_static_noinline int F64toa(char* out, double fp) { /* double is normal */ c = rsig | F64_HIDDEN_BIT; q = rexp - F64_EXP_BIAS - F64_SIG_BITS; - - /* fast path for integer */ - if (q <= 0 && q >= -F64_SIG_BITS && IsDivPow2(c, -q)) { - uint64_t u = c >> -q; - p = U64toa(p, u); - *p++ = '.'; - *p++ = '0'; - return p - out; + if constexpr (!(serializeFlags & + SerializeFlags::kSerializeFloatFormatJava)) { + /* fast path for integer */ + if (q <= 0 && q >= -F64_SIG_BITS && IsDivPow2(c, -q)) { + uint64_t u = c >> -q; + p = U64toa(p, u); + *p++ = '.'; + *p++ = '0'; + return p - out; + } } } else { @@ -1035,15 +1051,27 @@ sonic_static_noinline int F64toa(char* out, double fp) { F64Decimal dec = F64ToDecimal(rsig, rexp, c, q); int cnt = Ctz10(dec.sig); int dot = cnt + dec.exp; - int sci_exp = dot - 1; - bool exp_fmt = sci_exp < -6 || sci_exp > 20; - bool has_dot = dot < cnt; - + int sci_exp = 0; + bool exp_fmt = false; + if constexpr (serializeFlags & SerializeFlags::kSerializeFloatFormatJava) { + /* + * Floating point values in the range 1.0E-3 <= x < 1.0E7 have to be printed + * without exponent. This test checks the values at those boundaries. 
+ * reference from + * https://github.com/FasterXML/jackson-core/blob/511704247fe020f81b8b37303d3c8acffab6aa0b/src/main/java/com/fasterxml/jackson/core/io/schubfach/DoubleToDecimal.java#L500 + * + */ + sci_exp = cnt - 1 + dec.exp; + exp_fmt = !(sci_exp >= -3 && sci_exp < 7); + } else { + sci_exp = dot - 1; + exp_fmt = sci_exp < -6 || sci_exp > 20; + } if (exp_fmt) { - return FormatExponent(dec, p, cnt) - out; + return FormatExponent(dec, p, cnt) - out; } - if (has_dot) { + if (dec.exp < 0) { return FormatDecimal(dec, p, cnt) - out; } diff --git a/include/sonic/jsonpath/dom.h b/include/sonic/jsonpath/dom.h new file mode 100644 index 00000000..f9f934fc --- /dev/null +++ b/include/sonic/jsonpath/dom.h @@ -0,0 +1,90 @@ + +#pragma once + +#include "sonic/dom/generic_document.h" +#include "sonic/jsonpath/dump.h" + +namespace sonic_json { + +sonic_force_inline std::tuple GetByJsonPathInternal( + Document& dom, StringView jsonpath) { + // get the nodes + auto result = dom.AtJsonPath(jsonpath); + if (result.error != kErrorNone) { + return std::make_tuple("", result.error); + } + + // filter the null nodes + result.nodes.erase( + std::remove_if(result.nodes.begin(), result.nodes.end(), + [](const auto& node) { return node->IsNull(); }), + result.nodes.end()); + + if (result.nodes.empty()) { + return std::make_tuple("", result.error); + } + + WriteBuffer wb; + if (result.nodes.size() == 1) { + // not serialize the single string + auto& root = result.nodes[0]; + if (root->IsString()) { + wb.Push(root->GetStringView().data(), root->Size()); + } else { + auto err = + result.nodes[0] + ->template Serialize(wb); + if (err != kErrorNone) { + return std::make_tuple("", err); + } + } + } else { + wb.Push('['); + for (const auto& node : result.nodes) { + auto err = + node->template Serialize(wb); + if (err != kErrorNone) { + return std::make_tuple("", err); + } + wb.Push(','); + } + if (*(wb.Top()) == ',') { + wb.Pop(1); + } + wb.Push(']'); + } + auto sv = wb.ToStringView(); + return 
std::make_tuple(std::string(sv.data(), sv.size()), kErrorNone); +} + +sonic_force_inline std::tuple GetByJsonPath( + StringView json, StringView jsonpath) { + // parse json into dom + Document dom; + dom.Parse(json); + if (dom.HasParseError()) { + return std::make_tuple("", dom.GetParseError()); + } + return GetByJsonPathInternal(dom, jsonpath); +} + +sonic_force_inline + std::tuple>, SonicError> + GetByJsonPaths(StringView json, const std::vector& jsonpaths) { + // parse json into dom + Document dom; + dom.Parse(json); + if (dom.HasParseError()) { + return std::make_tuple(std::vector>(), + dom.GetParseError()); + } + std::vector> results; + results.reserve(jsonpaths.size()); + + for (const auto& jsonpath : jsonpaths) { + results.emplace_back(GetByJsonPathInternal(dom, jsonpath)); + } + return std::make_tuple(results, kErrorNone); +} +} // namespace sonic_json diff --git a/include/sonic/jsonpath/dump.h b/include/sonic/jsonpath/dump.h new file mode 100644 index 00000000..d3adfe46 --- /dev/null +++ b/include/sonic/jsonpath/dump.h @@ -0,0 +1,61 @@ + +#pragma once + +#include +#include +#include + +#include "sonic/dom/generic_document.h" + +namespace sonic_json { + +namespace internal { +template +sonic_force_inline std::tuple Serialize( + JsonPathResult& result) { + // filter the null nodes + result.nodes.erase( + std::remove_if(result.nodes.begin(), result.nodes.end(), + [](const auto& node) { return node->IsNull(); }), + result.nodes.end()); + + if (result.nodes.empty()) { + return std::make_tuple("null", kErrorNone); + } + + WriteBuffer wb; + if (result.nodes.size() == 1) { + // not serialize the single string + auto& root = result.nodes[0]; + if (root->IsString()) { + wb.Push(root->GetStringView().data(), root->Size()); + } else { + auto err = + result.nodes[0] + ->template Serialize(wb); + if (err != kErrorNone) { + return std::make_tuple("", err); + } + } + } else { + wb.Push('['); + for (const auto& node : result.nodes) { + auto err = + node->template 
Serialize(wb); + if (err != kErrorNone) { + return std::make_tuple("", err); + } + wb.Push(','); + } + if (*(wb.Top()) == ',') { + wb.Pop(1); + } + wb.Push(']'); + } + auto sv = wb.ToStringView(); + return std::make_tuple(std::string(sv.data(), sv.size()), kErrorNone); +} +} // namespace internal + +} // namespace sonic_json diff --git a/include/sonic/jsonpath/jsonpath.h b/include/sonic/jsonpath/jsonpath.h new file mode 100644 index 00000000..4f6a6519 --- /dev/null +++ b/include/sonic/jsonpath/jsonpath.h @@ -0,0 +1,411 @@ + + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sonic/internal/arch/common/unicode_common.h" +#include "sonic/string_view.h" + +namespace sonic_json { + +namespace internal { +static constexpr char NONE = '\0'; +static constexpr char WILDCARD = '*'; +static constexpr char ROOT = '$'; +static constexpr char IS_KEY = '\x01'; +static constexpr char IS_INDEX = '\x02'; +static constexpr char KEY_OR_INDEX = '\x03'; + +class JsonPathNode { + public: + JsonPathNode() noexcept = default; + JsonPathNode(int64_t index) noexcept : index_(index), token_(IS_INDEX) {} + JsonPathNode(StringView key) noexcept : key_(key), token_(IS_KEY) {} + JsonPathNode(StringView key, int64_t index) noexcept + : index_(index), key_(key), token_(KEY_OR_INDEX) {} + JsonPathNode(char token) noexcept : token_(token) {} + ~JsonPathNode() = default; + + public: + bool is_wildcard() const noexcept { return token_ == WILDCARD; } + + bool is_key() const noexcept { + return token_ == IS_KEY || token_ == KEY_OR_INDEX; + } + + bool is_index() const noexcept { + return token_ == IS_INDEX || token_ == KEY_OR_INDEX; + } + + bool is_none() const noexcept { return token_ == NONE; } + + bool is_root() const noexcept { return token_ == ROOT; } + + StringView key() const noexcept { + sonic_assert(is_key()); + return key_; + } + + int64_t index() const noexcept { + sonic_assert(is_index()); + return index_; + } + + char token() const 
noexcept { + sonic_assert(!is_key() && !is_index() && !is_none()); + return token_; + } + + std::string to_string() const { + std::stringstream ss; + + switch (token_) { + case NONE: + ss << "NONE(\\0)"; + break; + + case WILDCARD: + ss << "WILDCARD(*)"; + break; + + case ROOT: + ss << "ROOT($)"; + break; + + case IS_KEY: + ss << "KEY(\""; + for (char c : key_) { + if (std::isprint(static_cast(c))) { + ss << c; + } else { + ss << "\\x" << std::hex << std::uppercase + << static_cast(static_cast(c)); + } + } + ss << "\")"; + break; + + case IS_INDEX: + ss << "INDEX(" << index_ << ")"; + break; + + case KEY_OR_INDEX: + ss << "KEY_OR_INDEX(\""; + for (char c : key_) { + if (std::isprint(static_cast(c))) { + ss << c; + } else { + ss << "\\x" << std::hex << std::uppercase + << static_cast(static_cast(c)); + } + } + ss << "\", " << index_ << ")"; + break; + + default: + // Handle other special-character tokens + if (std::isprint(static_cast(token_))) { + ss << "TOKEN(" << token_ << ")"; + } else { + ss << "TOKEN(\\x" << std::hex << std::uppercase + << static_cast(static_cast(token_)) << ")"; + } + break; + } + + return ss.str(); + } + + private: + int64_t index_ = 0; + StringView key_ = ""; + // record special tokens, also distinguish key and index + char token_ = '\0'; +}; + +// to parse escaped chars inplace +sonic_force_inline std::string paddingJsonPath(StringView path) { + // Keep the extra '\0' bytes *within* the string size so that + // internal::common::unescape_with_padding() can safely read past the logical + // end of the path without triggering out-of-bounds reads. + std::string padded(path.data(), path.size()); + padded.append(8, '\0'); + return padded; +} + +/** + * Represent a JSON path. RFC is https://datatracker.ietf.org/doc/rfc9535/. + * NOTE: descendant, slice, filter and current node not support. 
+ */ +class JsonPath : public std::vector { + private: + // Keep a writable, padded copy of the input path so that: + // - parseQuotedName() can do in-place unescaping safely + // - unescape_with_padding() can read past logical end without OOB + std::string padded_; + + // Parse using a caller-provided padded, writable buffer. + // The caller must ensure `padded.data()` points to a buffer that has at least + // 8 extra '\0' bytes after `logical_len`. + // The caller must keep the buffer alive while the parsed JsonPath is used. + sonic_force_inline bool ParsePaddedInternal(StringView padded, + size_t logical_len) noexcept { + StringView p(padded.data(), logical_len); + + if (p.empty() || p[0] != '$') { + return false; + } + + this->emplace_back(JsonPathNode('$')); + size_t i = 1; + bool valid = false; + JsonPathNode node; + while (i < p.size()) { + valid = false; + + if (i + 2 < p.size() && p[i] == '.' && p[i + 1] == '.') { + return false; + } + + if (p[i] == '.') { + if (i + 1 >= p.size()) { + return false; + } + + i++; + if (p[i] == '*') { + this->emplace_back(JsonPathNode(WILDCARD)); + i++; + continue; + } + valid = parseUnquotedKey(p, i, node); + } else if (p[i] == '[') { + if (i + 1 >= p.size()) { + return false; + } + + i++; + if (p[i] == '*') { + if (i + 1 < p.size() && p[i + 1] == ']') { + this->emplace_back(JsonPathNode(WILDCARD)); + i += 2; + continue; + } + return false; + } + + if (p[i] == '\'' || p[i] == '"') { + valid = parseQuotedName(p, i, node); + } else if ((p[i] >= '0' && p[i] <= '9') || p[i] == '-') { + valid = parseBracktedIndex(p, i, node); + } else { + // Unsupported bracket expression (e.g. unquoted name). + valid = false; + } + } else { + // Unknown token, prevent infinite loop / stale `valid` reuse. 
+ return false; + } + + if (!valid) { + this->clear(); + return false; + } + + this->emplace_back(node); + } + return true; + } + + sonic_force_inline bool parseNumber(StringView path, size_t& index, + uint64_t& sum) { + size_t start = index; + // check leading zero + if (index < path.size() && path[index] == '0') { + index++; + return true; + } + + while (index < path.size() && path[index] >= '0' && path[index] <= '9') { + auto last = sum * 10 + (path[index] - '0'); + // check overflow + if (last < sum) { + return false; + } + sum = last; + index++; + } + + return (sum <= INT64_MAX) && index != start; + } + + // case as .abc + sonic_force_inline bool parseUnquotedKey(StringView path, size_t& index, + JsonPathNode& node) { + size_t start = index; + while (index < path.size() && path[index] != '.' && path[index] != '[') { + index++; + } + size_t len = index - start; + if (len == 0) { + return false; + } + + node = JsonPathNode(path.substr(index - len, len)); + return true; + } + + // case as [123] or [-123] + sonic_force_inline bool parseBracktedIndex(StringView path, size_t& index, + JsonPathNode& node) { + uint64_t sum = 0; + int sign = 1; + + // check negative + if (index < path.size() && path[index] == '-') { + index++; + sign = -1; + } + + if (!parseNumber(path, index, sum)) { + return false; + } + + // match ']' + if (index >= path.size() || path[index] != ']') { + return false; + } + index++; + node = JsonPathNode(int64_t(sum) * sign); + return true; + } + + // case as ['abc'] or ["abc"] + sonic_force_inline bool parseQuotedName(StringView path, size_t& index, + JsonPathNode& node) { + if (index >= path.size()) { + return false; + } + + if (path[index] != '\'' && path[index] != '"') { + return false; + } + + const char* base = path.data(); + const size_t n = path.size(); + + char quote = base[index++]; + size_t start = index; + if (start >= n) { + return false; + } + + char* dst = const_cast(base + start); + const char* src = base + start; + const char* end = 
base + n; + size_t len = 0; + // normalized path + if (quote == '\"') { + while (src < end && *src != quote) { + if (*src == '\\') { + if (internal::common::unescape_with_padding( + reinterpret_cast(&src), + reinterpret_cast(&dst)) == 0) { + return false; + } + } else { + *dst++ = *src++; + } + } + if (src >= end) { + return false; + } + len = static_cast(dst - (base + start)); + } else { + while (src < end && *src != quote) { + src++; + } + if (src >= end) { + return false; + } + len = static_cast(src - (base + start)); + } + + const size_t quote_pos = static_cast(src - base); + node = JsonPathNode(path.substr(start, len)); + // Expect closing quote then ']'. + if (start == quote_pos || quote_pos + 1 >= n || + base[quote_pos + 1] != ']') { + return false; + } + + index = quote_pos + 2; + + return true; + } + + // case as [abc] + sonic_force_inline bool parseBrackedUnquotedKey(StringView path, + size_t& index, + JsonPathNode& node) { + size_t start = index; + while (index < path.size() && path[index] != ']') { + index++; + } + if (start == index) { + return false; + } + node = JsonPathNode(path.substr(start, index - start)); + index++; + return true; + } + + sonic_force_inline bool parseWildcard(StringView path, size_t& index, + JsonPathNode& node) { + if (index + 1 < path.size() && path[index] == '*' && + path[index + 1] == ']') { + node = JsonPathNode('*'); + index += 2; + return true; + } + return false; + } + + public: + // Parse with a padded, writable buffer (avoids extra copy). + // See ParsePaddedInternal() for lifetime and padding requirements. 
+ sonic_force_inline bool ParsePadded(StringView padded, + size_t logical_len) noexcept { + this->clear(); + padded_.clear(); + return ParsePaddedInternal(padded, logical_len); + } + + sonic_force_inline bool Parse(StringView path) noexcept { + this->clear(); + padded_ = paddingJsonPath(path); + return ParsePaddedInternal(StringView(padded_.data(), padded_.size()), + path.size()); + } + + std::string to_string() const { + std::stringstream ss; + ss << "["; + for (const auto& node : *this) { + ss << node.to_string() << ", "; + } + ss << "]"; + return ss.str(); + } +}; + +} // namespace internal +} // namespace sonic_json diff --git a/include/sonic/jsonpath/ondemand.h b/include/sonic/jsonpath/ondemand.h new file mode 100644 index 00000000..4724b43d --- /dev/null +++ b/include/sonic/jsonpath/ondemand.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "sonic/dom/generic_document.h" +#include "sonic/dom/parser.h" +#include "sonic/jsonpath/dump.h" +#include "sonic/jsonpath/jsonpath.h" + +namespace sonic_json { + +struct JsonPathRawResult { + std::vector raw; + SonicError error; +}; +template +class JsonGenerator + : public internal::SkipScanner2::JsonGeneratorInterface { + public: + JsonGenerator(Document& dom_doc, WriteBuffer& wb) + : dom_doc_(dom_doc), wb_(wb) {} + bool writeRaw(StringView raw) override { + dom_doc_.template Parse(raw); + auto n = &dom_doc_; + // check parse error + if (dom_doc_.HasParseError()) { + return false; + } + wb_.PushStr(n->GetStringView()); + + return true; + } + bool writeComma() override { + wb_.Push(','); + return true; + } + bool isEmpty() override { return wb_.Empty(); } + bool writeStartArray() override { + wb_.Push('['); + return true; + } + bool isBeginArray() override { + return !wb_.Empty() && *(wb_.Top()) == '['; + } + bool writeEndArray() override { + wb_.Push(']'); + return true; + } + bool copyCurrentStructure(StringView raw) override { + dom_doc_.template Parse(raw); + // check parse error + if (dom_doc_.HasParseError()) { + return false; + } + auto n = &dom_doc_; + auto err = n->template Serialize(wb_); + if (sonic_unlikely(err != kErrorNone)) { + return false; + } + + return true; + } + bool copyCurrentStructureJsonTupleCodeGen( + StringView raw, size_t index, + std::vector>& result, + internal::SkipScanner2::JsonValueType type) override { + wb_.Clear(); + dom_doc_.template Parse(raw); + // check parse error + if (dom_doc_.HasParseError()) { + return false; + } + auto n = &dom_doc_; + + if (type == internal::SkipScanner2::JsonValueType::STRING) { + // strip the quotes + wb_.PushStr(n->GetStringView()); + result[index] = std::string(wb_.ToStringView()); + return true; + } + + auto err = n->template Serialize(wb_); + if (sonic_unlikely(err != kErrorNone)) { + return false; + } + + result[index] = 
std::string(wb_.ToStringView()); + return true; + } + bool writeRawValue(StringView sv) override { + this->wb_.PushStr(sv); + return true; + } + ~JsonGenerator() override = default; + + private: + Document& dom_doc_; + WriteBuffer& wb_; +}; + +template +sonic_force_inline std::tuple GetByJsonPathOnDemand( + StringView json, StringView jsonpath) { + internal::SkipScanner2 scan; + + scan.data_ = reinterpret_cast(json.data()); + scan.len_ = json.size(); + internal::JsonPath path; + + // padding some buffers + std::string pathpadd = internal::paddingJsonPath(jsonpath); + // Only parse the logical jsonpath length; the extra '\0' bytes are for safe + // lookahead during unescaping. + if (!path.ParsePadded(StringView(pathpadd.data(), pathpadd.size()), + jsonpath.size())) { + return std::make_tuple("", kUnsupportedJsonPath); + } + + Document dom_doc; + WriteBuffer wb; + + const internal::SkipScanner2::JsonGeneratorFactory + jsonGeneratorFactory = [&](WriteBuffer& local_wb) { + std::shared_ptr< + internal::SkipScanner2::JsonGeneratorInterface> + local_ret = std::make_shared>( + dom_doc, local_wb); + return local_ret; + }; + + const bool matched = + scan.getJsonPath( + path, 1, jsonGeneratorFactory(wb).get(), jsonGeneratorFactory); + if (matched) { + return std::make_tuple(std::string(wb.ToStringView()), kErrorNone); + } + // if no match, it could be because valid json, just no path. + if (!scan.hasError()) { + return std::make_tuple("", kErrorNoneNoMatch); + } + // or parse error caused premature path match termination, hence no match. + // In this case, return whatever that's been written to buffer. 
+ return std::make_tuple(std::string(wb.ToStringView()), scan.error_); +} + +template +sonic_force_inline std::vector> JsonTupleWithCodeGen( + StringView json, const std::vector& keys, const bool legacy) { + internal::SkipScanner2 scan; + + scan.data_ = reinterpret_cast(json.data()); + scan.len_ = json.size(); + + Document dom_doc; + WriteBuffer wb; + + const internal::SkipScanner2::JsonGeneratorFactory + jsonGeneratorFactory = [&](WriteBuffer& local_wb) { + std::shared_ptr< + internal::SkipScanner2::JsonGeneratorInterface> + local_ret = std::make_shared>( + dom_doc, local_wb); + return local_ret; + }; + + return scan.jsonTupleWithCodeGen(keys, jsonGeneratorFactory(wb).get(), + legacy); +} + +} // namespace sonic_json diff --git a/include/sonic/sonic.h b/include/sonic/sonic.h index d399d1ce..aeb803c1 100644 --- a/include/sonic/sonic.h +++ b/include/sonic/sonic.h @@ -18,6 +18,8 @@ #include "sonic/dom/dynamicnode.h" #include "sonic/dom/generic_document.h" +#include "sonic/jsonpath/dom.h" +#include "sonic/jsonpath/ondemand.h" #define SONIC_MAJOR_VERSION 1 #define SONIC_MINOR_VERSION 0 diff --git a/tests/allocator_test.cpp b/tests/allocator_test.cpp index d7e65356..71740adb 100644 --- a/tests/allocator_test.cpp +++ b/tests/allocator_test.cpp @@ -44,4 +44,50 @@ TEST(Allocator, Free) { MEMSTAT_ISEMPTY(); } +TEST(Allocator, SimpleAllocatorEdgeCases) { + SimpleAllocator a; + + // Malloc(0) should return nullptr. + EXPECT_EQ(a.Malloc(0), nullptr); + + // Realloc(..., new_size=0) should free and return nullptr. 
+ void *ptr = a.Malloc(8); + ASSERT_NE(ptr, nullptr); + EXPECT_EQ(a.Realloc(ptr, 8, 0), nullptr); +} + +TEST(Allocator, AdaptiveChunkPolicyGrowth) { + AdaptiveChunkPolicy cp(1024); + + // grow to next power-of-two (bounded by SONIC_ALLOCATOR_MAX_CHUNK_CAPACITY) + EXPECT_EQ(cp.ChunkSize(3000), 4096u); + EXPECT_EQ(cp.ChunkSize(5000), 8192u); + + const size_t max_cap = SONIC_ALLOCATOR_MAX_CHUNK_CAPACITY; + // When request exceeds max_cap, returned size must still satisfy need. + const size_t huge_need = max_cap * 2; + EXPECT_EQ(cp.ChunkSize(huge_need), huge_need); + + // But internal min_chunk_size_ should be capped at max_cap. + EXPECT_EQ(cp.ChunkSize(max_cap - 1), max_cap); +} + +TEST(Allocator, MemoryPoolAllocatorMoveAndMapAllocator) { + // Moved-from allocator should be a no-op on destruction. + { + MemoryPoolAllocator<> a; + MemoryPoolAllocator<> b(std::move(a)); + (void)b; + } + + // Exercise MapAllocator::deallocate via MemoryPoolAllocator::Free (no-op). + { + MemoryPoolAllocator<> pool; + MapAllocator> ma(&pool); + int *p = ma.allocate(1); + ASSERT_NE(p, nullptr); + ma.deallocate(p, 1); + } +} + } // namespace diff --git a/tests/document_test.cpp b/tests/document_test.cpp index 1c9836e5..6765b833 100644 --- a/tests/document_test.cpp +++ b/tests/document_test.cpp @@ -597,6 +597,32 @@ TYPED_TEST(DocumentTest, SonicErrorInfinity) { EXPECT_TRUE(this->doc_.Dump().empty()); } +TYPED_TEST(DocumentTest, SerializeInfinity) { + Document dom; + dom.SetDouble(std::numeric_limits::infinity()); + WriteBuffer wb; + SonicError err = dom.Serialize(wb); + EXPECT_EQ(err, kErrorNone); + EXPECT_STREQ(wb.ToString(), "\"Infinity\""); + + dom.SetDouble(-std::numeric_limits::infinity()); + err = dom.Serialize(wb); + EXPECT_EQ(err, kErrorNone); + EXPECT_STREQ(wb.ToString(), "\"-Infinity\""); +} + +TYPED_TEST(DocumentTest, SerializeNaN) { + Document dom; + dom.SetDouble(std::numeric_limits::quiet_NaN()); + WriteBuffer wb; + SonicError err = dom.Serialize(wb); + EXPECT_EQ(err, 
kErrorNone); + EXPECT_STREQ(wb.ToString(), "\"NaN\""); + dom.SetDouble(-std::numeric_limits::quiet_NaN()); + err = dom.Serialize(wb); + EXPECT_STREQ(wb.ToString(), "\"-NaN\""); +} + TYPED_TEST(DocumentTest, swap) { using Document = TypeParam; Document doc1; @@ -737,10 +763,10 @@ TYPED_TEST(DocumentTest, NodeCopyControl) { EXPECT_TRUE(this->doc_["titles"].IsNull()); // test swap - NodeType swaped; - EXPECT_TRUE(swaped.IsNull()); - swaped.Swap(new_node); - EXPECT_FALSE(swaped.IsStringConst()); + NodeType swapped; + EXPECT_TRUE(swapped.IsNull()); + swapped.Swap(new_node); + EXPECT_FALSE(swapped.IsStringConst()); EXPECT_TRUE(new_node.IsNull()); } diff --git a/tests/exp_update_test.cpp b/tests/exp_update_test.cpp index 2425febb..3de8f1bb 100644 --- a/tests/exp_update_test.cpp +++ b/tests/exp_update_test.cpp @@ -113,9 +113,39 @@ TEST(UpdateLazy, Basic) { }; for (const auto &t : tests) { - auto ret = sonic_json::UpdateLazy(t.target, t.source); + auto ret = + sonic_json::UpdateLazy(t.target, t.source); EXPECT_STREQ(ret.c_str(), t.updated.c_str()); } } -} // namespace \ No newline at end of file +TEST(UpdateLazy, InvalidJson) { + // invalid source -> keep target (when target parses ok) + { + std::string target = R"({"a":1})"; + std::string source = R"({"a":)"; // invalid json + auto ret = + sonic_json::UpdateLazy(target, source); + EXPECT_STREQ(ret.c_str(), target.c_str()); + } + + // invalid target -> return source (when source parses ok) + { + std::string target = R"({"a":)"; // invalid json + std::string source = R"({"b":2})"; + auto ret = + sonic_json::UpdateLazy(target, source); + EXPECT_STREQ(ret.c_str(), source.c_str()); + } + + // both invalid -> return empty object + { + std::string target = R"({"a":)"; + std::string source = R"({"b":)"; + auto ret = + sonic_json::UpdateLazy(target, source); + EXPECT_STREQ(ret.c_str(), "{}"); + } +} + +} // namespace diff --git a/tests/ftoa_test.cpp b/tests/ftoa_test.cpp index 1e8bdb76..daa4e1d2 100644 --- a/tests/ftoa_test.cpp +++ 
b/tests/ftoa_test.cpp @@ -34,7 +34,7 @@ using namespace sonic_json; static void TestF64toa(const std::string& expect, double val) { char out[32]; - int len = F64toa(out, val); + int len = F64toa(out, val); out[len] = '\0'; EXPECT_STREQ(expect.data(), out); EXPECT_EQ(expect.size(), len); diff --git a/tests/json_tuple_test.cpp b/tests/json_tuple_test.cpp new file mode 100644 index 00000000..d5ac968b --- /dev/null +++ b/tests/json_tuple_test.cpp @@ -0,0 +1,202 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "sonic/sonic.h" + +namespace { + +using namespace sonic_json; + +#define TestOk(json, path, expect) \ + { \ + auto got = GetByJsonPathOnDemand(json, path); \ + EXPECT_EQ(std::get<0>(got), expect) \ + << "json: " << json << ", path: " << path \ + << ", error: " << ErrorMsg(std::get<1>(got)); \ + } + +TEST(JsonTuple, Basic) { + auto json = R"({ + "a": 1, + "b": [0,1,2,3], + "c": {"33": 123} + })"; + // StringView json, std::vector keys + auto result = + JsonTupleWithCodeGen(json, {"b"}, true); + std::vector> expected = {R"([0,1,2,3])"}; + EXPECT_EQ(result, expected); +} +TEST(JsonTuple, sparkCornerCase) { + std::string json = R"({"1.a": "b"})"; + std::vector paths{"1.a"}; + std::vector> expected = {R"(b)"}; + auto result = + JsonTupleWithCodeGen(json, paths, true); + EXPECT_EQ(result, expected); +} +TEST(JsonTuple, escapeQuote) { + std::string json = R"({"1.a": "{\"options\"}")"; + std::vector paths{"1.a"}; + std::vector> expected = {R"({"options"})"}; + auto result = + JsonTupleWithCodeGen(json, paths, true); + EXPECT_EQ(result, expected); +} + +TEST(JsonTuple, EscapedKeys) { + // Keys contain '\\' and '\"' in JSON source. 
+ std::string json = R"({"a\\": 1, "b\"": 2})"; + std::vector paths{"a\\", "b\""}; + std::vector> expected = {"1", "2"}; + + auto result = + JsonTupleWithCodeGen(json, paths, true); + EXPECT_EQ(result, expected); + + result = JsonTupleWithCodeGen(json, paths, false); + EXPECT_EQ(result, expected); +} + +TEST(JsonTuple, japanese) { + std::string json = R"json( + { + "id": 903487807, + "id_str": "903487807", + "name": "RT&ファボ魔のむっつんさっm", + "screen_name": "yuttari1998", + "location": "関西 ↓詳しいプロ↓", + "description": "無言フォローはあまり好みません ゲームと動画が好きですシモ野郎ですがよろしく…最近はMGSとブレイブルー、音ゲーをプレイしてます", + "url": "http://t.co/Yg9e1Fl8wd", + "entities": { + "url": { + "urls": [ + { + "url": "http://t.co/Yg9e1Fl8wd", + "expanded_url": "http://twpf.jp/yuttari1998", + "display_url": "twpf.jp/yuttari1998", + "indices": [ + 0, + 22 + ] + } + ] + }, + "description": { + "urls": [] + } + }, + "protected": false, + "followers_count": 95, + "friends_count": 158, + "listed_count": 1, + "created_at": "Thu Oct 25 08:27:13 +0000 2012", + "favourites_count": 3652, + "utc_offset": null, + "time_zone": null, + "geo_enabled": false, + "verified": false, + "statuses_count": 10276, + "lang": "ja", + "contributors_enabled": false, + "is_translator": false, + "is_translation_enabled": false, + "profile_background_color": "C0DEED", + "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", + "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", + "profile_background_tile": false, + "profile_image_url": "http://pbs.twimg.com/profile_images/500268849275494400/AoXHZ7Ij_normal.jpeg", + "profile_image_url_https": "https://pbs.twimg.com/profile_images/500268849275494400/AoXHZ7Ij_normal.jpeg", + "profile_banner_url": "https://pbs.twimg.com/profile_banners/903487807/1409062272", + "profile_link_color": "0084B4", + "profile_sidebar_border_color": "C0DEED", + "profile_sidebar_fill_color": "DDEEF6", + "profile_text_color": "333333", + 
"profile_use_background_image": true, + "default_profile": true, + "default_profile_image": false, + "following": false, + "follow_request_sent": false, + "notifications": false + } + )json"; + std::vector paths{ + "id_str", + "id", + "location", + "description", + "entities.url.urls[0].indices[1]", + }; + auto result = + JsonTupleWithCodeGen(json, paths, true); + std::vector> expected = { + "903487807", "903487807", "関西 ↓詳しいプロ↓", + "無言フォローはあまり好みません " + "ゲームと動画が好きですシモ野郎ですがよろしく…最近はMGSとブレイブルー、音" + "ゲーをプレイしてます", + std::nullopt}; + + EXPECT_EQ(result, expected); + result = JsonTupleWithCodeGen(json, paths, false); + EXPECT_EQ(result, expected); +} + +TEST(JsonTuple, invalidValue) { + std::string json = "{\"a\":1,\"b\":2c}"; + std::vector paths{"a", "b"}; + std::vector> expected = {"1", std::nullopt}; + + auto result = + JsonTupleWithCodeGen(json, paths, true); + EXPECT_EQ(result, expected); + expected = {std::nullopt, std::nullopt}; + result = JsonTupleWithCodeGen(json, paths, false); + EXPECT_EQ(result, expected); +} + +TEST(JsonTuple, NoMatchAllKeys) { + std::string json = R"({"a": 1})"; + std::vector paths{"x", "y"}; + std::vector> expected = {std::nullopt, + std::nullopt}; + + auto result = + JsonTupleWithCodeGen(json, paths, true); + EXPECT_EQ(result, expected); + + result = JsonTupleWithCodeGen(json, paths, false); + EXPECT_EQ(result, expected); +} + +TEST(JsonTuple, malformed) { + std::string json = "{\"a\":1,\"b\":2, c}"; + std::vector paths{"a", "b", "c", "d"}; + std::vector> expected = {"1", "2", std::nullopt, + std::nullopt}; + std::vector> expected2 = { + std::nullopt, std::nullopt, std::nullopt, std::nullopt}; + + auto result = + JsonTupleWithCodeGen(json, paths, true); + EXPECT_EQ(result, expected); + + result = JsonTupleWithCodeGen(json, paths, false); + EXPECT_EQ(result, expected2); +} + +} // namespace diff --git a/tests/jsonpath_test.cpp b/tests/jsonpath_test.cpp new file mode 100644 index 00000000..8cb5d12e --- /dev/null +++ b/tests/jsonpath_test.cpp 
@@ -0,0 +1,791 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include "sonic/sonic.h" + +namespace { + +using namespace sonic_json; + +#define TestOk(json, path, expect) \ + do { \ + auto got = GetByJsonPathOnDemand(json, path); \ + EXPECT_EQ(std::get<0>(got), expect) \ + << "json: " << json << ", path: " << path \ + << ", error: " << ErrorMsg(std::get<1>(got)); \ + } while (0) + +#define TestOkWithErr(json, path, expect, expect_err) \ + do { \ + auto got = GetByJsonPathOnDemand(json, path); \ + EXPECT_EQ(std::get<0>(got), expect) \ + << "json: " << json << ", path: " << path \ + << ", error: " << ErrorMsg(std::get<1>(got)); \ + EXPECT_EQ(std::get<1>(got), (expect_err)) \ + << "json: " << json << ", path: " << path \ + << ", got: " << ErrorMsg(std::get<1>(got)); \ + } while (0) + +#define TestNoMatch(json, path) \ + do { \ + auto got = GetByJsonPathOnDemand(json, path); \ + EXPECT_EQ(std::get<0>(got), "") \ + << "json: " << json << ", path: " << path \ + << ", error: " << ErrorMsg(std::get<1>(got)); \ + EXPECT_EQ(std::get<1>(got), kErrorNoneNoMatch) \ + << "json: " << json << ", path: " << path \ + << ", got: " << ErrorMsg(std::get<1>(got)); \ + } while (0) + +#define TestUnsupportedPath(json, path) \ + TestOkWithErr(json, path, "", kUnsupportedJsonPath) + +void TestFail(const std::string json, const std::string path) { + auto result = GetByJsonPathOnDemand(json, path); + 
ASSERT_NE(std::get<1>(result), kErrorNone) + << "Expected error for json: " << json << ", path: " << path + << ", but actual: " << std::get<0>(result); +} + +void ValidBatchOK(const std::string json, + const std::vector& paths) { + std::vector jsonpaths; + for (const auto& path : paths) { + jsonpaths.emplace_back(path); + } + auto batch = GetByJsonPaths(json, jsonpaths); + + ASSERT_EQ(std::get<1>(batch), kErrorNone) + << "json: " << json << ", parse failed."; + auto results = std::get<0>(batch); + for (size_t i = 0; i < paths.size(); ++i) { + auto result = GetByJsonPath(json, paths[i]); + ASSERT_EQ(std::get<0>(results[i]), std::get<0>(result)) + << "json: " << json << ", path: " << paths[i]; + ASSERT_EQ(std::get<1>(results[i]), std::get<1>(result)) + << "json: " << json << ", path: " << paths[i]; + } +} + +TEST(JsonPath, RootIdentifier) { + TestOk("[\"[\\\",\"]", "$", "[\"[\\\",\"]"); + TestOk(" null ", "$", "null"); + TestOk("true", "$", "true"); + TestOk("false", "$", "false"); + TestOk("false ", "$", "false"); + TestOk(" false ", "$", "false"); + TestOk("true ", "$", "true"); + TestOk("true", "$", "true"); + TestOk(" true ", "$", "true"); + TestOk(" true\n", "$", "true"); + + // string + TestOk("\"123\" ", "$", "123"); + TestOk("\"😊\" ", "$", "😊"); + TestOk("\"null\"", "$", "null"); + + // container + TestOk(" [] ", "$", "[]"); + TestOk(" [\"😊\"] ", "$", "[\"\\uD83D\\uDE0A\"]"); + TestOk(" {\"a\": \"😊💎\"} ", "$", "{\"a\":\"\\uD83D\\uDE0A\\uD83D\\uDC8E\"}"); + TestOk(" {} ", "$", "{}"); + TestOk(R"( {"a":null} )", "$", R"({"a":null})"); + TestOk(R"( [[], {}, []] )", "$", R"([[],{},[]])"); + + // has unescaped chars + TestOk("\"\t\n\"", "$", "\t\n"); + TestOk("\"\\t\\n\"", "$", "\t\n"); + TestOk("[\"\t\n\"]", "$", "[\"\\t\\n\"]"); + TestOk("[\"\\t\\n\"]", "$", "[\"\\t\\n\"]"); + + // invalid json + TestFail("123x ", "$"); + TestFail(" nullx ", "$"); + TestFail("truex", "$"); + TestFail("xtrue", "$"); + TestFail("falsex", "$"); + TestFail("xfalse", "$"); + 
TestFail(" [} ", "$"); + TestFail(R"( {"a:null} )", "$"); + TestFail(R"( [[], {[]}, []] )", "$"); +} + +TEST(JsonPath, UnicodeTest) { + // unicode control character in value + TestOk( + R"( {"error_msg":"X.G1x: Expected value at line 1 column 1 path $","origin_string":"=\u0007mEs\u000FA���%חk�9ded06cd author: John Doe"})", + "$.origin_string", "=\u0007mEs\u000FA���%חk�9ded06cd author: John Doe"); + + // unicode control character within quotes + TestOk( + R"( {"category":{"error_msg":"X.G1x: Expected value at line 1 column 1 path $","origin_string":"=\u0007mEs\u000fA���%חk�9ded06cd author: John Doe"},"server_uuid":"1234"} )", + "$.category", + "{\"error_msg\":\"X.G1x: Expected value at line 1 column 1 path " + "$\",\"origin_string\":\"=\\u0007mEs\\u000FA���%חk�9ded06cd author: John " + "Doe\"}"); +} + +TEST(Root, Number) { + TestOk("123 ", "$", "123"); + TestOk("1.23 ", "$", "1.23"); + TestOk("1.0E7", "$", "1.0E7"); + TestOk("9999999.999999998", "$", "9999999.999999998"); + TestOk("0.001", "$", "0.001"); + TestOk("0.0001", "$", "1.0E-4"); + TestOk("0.0009999999999999998", "$", "9.999999999999998E-4"); + + // all interges will parse as Raw + TestOk("5555555555555555555555555555", "$", "5555555555555555555555555555"); + TestOk("[5555555555555555555555555555]", "$", + "[5555555555555555555555555555]"); + TestOk("[0,1,-0,-1,-123,-5555555555555555555555555555]", "$", + "[0,1,-0,-1,-123,-5555555555555555555555555555]"); + TestOk(R"({"a": 5555555555555555555555555555 })", "$", + R"({"a":5555555555555555555555555555})"); +} + +TEST(JsonPath, IndexSelector) { + auto json = R"([ + 0, + 1.23, + 4e56, + "null", + true, + {}, + [] + ])"; + + TestOk(json, "$[0]", "0"); + TestOk(json, "$[1]", "1.23"); + TestOk(json, "$[1]", "1.23"); + TestOk(json, "$[2]", "4.0E56"); + TestOk(json, "$[3]", "null"); + TestOk(json, "$[4]", "true"); + TestOk(json, "$[5]", "{}"); + TestOk(json, "$[6]", "[]"); + + TestOk("[1,2]", "$[1]", "2"); + + TestFail(json, "$.a"); + TestFail(json, "$[7]"); + 
TestFail(json, "$[-8]"); + TestFail(json, "$[5].a"); + TestFail(json, "$[6][0]"); +} + +TEST(JsonPathWildcard, Basic) { + auto json = R"([ + 0, + [1,2,3], + {"a":1,"b":[1,2,3]}, + [] + ])"; + + TestOk(json, "$.*", R"([0,[1,2,3],{"a":1,"b":[1,2,3]},[]])"); + TestOk(json, "$[1].*", "[1,2,3]"); + // spark cannot handle Object[*] + // TestOk(json, "$[2].*", "[1,[1,2,3]]"); + TestOk(json, "$[2].b.*", "[1,2,3]"); + // spark cannot handle Object[*] + // TestOk(json, "$[3].*", "null"); + + // ignore when not found + TestOk(R"([{"a":123}, {}])", "$[*].a", "123"); + TestOk(R"([[123, 456], []])", "$[*][1]", "456"); + + // ignore when encounter the mismatched type + TestOk(R"([{"a":123}, null])", "$[*].a", "123"); + TestOk(R"([[123, 456], [], null])", "$[*][1]", "456"); +} + +TEST(JsonPathWildcard, Primitive) { + TestOk("1", "$[*]", ""); + TestOk("null", "$[*]", ""); + TestOk("true", "$[*]", ""); + TestOk("false", "$[*]", ""); + TestOk("\"\"", "$[*]", ""); + TestOk("\"hello\"", "$[*]", ""); +} + +TEST(JsonPathWildcard, PrimitiveNoMatchErrorCode) { + TestNoMatch("1", "$[*]"); + TestNoMatch("null", "$[*]"); + TestNoMatch("true", "$[*]"); + TestNoMatch("false", "$[*]"); +} + +TEST(JsonPath, WildcardBatch) { + auto json = R"([ + 0, + [1,2,3], + {"a":1,"b":[1,2,3]}, + [] + ])"; + std::vector paths = {"$.*", "$[1].*", "$[2].*", "$[2].b.*", + "$[3].*", "$[0].*", "$[5].a", "$[6][0].*"}; + ValidBatchOK(json, paths); +} + +TEST(JsonPath, BadBatch) { + auto json = R"([ + 0, + [1,2,3], + {"a":1,"b":[1,2,3]}, + [] bad balaba)"; + std::vector paths = {"$.*", "$[1].*", "$[2].*", "$[2].b.*"}; + std::vector jsonpaths; + for (const auto& path : paths) { + jsonpaths.emplace_back(path); + } + ASSERT_NE(std::get<1>(GetByJsonPaths(json, jsonpaths)), kErrorNone); +} + +TEST(JsonPath, WildcardMany) { + auto json = R"([ + [0], + [1,2,3], + [{"a":1,"b":[1,2,3]}], + [] + ])"; + + TestOk(json, "$.*.*", R"([0,1,2,3,{"a":1,"b":[1,2,3]}])"); +} + +TEST(JsonPath, WildcardArray) { + auto json = R"({ + "a": { 
+ "b": [ + [ + [ + { + "c": 1 + }, + { + "c": 2 + } + ] + ] + ] + } + })"; + auto path = "$.a.b[0][0][*].c"; + TestOk(json, path, "[1,2]"); + json = R"([{ + "a": 1, + "b": 2 + }, { + "a": 3, + "b": 4 + }])"; + path = "$[*].a"; + TestOk(json, path, "[1,3]"); + json = R"([{ + "TaskKey": 3010, + "Status": 3, + "OutTaskType": 5, + "OutTaskContent": { + "OutTaskId": "8121973866456483870", + "Source": 5133, + "productId": "1738761753995320", + "productSnapshotId": "0", + "extra": "{"sku_name":"Caribbean Water Park Student Ticket (Jul 22-Jul 24}}"}" + }, + { + "TaskKey": 3001, + "Status": 3, + "OutTaskType": 1, + "OutTaskContent": { + "ObjectID": "1738762988118075", + "AuditID": "NT011658213969796EDed000", + "Extra": "{" + namek_task_id ":" + 8121974700481349645 "}}" + }])"; + path = "$[*].TaskKey"; + TestFail(json, path); +} + +TEST(JsonPath, KeySelector) { + auto json = R"({ + "a": 1, + "b": 2, + "c": 3, + "d": { + "d1": 4, + "d2": [ + 0, + 1, + { + "d21": 5 + }, + [ true], + [], + [[null]] + ] + }, + "e": "null", + "f\"": "f key\"" + })"; + TestOk(json, "$", + "{\"a\":1,\"b\":2,\"c\":3,\"d\":{\"d1\":4,\"d2\":[0,1,{\"d21\":5},[" + "true],[],[[null]]]},\"e\":\"null\",\"f\\\"\":\"f key\\\"\"}"); + TestOk(json, "$.a", "1"); + TestOk(json, "$.b", "2"); + TestOk(json, "$['b']", "2"); + // TestOk(json, "$[b]", "2"); + TestOk(json, "$[\"b\"]", "2"); + TestOk(json, "$.d", "{\"d1\":4,\"d2\":[0,1,{\"d21\":5},[true],[],[[null]]]}"); + + TestFail(json, "$[1]"); + TestFail(json, "$.a.b"); + TestFail(json, "$.a[1]"); + + TestOk(json, "$.d.d2[0]", "0"); + TestOk(json, "$.d.d2[1]", "1"); + TestOk(json, "$.d.d2[2]", "{\"d21\":5}"); + TestOk(json, "$.d.d2[3]", "[true]"); + TestOk(json, "$.d.d2[3][0]", "true"); + TestOk(json, "$.d.d2[4]", "[]"); + TestOk(json, "$.d.d2[5][0][0]", "null"); + TestFail(json, "$.d.d2[4].a"); + TestFail(json, "$.d.d2[5][0][0][0]"); +} + +TEST(JsonPath, EscapedKeySelector) { + auto json = R"({ + "a\\": 1, + "b\"": 2, + "bA": 3, + "b.9": 4, + "b@": 5 + })"; + 
TestOk(json, R"($["a\\"])", "1"); + TestOk(json, R"($['a\'])", "1"); + TestOk(json, R"($["b\""])", "2"); + TestOk(json, R"($["b\u0041"])", "3"); + TestOk(json, "$['b.9']", "4"); + TestOk(json, "$['b@']", "5"); +} + +TEST(JsonPath, BadCases) { + auto json = R"({ + "a": { + "b": { + "c": "value1", + "d": "value2" + } + }, + "e.f": "value3", + "g.h.i": "value4" + })"; + TestOk(json, "$.a.b.c", "value1"); + + TestOk(R"({"root": [{"a":null},{"a":"foo"},{"a":"bar"}]})", "$.root[*].a", + R"(["foo","bar"])"); +} + +TEST(JsonPath, KeyNullElementPair) { + auto json = + R"( { "last_name": "Kim", "first_name":"Sam", "gender": false, "income": null, "age": 38} )"; + TestOk(json, "$.income", ""); +} + +TEST(JsonPath, KeyIntoStringValue) { + auto json = R"( {"name": {"name" : "bytedance"}} )"; + TestOk(json, "$.name.name.name.name.name", ""); +} + +TEST(JsonPath, BeforeNan) { + auto json = + R"( {"name":"xiaoxiao", "gender": false, "height": Nan, "passed": true} )"; + TestOk(json, "$.name", "xiaoxiao"); + TestOk(json, "$.gender", "false"); + TestOk(json, "$.height", ""); + TestOk(json, "$.passed", ""); +} + +TEST(JsonPath, BackslashZero) { + std::string json = " {\"name\": 321, \"req_id\": \"344 "; + json.push_back('\0'); + json += " 43321\"} "; + TestOk(json, "$.req_id", + std::string("344 ") + std::string{'\0'} + std::string(" 43321")); + TestOk(json, "$.name", "321"); +} + +TEST(JsonPath, sparkFeature) { + auto json = + R"( {"price":"129.99","suggested_price":"106.39","sku_name":"Shoe Model 4 825 Mint Green [High Quality Basketball Shoe )"; + TestOk(json, "$.price", "129.99"); + + json = R"( +{"key":"fakekey","labels":"ProductMgmt","labelsIterator":"ProductMgmt","labelsSize":1,"name":"Link","setExtra":false,"setKey":true,"setLabels":true,"setName":true,"setType":false,"setValues":true,"type":0,"values":"2mPs6","valuesIterator":"2mPs6","valuesSize":1 +)"; + TestOk(json, "$.key", "fakekey"); + TestOk(json, "$.labels", "ProductMgmt"); +} + +TEST(JsonPath, illegalJson) { + auto 
badJson3 = + R"({ + "creative_setting": { + "CreativeKeywords": [ + "careful_mom", + "good_stuff_recommendation\\\\\", + "life_fun" + ] } + })"; + TestOk(badJson3, "$.creative_setting.CreativeKeywords", ""); + + // Make sure this is treated as parse error, not a normal "no match". + auto got = GetByJsonPathOnDemand( + badJson3, "$.creative_setting.CreativeKeywords"); + EXPECT_NE(std::get<1>(got), kErrorNone); + EXPECT_NE(std::get<1>(got), kErrorNoneNoMatch); +} + +TEST(JsonPath, InvalidJsonPath) { + auto json = R"({})"; + + TestFail(json, "$[01]"); + TestFail(json, "$[-01]"); + TestFail(json, "$[-0"); + TestFail(json, "$[18446744073709551616]"); + TestFail(json, "$[]"); +} + +TEST(JsonPath, InvalidJsonPathMore) { + auto json = R"({})"; + + // Invalid token after '$' + TestUnsupportedPath(json, "$a"); + TestUnsupportedPath(json, "$$"); + TestUnsupportedPath(json, "$ "); + + // Unclosed bracket / quote + TestUnsupportedPath(json, "$[0"); + TestUnsupportedPath(json, "$[*"); + TestUnsupportedPath(json, R"($["a])"); + TestUnsupportedPath(json, R"($['a])"); + + // Unsupported features / expressions + TestUnsupportedPath(json, "$..a"); + TestUnsupportedPath(json, "$[a]"); + TestUnsupportedPath(json, "$[1:3]"); + TestUnsupportedPath(json, "$[0,1]"); + TestUnsupportedPath(json, "$[?(@.a)]"); + + // Empty key + TestUnsupportedPath(json, "$."); + TestUnsupportedPath(json, R"($[""])"); + TestUnsupportedPath(json, R"($[''])"); + + // Index > INT64_MAX + TestUnsupportedPath(json, "$[9223372036854775808]"); +} + +TEST(JsonPath, QuotedNameEscapes) { + // Double-quoted name supports backslash escapes. + TestOk(R"({"a\"b":1})", R"($["a\"b"])", "1"); + + // Invalid escape sequence should fail parsing and be treated as unsupported. + TestUnsupportedPath(R"({})", R"($["a\x"])"); + TestNoMatch(R"({})", R"($["\\"])"); + + // Missing closing bracket after quoted name. 
+ TestUnsupportedPath(R"({})", R"($["a"])x)"); +} + +TEST(JsonPath, InternalParserBranches) { + using sonic_json::internal::JsonPath; + using sonic_json::internal::paddingJsonPath; + + auto parse_ok = [](const std::string& p) { + std::string padded = paddingJsonPath(p); + JsonPath path; + return path.ParsePadded(StringView(padded.data(), padded.size()), p.size()); + }; + + // Valid cases. + EXPECT_TRUE(parse_ok("$")); + EXPECT_TRUE(parse_ok("$.a")); + EXPECT_TRUE(parse_ok("$.*")); + EXPECT_TRUE(parse_ok("$[*]")); + EXPECT_TRUE(parse_ok("$[0]")); + EXPECT_TRUE(parse_ok("$[-1]")); + // Use "\\u" to keep it as two chars: '\\' + 'u' (avoid universal-char-name). + EXPECT_TRUE(parse_ok("$[\"\\uD83D\\uDE0A\"]")); + + // Invalid / edge cases. + EXPECT_FALSE(parse_ok("")); + EXPECT_FALSE(parse_ok("a")); + EXPECT_FALSE(parse_ok("$.")); + EXPECT_FALSE(parse_ok("$[")); + EXPECT_FALSE(parse_ok("$[01]")); + EXPECT_FALSE(parse_ok("$[999999999999999999999999999999999999999999]")); + EXPECT_FALSE(parse_ok("$..a")); +} + +TEST(JsonPath, TruncatedJsonReturnsInvalidChar) { + // Root parsing path is slightly different from deep-path scanning and may + // report invalid-char for truncated containers. 
+ TestOkWithErr("[8", "$", "", kParseErrorInvalidChar); +} + +TEST(JsonPath, KeyNumSelector) { + auto json = R"({ + "1": 1, + "2": [0,1,2,3], + "3": {"33": 123} + })"; + TestOk(json, "$.1", "1"); + TestOk(json, "$.2[2]", "2"); + TestOk(json, "$.3.33", "123"); +} + +TEST(JsonPath, WildCardSpark) { + auto json = R"({ + "Person": [ + { + "name": "a", + "value": ["9.2", "3.0"] + }, + { + "name": "b", + "value": ["6", "666"] + } + ] + })"; + TestOk(json, "$.Person[*].value[*]", R"([["9.2","3.0"],["6","666"]])"); + + json = R"({ + "Person": [ + { + "name": "a", + "value": ["9.2"] + }, + { + "name": "b", + "value": ["6"] + } + ] + })"; + TestOk(json, "$.Person[*].value[*]", R"([["9.2"],["6"]])"); +} + +TEST(JsonPath, DoubleWildcardFlattenArrays) { + TestOk("[[1,2],[3]]", "$[*][*]", "[1,2,3]"); + TestOk(R"([[{"a":1}],[{"a":2}]])", "$[*][*]", R"([{"a":1},{"a":2}])"); +} + +TEST(JsonPath, IndexThenWildcard) { + TestOk("[[1,2],[3,4]]", "$[1][*]", "[3,4]"); + TestOk("[[1,2],[3,4]]", "$[0][*]", "[1,2]"); +} +TEST(JsonPath, DoubleEscape) { + auto json = R"({"output":{"ens_prob":0.004286}})"; + TestOk(json, "$.output.ens_prob", "0.004286"); +} +std::vector splitToInts(const std::string& str) { + std::vector numbers; + std::string_view sv(str); + + while (!sv.empty()) { + auto space = sv.find(' '); + auto token = sv.substr(0, space); + + if (!token.empty()) { + numbers.push_back(std::stoi(std::string(token))); + } + + if (space == std::string_view::npos) break; + sv = sv.substr(space + 1); + } + + return numbers; +} +TEST(JsonPath, DISABLED_JsonInfiniteLoop) { + const std::string integers( + "123 34 -28 -65 -99 -23 -103 -87 34 58 32 48 46 55 57 49 53 44 32 34 -23 " + "-127 -109 -27 -91 -121 -23 -123 -73 -27 -88 -127 34 58 32 48 46 55 56 " + "48 56 44 32 34 -24 -121 -86 -27 -118 -88 -26 -116 -95 34 58 32 48 46 55 " + "55 56 49 125"); + auto ints = splitToInts(integers); + std::string json(""); + for (const auto i : ints) { + json.push_back((char)i); + } + + auto got = + 
GetByJsonPathOnDemand( + json, "$.motor_content_boost"); + EXPECT_EQ(std::get<1>(got), kParseErrorUnexpect); + EXPECT_EQ(std::get<0>(got), ""); +} +TEST(JsonPath, JsonInfiniteLoop2) { + std::string json("[8"); + auto got = + GetByJsonPathOnDemand( + json, "$.motor_content_boost"); + EXPECT_EQ(std::get<1>(got), kParseErrorEof); + EXPECT_EQ(std::get<0>(got), ""); +} + +TEST(JsonPath, JsonInfiniteLoop3) { + std::string json = R"json([{"a":["c"]},)json"; + std::string path = "$[*].a"; + auto got = + GetByJsonPathOnDemand( + json, path); + EXPECT_EQ(std::get<1>(got), kParseErrorEof); + EXPECT_EQ(std::get<0>(got), ""); +} + +TEST(JsonPath, JsonTuple) { + auto json = R"({a:1, b:2c})"; + auto got = + GetByJsonPathOnDemand( + json, "$.b"); + EXPECT_EQ(std::get<1>(got), kParseErrorUnexpect); + EXPECT_EQ(std::get<0>(got), ""); +} + +TEST(JsonPathDom, SingleStringIsUnquoted) { + auto got = GetByJsonPath(R"({"s":"abc"})", "$.s"); + EXPECT_EQ(std::get<1>(got), kErrorNone); + EXPECT_EQ(std::get<0>(got), "abc"); +} + +TEST(JsonPathDom, ConstAtJsonPathIsConstCorrect) { + Document doc; + doc.Parse(R"({"s":"abc"})"); + ASSERT_FALSE(doc.HasParseError()); + + const Document& cdoc = doc; + JsonPathResult result = cdoc.AtJsonPath("$.s"); + EXPECT_EQ(result.error, kErrorNone); + ASSERT_EQ(result.nodes.size(), 1U); + const Node* n = result.nodes[0]; + ASSERT_NE(n, nullptr); + EXPECT_TRUE(n->IsString()); + EXPECT_EQ(n->GetStringView(), "abc"); +} + +TEST(JsonPathDom, NullNodeFilteredToEmptyString) { + auto got = GetByJsonPath(R"({"a":null})", "$.a"); + EXPECT_EQ(std::get<1>(got), kErrorNone); + EXPECT_EQ(std::get<0>(got), ""); +} + +TEST(JsonPathDom, MultiNodesSerializeAndFilterNulls) { + auto got = GetByJsonPath(R"({"a":[null,"x",1]})", "$.a[*]"); + EXPECT_EQ(std::get<1>(got), kErrorNone); + EXPECT_EQ(std::get<0>(got), R"(["x",1])"); +} + +TEST(JsonPathDom, NotFoundAndUnsupportedPath) { + auto not_found = GetByJsonPath(R"({"a":1})", "$.b"); + EXPECT_EQ(std::get<1>(not_found), 
kNotFoundByJsonPath); + EXPECT_EQ(std::get<0>(not_found), ""); + + auto unsupported = GetByJsonPath(R"({"a":1})", "$..a"); + EXPECT_EQ(std::get<1>(unsupported), kUnsupportedJsonPath); + EXPECT_EQ(std::get<0>(unsupported), ""); +} + +TEST(JsonPathDom, NegativeIndexSupportedButOnDemandUnsupported) { + auto dom_got = GetByJsonPath("[0,1,2]", "$[-1]"); + EXPECT_EQ(std::get<1>(dom_got), kErrorNone); + EXPECT_EQ(std::get<0>(dom_got), "2"); + + auto ondemand_got = + GetByJsonPathOnDemand("[0,1,2]", "$[-1]"); + // OnDemand mode does not support negative indexes. + EXPECT_NE(std::get<1>(ondemand_got), kErrorNone); + EXPECT_EQ(std::get<0>(ondemand_got), ""); +} + +TEST(JsonPathDump, SerializeCoversEmptySingleAndMulti) { + Document doc; + doc.Parse(R"({"a":null,"s":"abc","arr":[null,"x",1]})"); + ASSERT_FALSE(doc.HasParseError()); + + { + auto result = doc.AtJsonPath("$.a"); + ASSERT_EQ(result.error, kErrorNone); + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kErrorNone); + EXPECT_EQ(std::get<0>(dumped), "null"); + } + + { + auto result = doc.AtJsonPath("$.s"); + ASSERT_EQ(result.error, kErrorNone); + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kErrorNone); + EXPECT_EQ(std::get<0>(dumped), "abc"); + } + + { + auto result = doc.AtJsonPath("$.arr[*]"); + ASSERT_EQ(result.error, kErrorNone); + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kErrorNone); + EXPECT_EQ(std::get<0>(dumped), R"(["x",1])"); + } +} + +TEST(JsonPathDump, SerializeCoversSingleNonStringAndErrorPaths) { + Document doc; + doc.Parse(R"({"n":1,"inf":0})"); + ASSERT_FALSE(doc.HasParseError()); + + // size == 1 and not string + { + auto result = doc.AtJsonPath("$.n"); + ASSERT_EQ(result.error, kErrorNone); + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kErrorNone); + EXPECT_EQ(std::get<0>(dumped), "1"); + } + + // serialization error path 
(infinity is not allowed unless kSerializeInfNan) + doc["inf"].SetDouble(std::numeric_limits::infinity()); + { + auto result = doc.AtJsonPath("$.inf"); + ASSERT_EQ(result.error, kErrorNone); + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kSerErrorInfinity); + EXPECT_EQ(std::get<0>(dumped), ""); + } + + // multi-nodes error path + { + JsonPathResult result; + result.error = kErrorNone; + result.nodes.push_back(&doc["n"]); + result.nodes.push_back(&doc["inf"]); + + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kSerErrorInfinity); + EXPECT_EQ(std::get<0>(dumped), ""); + } +} +} // namespace diff --git a/tests/node_test.cpp b/tests/node_test.cpp index 5f77c35a..d3270d8a 100644 --- a/tests/node_test.cpp +++ b/tests/node_test.cpp @@ -15,6 +15,7 @@ * limitations under the License. */ +#include #include #include @@ -784,6 +785,71 @@ TEST(DNodeTest, AllocatorReturnNull) { EXPECT_EQ(node1.GetStringView(), ""); } +// Allocator for verifying owned-buffer free behavior. +class CountingAllocator { + public: + void* Malloc(size_t size) { + ++malloc_cnt; + return size ? std::malloc(size) : nullptr; + } + void* Realloc(void* old_ptr, size_t /*old_size*/, size_t new_size) { + if (new_size == 0) { + Free(old_ptr); + return nullptr; + } + ++realloc_cnt; + return std::realloc(old_ptr, new_size); + } + + static void Free(void* ptr) { + ++free_cnt; + std::free(ptr); + } + + static void Reset() { + malloc_cnt = 0; + realloc_cnt = 0; + free_cnt = 0; + } + + static inline size_t malloc_cnt = 0; + static inline size_t realloc_cnt = 0; + static inline size_t free_cnt = 0; + static constexpr bool kNeedFree = true; +}; + +TEST(DNodeTest, OwnedRawAndNumStrFreed) { + using NodeType = DNode; + CountingAllocator a; + + // Copy a raw node into a kNeedFree allocator should not assert and should be + // freed on destruction. 
+ CountingAllocator::Reset(); + { + Document doc; + const char* json = "123"; + doc.Parse(json, 3); + ASSERT_FALSE(doc.HasParseError()); + ASSERT_TRUE(doc.IsRaw()); + + NodeType copied(doc, a); + EXPECT_TRUE(copied.IsRaw()); + EXPECT_EQ(copied.GetRaw(), "123"); + } + EXPECT_EQ(CountingAllocator::malloc_cnt, CountingAllocator::free_cnt); + + // SetStringNumber with allocator should also be freed on destruction. + CountingAllocator::Reset(); + { + NodeType n; + std::string s = "18446744073709551616"; + n.SetStringNumber(StringView(s.data(), s.size()), a); + EXPECT_TRUE(n.IsStringNumber()); + EXPECT_EQ(n.GetStringView(), s); + } + EXPECT_EQ(CountingAllocator::malloc_cnt, CountingAllocator::free_cnt); +} + TYPED_TEST(NodeTest, SourceAllocator) { using NodeType = TypeParam; using Allocator = typename NodeType::alloc_type; diff --git a/tests/parsenumber_test.cpp b/tests/parsenumber_test.cpp index 32c32749..c4bcbec4 100644 --- a/tests/parsenumber_test.cpp +++ b/tests/parsenumber_test.cpp @@ -57,6 +57,35 @@ void TestParseDouble(double num, const std::string& input) { { EXPECT_DOUBLE_EQ(num, internal::AtofNative(input.data(), input.size())); } } +void testStringNumberNode(const std::string& expect, Node& node) { + EXPECT_TRUE(node.IsStringNumber()) << expect; + EXPECT_EQ(expect, node.GetStringView()) << expect; + EXPECT_EQ(expect, node.Dump()) << expect; +} + +void TestStringNumber(const std::string& expect, const std::string& input) { + auto doc_ptr = std::make_unique(); + doc_ptr->Parse(input.data(), + input.size()); + EXPECT_FALSE(doc_ptr->HasParseError()) << input; + testStringNumberNode(expect, *doc_ptr); + + // test copy + MemoryPoolAllocator<> a; + Node copied(*doc_ptr, a); + doc_ptr.reset(); + testStringNumberNode(expect, copied); + + // test set + Node n; + n.SetStringNumber(input); + testStringNumberNode(expect, n); + + // test set with copy + n.SetStringNumber(std::string(input), a); + testStringNumberNode(expect, n); +} + void TestParseError(const std::string& 
input, size_t off, SonicError err) { Document doc; doc.Parse(input.data(), input.size()); @@ -119,6 +148,67 @@ TEST(ParserTest, ParseNumber) { TestParseDouble(-1234567890123456789012345.0, "-1234567890123456789012345"); } +TEST(ParserTest, ParseIntegerAsRaw) { + { + Document doc; + doc.Parse("0", 1); + ASSERT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.IsRaw()); + EXPECT_EQ(doc.GetRaw(), "0"); + } + { + Document doc; + doc.Parse("-0", 2); + ASSERT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.IsRaw()); + EXPECT_EQ(doc.GetRaw(), "-0"); + } + { + Document doc; + doc.Parse("123", 3); + ASSERT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.IsRaw()); + EXPECT_EQ(doc.GetRaw(), "123"); + } + { + Document doc; + doc.Parse("-123", 4); + ASSERT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.IsRaw()); + EXPECT_EQ(doc.GetRaw(), "-123"); + } + { + // Scientific notation is not treated as integer. + Document doc; + doc.Parse("0e+1", 4); + ASSERT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc.IsDouble()); + EXPECT_DOUBLE_EQ(doc.GetDouble(), 0.0); + } +} + +TEST(ParserTest, AllowUnescapedControlChars) { + std::string s = "\""; + s.push_back('\t'); + s += "\""; + + { + Document doc; + doc.Parse(s.data(), s.size()); + EXPECT_TRUE(doc.HasParseError()); + } + + { + Document doc; + doc.Parse(s.data(), s.size()); + ASSERT_FALSE(doc.HasParseError()); + ASSERT_TRUE(doc.IsString()); + auto sv = doc.GetStringView(); + ASSERT_EQ(sv.size(), 1U); + EXPECT_EQ(sv[0], '\t'); + } +} + void ParseFloatInFiles() { std::vector files = {"./testdata/num/float-1.txt", "./testdata/num/float-8.txt"}; @@ -155,7 +245,7 @@ TEST(ParserTest, ParseFloatExponent) { TestParseDouble(0, "-0.00e+0456"); TestParseDouble(0, "-0e+456"); - // zero exponets + // zero exponents TestParseDouble(1, "1e0"); TestParseDouble(12, "12e-00"); @@ -222,6 +312,7 @@ TEST(ParserTest, ParseInvalidNumber) { TestParseInval(1, "-"); TestParseInval(1, "00"); TestParseInval(1, "01"); + TestParseInval(2, "-01"); TestParseInval(2, "0."); 
TestParseInval(1, "0-"); TestParseInval(2, "0e"); @@ -237,4 +328,38 @@ TEST(ParserTest, ParseInvalidNumber) { 8, "1234567 123"); // Only support parse single JSON value one time } +TEST(ParserTest, ParseStringNumber) { + TestStringNumber("-9223372036854775809", "-9223372036854775809"); + TestStringNumber("18446744073709551616", "18446744073709551616"); + TestStringNumber("-4.94065645841247E-324", "-4.94065645841247E-324"); + TestStringNumber("4.94065645841247E-324", "4.94065645841247E-324"); + TestStringNumber("1.79769313486231E308", "1.79769313486231E308"); + TestStringNumber("-1.79769313486231E308", "-1.79769313486231E308"); +} + +TEST(ParserTest, ParseStringNumber_NormalFloat) { + TestStringNumber("1.0", "1.0"); + TestStringNumber("0.1", "0.1"); + TestStringNumber("1e2", "1e2"); +} + +TEST(ParserTest, ParseStringNumber_ZeroExponentStaysDouble) { + Document doc; + std::string input = "0e+123"; + doc.Parse(input.data(), input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + EXPECT_TRUE(doc.IsDouble()) << input; + EXPECT_DOUBLE_EQ(0.0, doc.GetDouble()) << input; +} + +TEST(ParserTest, ParseStringNumber_Uint64KeepsType) { + Document doc; + std::string input = "18446744073709551615"; // ULLONG_MAX + doc.Parse(input.data(), input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + EXPECT_TRUE(doc.IsUint64()) << input; + EXPECT_EQ(18446744073709551615ULL, doc.GetUint64()) << input; + EXPECT_EQ(input, doc.Dump()) << input; +} + } // namespace diff --git a/tests/quote_test.cpp b/tests/quote_test.cpp index 2c149d38..6cfb0db0 100644 --- a/tests/quote_test.cpp +++ b/tests/quote_test.cpp @@ -34,7 +34,8 @@ typedef struct quoteTests { void TestQuote(const std::string& input, const std::string& expect) { size_t n = input.size(); auto buf = std::unique_ptr(new char[(n + 2) * 6 + 32]); - char* end = Quote(input.data(), n, buf.get()); + char* end = + Quote(input.data(), n, buf.get()); *end = '\0'; EXPECT_STREQ(buf.get(), expect.data()); } From 
0e2c2ffac52d183793e60a2b5ed6f52df216d32b Mon Sep 17 00:00:00 2001 From: yangzhengguo Date: Tue, 17 Mar 2026 11:06:20 +0800 Subject: [PATCH 2/6] opt workflow --- .github/workflows/clang-format-check.yml | 37 +++++++++++-- .github/workflows/test_arm.yml | 16 ++++-- .github/workflows/test_coverage.yml | 31 +++++++++-- .github/workflows/test_x86.yml | 66 ++++++++++++++++++++---- 4 files changed, 128 insertions(+), 22 deletions(-) diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index c004f856..bca80031 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,15 +1,44 @@ -name: clang-format Check +name: CI / clang-format concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -on: [push, pull_request] +on: + push: + branches: [ master ] + paths: + - '.clang-format' + - 'include/sonic/**' + - 'benchmark/**' + - 'tests/**' + - 'example/**' + - 'fuzz/**' + - '.github/workflows/clang-format-check.yml' + pull_request: + paths: + - '.clang-format' + - 'include/sonic/**' + - 'benchmark/**' + - 'tests/**' + - 'example/**' + - 'fuzz/**' + - '.github/workflows/clang-format-check.yml' + workflow_dispatch: + +permissions: + contents: read + +defaults: + run: + shell: bash + jobs: formatting-check: - name: Formatting Check + name: clang-format runs-on: ubuntu-24.04 strategy: + fail-fast: false matrix: path: - 'include/sonic' diff --git a/.github/workflows/test_arm.yml b/.github/workflows/test_arm.yml index b8640cbf..c7a4be61 100644 --- a/.github/workflows/test_arm.yml +++ b/.github/workflows/test_arm.yml @@ -1,7 +1,14 @@ -name: Test ARM +name: CI / Test (ARM) + +permissions: + contents: read + +defaults: + run: + shell: bash concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }} cancel-in-progress: true on: @@ -14,12 +21,13 @@ on: jobs: test_arm: runs-on: ubuntu-24.04-arm + timeout-minutes: 60 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install dependencies run: | sudo apt-get update - sudo apt-get install -y cmake ninja-build g++ + sudo apt-get install -y --no-install-recommends cmake ninja-build g++ - name: Build and test NEON run: | cmake -S . -B build-neon -G Ninja diff --git a/.github/workflows/test_coverage.yml b/.github/workflows/test_coverage.yml index 5971a39a..2ec0bba6 100644 --- a/.github/workflows/test_coverage.yml +++ b/.github/workflows/test_coverage.yml @@ -1,17 +1,29 @@ # yaml-language-server: $schema=https://json-schema.org/draft-07/schema# name: Test Coverage +permissions: + contents: read + actions: write + id-token: write + +defaults: + run: + shell: bash + concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true on: push: + branches: [ master ] pull_request: jobs: test-coverage: + name: bazel coverage runs-on: ubuntu-24.04 + timeout-minutes: 60 env: GCC_VERSION: 12 @@ -20,10 +32,20 @@ jobs: uses: actions/checkout@v4 - name: Read Bazel version - shell: bash run: | echo "BAZEL_VERSION=$(cat .bazelversion)" >> "$GITHUB_ENV" + - name: Cache Bazel + uses: actions/cache@v4 + with: + path: | + ~/.cache/bazel + ~/.cache/bazelisk + key: ${{ runner.os }}-bazel-${{ env.BAZEL_VERSION }}-${{ hashFiles('MODULE.bazel.lock', 'WORKSPACE.bzlmod', 'BUILD.bazel') }} + restore-keys: | + ${{ runner.os }}-bazel-${{ env.BAZEL_VERSION }}- + ${{ runner.os }}-bazel- + - name: Setup bazel uses: jwlawson/actions-setup-bazel@v2 with: @@ -31,10 +53,11 @@ jobs: - name: Setup GCC run: | - sudo apt-get install -y gcc-12 g++-12 + sudo apt-get update + sudo apt-get install -y --no-install-recommends gcc-12 g++-12 - name: Install lcov run: 
| - sudo apt install lcov + sudo apt-get install -y --no-install-recommends lcov - name: Coverage run: | diff --git a/.github/workflows/test_x86.yml b/.github/workflows/test_x86.yml index 1d349925..eabc4b4d 100644 --- a/.github/workflows/test_x86.yml +++ b/.github/workflows/test_x86.yml @@ -1,13 +1,34 @@ -name: Test +name: CI / Test (x86) + +permissions: + contents: read + actions: write + +defaults: + run: + shell: bash + concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -on: [push, pull_request] +on: + push: + branches: [ master ] + paths-ignore: + - 'docs/**' + - 'licenses/**' + - '**/*.md' + - 'Doxyfile' + - '.vscode/**' + pull_request: + workflow_dispatch: jobs: test-llvm: + name: clang ${{ matrix.llvm_version }} / ${{ matrix.tool }} / ${{ matrix.arch }} runs-on: ubuntu-24.04 + timeout-minutes: 45 strategy: fail-fast: false matrix: @@ -15,8 +36,8 @@ jobs: tool: ['cmake', 'bazel'] arch: [westmere, haswell] exclude: - - tool: 'cmake' - arch: westmere + - tool: 'cmake' + arch: westmere env: CC: clang CXX: clang++ @@ -26,10 +47,21 @@ jobs: - name: Read Bazel version if: matrix.tool == 'bazel' - shell: bash run: | echo "BAZEL_VERSION=$(cat .bazelversion)" >> "$GITHUB_ENV" + - name: Cache Bazel + if: matrix.tool == 'bazel' + uses: actions/cache@v4 + with: + path: | + ~/.cache/bazel + ~/.cache/bazelisk + key: ${{ runner.os }}-bazel-${{ env.BAZEL_VERSION }}-${{ hashFiles('MODULE.bazel.lock', 'WORKSPACE.bzlmod', 'BUILD.bazel') }} + restore-keys: | + ${{ runner.os }}-bazel-${{ env.BAZEL_VERSION }}- + ${{ runner.os }}-bazel- + - name: Install LLVM and Clang uses: KyleMayes/install-llvm-action@v2 with: @@ -49,7 +81,7 @@ jobs: - name: Run ${{ matrix.arch }} Test Use Bazel if: matrix.tool == 'bazel' - run : | + run: | bash ./scripts/unittest.sh -c --arch=${{ matrix.arch }} - name: Run Test Use CMake @@ -58,7 +90,9 @@ jobs: bash 
./scripts/run_cmake.sh test-gcc: + name: gcc ${{ matrix.gcc_version }} / ${{ matrix.tool }} / ${{ matrix.arch }} / ${{ matrix.dispatch }} runs-on: ubuntu-24.04 + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -84,13 +118,25 @@ jobs: - name: Read Bazel version if: matrix.tool == 'bazel' - shell: bash run: | echo "BAZEL_VERSION=$(cat .bazelversion)" >> "$GITHUB_ENV" - name: Install GCC run: | - sudo apt-get install -y gcc-${{ matrix.gcc_version }} g++-${{ matrix.gcc_version }} + sudo apt-get update + sudo apt-get install -y --no-install-recommends gcc-${{ matrix.gcc_version }} g++-${{ matrix.gcc_version }} + + - name: Cache Bazel + if: matrix.tool == 'bazel' + uses: actions/cache@v4 + with: + path: | + ~/.cache/bazel + ~/.cache/bazelisk + key: ${{ runner.os }}-bazel-${{ env.BAZEL_VERSION }}-${{ hashFiles('MODULE.bazel.lock', 'WORKSPACE.bzlmod', 'BUILD.bazel') }} + restore-keys: | + ${{ runner.os }}-bazel-${{ env.BAZEL_VERSION }}- + ${{ runner.os }}-bazel- - name: Setup bazel if: matrix.tool == 'bazel' @@ -106,7 +152,7 @@ jobs: - name: Run ${{ matrix.arch }} ${{ matrix.dispatch }} Test Use Bazel if: matrix.tool == 'bazel' - run : | + run: | bash ./scripts/unittest.sh -g --arch=${{ matrix.arch }} --dispatch=${{ matrix.dispatch }} - name: Run Test Use CMake From 06223d3d6975db944579f663f2e32545201b98d1 Mon Sep 17 00:00:00 2001 From: yangzhengguo Date: Tue, 17 Mar 2026 11:35:01 +0800 Subject: [PATCH 3/6] fix_test --- tests/jsonpath_test.cpp | 66 +++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/tests/jsonpath_test.cpp b/tests/jsonpath_test.cpp index 8cb5d12e..faad09d3 100644 --- a/tests/jsonpath_test.cpp +++ b/tests/jsonpath_test.cpp @@ -16,8 +16,15 @@ #include +#include +#include #include +#if !defined(_WIN32) +#include +#include +#endif + #include "sonic/sonic.h" namespace { @@ -621,23 +628,50 @@ std::vector splitToInts(const std::string& str) { return numbers; } -TEST(JsonPath, 
DISABLED_JsonInfiniteLoop) { - const std::string integers( - "123 34 -28 -65 -99 -23 -103 -87 34 58 32 48 46 55 57 49 53 44 32 34 -23 " - "-127 -109 -27 -91 -121 -23 -123 -73 -27 -88 -127 34 58 32 48 46 55 56 " - "48 56 44 32 34 -24 -121 -86 -27 -118 -88 -26 -116 -95 34 58 32 48 46 55 " - "55 56 49 125"); - auto ints = splitToInts(integers); - std::string json(""); - for (const auto i : ints) { - json.push_back((char)i); - } +TEST(JsonPath, JsonInfiniteLoop) { +#if defined(_WIN32) + GTEST_SKIP() << "Windows does not support alarm()-based timeout guard."; +#else + // This case historically could hang; run it in a subprocess with an alarm so + // the whole test suite won't get stuck. + ASSERT_EXIT( + { + alarm(2); + + const std::string integers( + "123 34 -28 -65 -99 -23 -103 -87 34 58 32 48 46 55 57 49 53 44 32 " + "34 " + "-23 " + "-127 -109 -27 -91 -121 -23 -123 -73 -27 -88 -127 34 58 32 48 46 " + "55 " + "56 " + "48 56 44 32 34 -24 -121 -86 -27 -118 -88 -26 -116 -95 34 58 32 48 " + "46 55 " + "55 56 49 125"); + auto ints = splitToInts(integers); + std::string json(""); + for (const auto i : ints) { + json.push_back((char)i); + } - auto got = - GetByJsonPathOnDemand( - json, "$.motor_content_boost"); - EXPECT_EQ(std::get<1>(got), kParseErrorUnexpect); - EXPECT_EQ(std::get<0>(got), ""); + auto got = GetByJsonPathOnDemand< + SerializeFlags::kSerializeUnicodeEscapeUppercase>( + json, "$.motor_content_boost"); + const auto& out = std::get<0>(got); + const auto err = std::get<1>(got); + if (err == kErrorNone) { + std::fprintf(stderr, "unexpected success, out=%s\n", out.c_str()); + std::_Exit(1); + } + if (!out.empty()) { + std::fprintf(stderr, "expected empty output, err=%d, out=%s\n", + static_cast(err), out.c_str()); + std::_Exit(1); + } + std::_Exit(0); + }, + ::testing::ExitedWithCode(0), ""); +#endif } TEST(JsonPath, JsonInfiniteLoop2) { std::string json("[8"); From 2168ad3315f9806f7953cb2e9c099b3f25f22b34 Mon Sep 17 00:00:00 2001 From: yangzhengguo Date: Tue, 24 
Mar 2026 18:46:15 +0800 Subject: [PATCH 4/6] fix comments by codex --- include/sonic/dom/dynamicnode.h | 19 +++++--- include/sonic/dom/genericnode.h | 6 ++- include/sonic/dom/parser.h | 2 +- include/sonic/dom/type.h | 2 +- include/sonic/error.h | 2 + include/sonic/internal/arch/simd_skip.h | 7 ++- include/sonic/jsonpath/dump.h | 21 +++++---- include/sonic/jsonpath/jsonpath.h | 47 ++++--------------- include/sonic/jsonpath/ondemand.h | 28 ++++++++++-- tests/document_test.cpp | 17 +++++++ tests/jsonpath_test.cpp | 61 +++++++++++++++++++++++-- tests/node_test.cpp | 43 +++++++++++++++++ tests/parsenumber_test.cpp | 10 ++++ 13 files changed, 199 insertions(+), 66 deletions(-) diff --git a/include/sonic/dom/dynamicnode.h b/include/sonic/dom/dynamicnode.h index 3439513d..ba3555a3 100644 --- a/include/sonic/dom/dynamicnode.h +++ b/include/sonic/dom/dynamicnode.h @@ -129,13 +129,18 @@ class DNode : public GenericNode> { } case kRaw: { size_t len = rhs.Size(); - // Mark buffer as owned so destroy() will free it for kNeedFree alloc. - this->sv.len = rhs.getTypeAndLen() | kOwnedStringMask; - this->sv.p = (char*)(alloc.Malloc(len + 1)); - sonic_assert(this->sv.p != nullptr); - std::memcpy(const_cast(this->sv.p), rhs.GetStringView().data(), - len); - const_cast(this->sv.p)[len] = '\0'; + char* p = static_cast(alloc.Malloc(len + 1)); + if (p) { + // Mark buffer as owned so destroy() will free it for kNeedFree alloc. 
+ this->sv.len = rhs.getTypeAndLen() | kOwnedStringMask; + this->sv.p = p; + std::memcpy(const_cast(this->sv.p), rhs.GetStringView().data(), + len); + const_cast(this->sv.p)[len] = '\0'; + } else { + this->sv.p = ""; + this->setLength(0, rhs.GetType()); + } break; } default: diff --git a/include/sonic/dom/genericnode.h b/include/sonic/dom/genericnode.h index 33a0570b..1a20419f 100644 --- a/include/sonic/dom/genericnode.h +++ b/include/sonic/dom/genericnode.h @@ -1079,7 +1079,11 @@ class GenericNode { std::string Dump() const { WriteBuffer wb; SonicError err = Serialize(wb); - return err == kErrorNone ? wb.ToString() : ""; + if (err != kErrorNone) { + return ""; + } + auto sv = wb.ToStringView(); + return std::string(sv.data(), sv.size()); } protected: diff --git a/include/sonic/dom/parser.h b/include/sonic/dom/parser.h index aab1e304..10cee9aa 100644 --- a/include/sonic/dom/parser.h +++ b/include/sonic/dom/parser.h @@ -641,7 +641,7 @@ class Parser { } unsigned num = s[i - 1] - '0'; if (man < kUint64Max / 10 || - (man == kUint64Max / 10 && num <= UINT_MAX % 10)) { + (man == kUint64Max / 10 && num <= kUint64Max % 10)) { man = man * 10 + num; if (sgn == -1) { goto double_string_fast; diff --git a/include/sonic/dom/type.h b/include/sonic/dom/type.h index f98b7968..3c53d0d1 100644 --- a/include/sonic/dom/type.h +++ b/include/sonic/dom/type.h @@ -36,7 +36,7 @@ enum TypeFlag { kUint = ((uint8_t)(0 << 3)) | kNumber, // xxx00_011, 3 kSint = ((uint8_t)(1 << 3)) | kNumber, // xxx01_011, 11 kReal = ((uint8_t)(2 << 3)) | kNumber, // xxx10_011, 19 - kNumStr = ((uint8_t)(3 << 3)) | kNumber, // xx100_011, 27 + kNumStr = ((uint8_t)(3 << 3)) | kNumber, // xxx11_011, 27 // kStringCopy: sv.p is copied, but not need free, e.g. 
node's string buffer // is dom str_ kStringCopy = kString, // xxx00_100, 4 diff --git a/include/sonic/error.h b/include/sonic/error.h index 0ea14d86..b62f9deb 100644 --- a/include/sonic/error.h +++ b/include/sonic/error.h @@ -99,6 +99,8 @@ inline const char* ErrorMsg(SonicError error) noexcept { {kErrorNoneNoMatch, "JsonPath: no match."}, }; + static_assert(sizeof(kErrorMsg) / sizeof(kErrorMsg[0]) == kErrorNums, + "kErrorMsg must stay in sync with SonicError"); const int idx = static_cast(error); if (idx < 0 || idx >= static_cast(kErrorNums)) { diff --git a/include/sonic/internal/arch/simd_skip.h b/include/sonic/internal/arch/simd_skip.h index 527bed48..f284db8a 100644 --- a/include/sonic/internal/arch/simd_skip.h +++ b/include/sonic/internal/arch/simd_skip.h @@ -652,6 +652,7 @@ class SkipScanner2 { public: virtual bool writeRaw(StringView sv) = 0; virtual bool copyCurrentStructure(StringView sv) = 0; + virtual bool copyCurrentStructureSingleResult(StringView sv) = 0; virtual bool copyCurrentStructureJsonTupleCodeGen( StringView raw, size_t index, std::vector> &result, @@ -908,7 +909,11 @@ class SkipScanner2 { jsonGenerator->writeRawValue(wb.ToStringView()); jsonGenerator->writeEndArray(); } else if (dirty == 1) { - jsonGenerator->writeRawValue(wb.ToStringView()); + if (!jsonGenerator->copyCurrentStructureSingleResult( + wb.ToStringView())) { + setError(kParseErrorUnexpect); + return false; + } } return dirty > 0; diff --git a/include/sonic/jsonpath/dump.h b/include/sonic/jsonpath/dump.h index d3adfe46..d527e7c3 100644 --- a/include/sonic/jsonpath/dump.h +++ b/include/sonic/jsonpath/dump.h @@ -12,26 +12,27 @@ namespace sonic_json { namespace internal { template sonic_force_inline std::tuple Serialize( - JsonPathResult& result) { + const JsonPathResult& result) { + auto local = result; // filter the null nodes - result.nodes.erase( - std::remove_if(result.nodes.begin(), result.nodes.end(), + local.nodes.erase( + std::remove_if(local.nodes.begin(), local.nodes.end(), 
[](const auto& node) { return node->IsNull(); }), - result.nodes.end()); + local.nodes.end()); - if (result.nodes.empty()) { - return std::make_tuple("null", kErrorNone); + if (local.nodes.empty()) { + return std::make_tuple("", kErrorNone); } WriteBuffer wb; - if (result.nodes.size() == 1) { + if (local.nodes.size() == 1) { // not serialize the single string - auto& root = result.nodes[0]; + auto& root = local.nodes[0]; if (root->IsString()) { wb.Push(root->GetStringView().data(), root->Size()); } else { auto err = - result.nodes[0] + local.nodes[0] ->template Serialize(wb); if (err != kErrorNone) { return std::make_tuple("", err); @@ -39,7 +40,7 @@ sonic_force_inline std::tuple Serialize( } } else { wb.Push('['); - for (const auto& node : result.nodes) { + for (const auto& node : local.nodes) { auto err = node->template Serialize(wb); diff --git a/include/sonic/jsonpath/jsonpath.h b/include/sonic/jsonpath/jsonpath.h index 4f6a6519..f7a1298f 100644 --- a/include/sonic/jsonpath/jsonpath.h +++ b/include/sonic/jsonpath/jsonpath.h @@ -205,7 +205,7 @@ class JsonPath : public std::vector { if (p[i] == '\'' || p[i] == '"') { valid = parseQuotedName(p, i, node); } else if ((p[i] >= '0' && p[i] <= '9') || p[i] == '-') { - valid = parseBracktedIndex(p, i, node); + valid = parseBracketedIndex(p, i, node); } else { // Unsupported bracket expression (e.g. unquoted name). 
valid = false; @@ -228,6 +228,8 @@ class JsonPath : public std::vector { sonic_force_inline bool parseNumber(StringView path, size_t& index, uint64_t& sum) { size_t start = index; + static constexpr uint64_t kInt64Max = + static_cast(std::numeric_limits::max()); // check leading zero if (index < path.size() && path[index] == '0') { index++; @@ -235,16 +237,15 @@ class JsonPath : public std::vector { } while (index < path.size() && path[index] >= '0' && path[index] <= '9') { - auto last = sum * 10 + (path[index] - '0'); - // check overflow - if (last < sum) { + const uint64_t digit = static_cast(path[index] - '0'); + if (sum > (kInt64Max - digit) / 10) { return false; } - sum = last; + sum = sum * 10 + digit; index++; } - return (sum <= INT64_MAX) && index != start; + return index != start; } // case as .abc @@ -264,8 +265,8 @@ class JsonPath : public std::vector { } // case as [123] or [-123] - sonic_force_inline bool parseBracktedIndex(StringView path, size_t& index, - JsonPathNode& node) { + sonic_force_inline bool parseBracketedIndex(StringView path, size_t& index, + JsonPathNode& node) { uint64_t sum = 0; int sign = 1; @@ -342,8 +343,7 @@ class JsonPath : public std::vector { const size_t quote_pos = static_cast(src - base); node = JsonPathNode(path.substr(start, len)); // Expect closing quote then ']'. 
- if (start == quote_pos || quote_pos + 1 >= n || - base[quote_pos + 1] != ']') { + if (quote_pos + 1 >= n || base[quote_pos + 1] != ']') { return false; } @@ -352,33 +352,6 @@ class JsonPath : public std::vector { return true; } - // case as [abc] - sonic_force_inline bool parseBrackedUnquotedKey(StringView path, - size_t& index, - JsonPathNode& node) { - size_t start = index; - while (index < path.size() && path[index] != ']') { - index++; - } - if (start == index) { - return false; - } - node = JsonPathNode(path.substr(start, index - start)); - index++; - return true; - } - - sonic_force_inline bool parseWildcard(StringView path, size_t& index, - JsonPathNode& node) { - if (index + 1 < path.size() && path[index] == '*' && - path[index + 1] == ']') { - node = JsonPathNode('*'); - index += 2; - return true; - } - return false; - } - public: // Parse with a padded, writable buffer (avoids extra copy). // See ParsePaddedInternal() for lifetime and padding requirements. diff --git a/include/sonic/jsonpath/ondemand.h b/include/sonic/jsonpath/ondemand.h index 4724b43d..4f8a72f8 100644 --- a/include/sonic/jsonpath/ondemand.h +++ b/include/sonic/jsonpath/ondemand.h @@ -28,10 +28,6 @@ namespace sonic_json { -struct JsonPathRawResult { - std::vector raw; - SonicError error; -}; template class JsonGenerator : public internal::SkipScanner2::JsonGeneratorInterface { @@ -83,6 +79,27 @@ class JsonGenerator return true; } + bool copyCurrentStructureSingleResult(StringView raw) override { + dom_doc_.template Parse(raw); + if (dom_doc_.HasParseError()) { + return false; + } + + auto n = &dom_doc_; + if (n->IsString()) { + wb_.PushStr(n->GetStringView()); + return true; + } + + auto err = n->template Serialize(wb_); + if (sonic_unlikely(err != kErrorNone)) { + return false; + } + return true; + } bool copyCurrentStructureJsonTupleCodeGen( StringView raw, size_t index, std::vector>& result, @@ -154,9 +171,10 @@ sonic_force_inline std::tuple GetByJsonPathOnDemand( return local_ret; 
}; + auto rootJsonGenerator = jsonGeneratorFactory(wb); const bool matched = scan.getJsonPath( - path, 1, jsonGeneratorFactory(wb).get(), jsonGeneratorFactory); + path, 1, rootJsonGenerator.get(), jsonGeneratorFactory); if (matched) { return std::make_tuple(std::string(wb.ToStringView()), kErrorNone); } diff --git a/tests/document_test.cpp b/tests/document_test.cpp index 6765b833..3aed1c0e 100644 --- a/tests/document_test.cpp +++ b/tests/document_test.cpp @@ -623,6 +623,23 @@ TYPED_TEST(DocumentTest, SerializeNaN) { EXPECT_STREQ(wb.ToString(), "\"-NaN\""); } +TYPED_TEST(DocumentTest, DumpPreservesEmbeddedNullInString) { + using NodeType = typename TypeParam::NodeType; + NodeType node; + std::string raw = "12"; + raw.push_back('\0'); + raw += "34"; + node.SetStringNumber(StringView(raw.data(), raw.size())); + + std::string dumped = node.Dump(); + ASSERT_EQ(dumped.size(), raw.size()); + EXPECT_EQ(dumped[0], '1'); + EXPECT_EQ(dumped[1], '2'); + EXPECT_EQ(dumped[2], '\0'); + EXPECT_EQ(dumped[3], '3'); + EXPECT_EQ(dumped[4], '4'); +} + TYPED_TEST(DocumentTest, swap) { using Document = TypeParam; Document doc1; diff --git a/tests/jsonpath_test.cpp b/tests/jsonpath_test.cpp index faad09d3..78a8dd55 100644 --- a/tests/jsonpath_test.cpp +++ b/tests/jsonpath_test.cpp @@ -222,6 +222,10 @@ TEST(JsonPathWildcard, Basic) { TestOk(R"([[123, 456], [], null])", "$[*][1]", "456"); } +TEST(JsonPathWildcard, SingleStringIsUnquotedOnDemand) { + TestOk(R"([{"a":"x"}, {}])", "$[*].a", "x"); +} + TEST(JsonPathWildcard, Primitive) { TestOk("1", "$[*]", ""); TestOk("null", "$[*]", ""); @@ -501,16 +505,19 @@ TEST(JsonPath, InvalidJsonPathMore) { // Empty key TestUnsupportedPath(json, "$."); - TestUnsupportedPath(json, R"($[""])"); - TestUnsupportedPath(json, R"($[''])"); + TestNoMatch(R"({})", R"($[""])"); + TestNoMatch(R"({})", R"($[''])"); // Index > INT64_MAX TestUnsupportedPath(json, "$[9223372036854775808]"); + TestUnsupportedPath(json, "$[43788737869027501872]"); } TEST(JsonPath, 
QuotedNameEscapes) { // Double-quoted name supports backslash escapes. TestOk(R"({"a\"b":1})", R"($["a\"b"])", "1"); + TestOk(R"({"":1})", R"($[""])", "1"); + TestOk(R"({"":1})", R"($[''])", "1"); // Invalid escape sequence should fail parsing and be treated as unsupported. TestUnsupportedPath(R"({})", R"($["a\x"])"); @@ -766,7 +773,7 @@ TEST(JsonPathDump, SerializeCoversEmptySingleAndMulti) { ASSERT_EQ(result.error, kErrorNone); auto dumped = sonic_json::internal::Serialize(result); EXPECT_EQ(std::get<1>(dumped), kErrorNone); - EXPECT_EQ(std::get<0>(dumped), "null"); + EXPECT_EQ(std::get<0>(dumped), ""); } { @@ -822,4 +829,52 @@ TEST(JsonPathDump, SerializeCoversSingleNonStringAndErrorPaths) { EXPECT_EQ(std::get<0>(dumped), ""); } } + +TEST(JsonPathDump, SerializeMatchesPublicJsonPathForFilteredNullOnly) { + Document doc; + doc.Parse(R"({"a":null})"); + ASSERT_FALSE(doc.HasParseError()); + + auto result = doc.AtJsonPath("$.a"); + ASSERT_EQ(result.error, kErrorNone); + + auto dumped = sonic_json::internal::Serialize(result); + auto public_got = GetByJsonPath(R"({"a":null})", "$.a"); + + EXPECT_EQ(std::get<1>(dumped), std::get<1>(public_got)); + EXPECT_EQ(std::get<0>(dumped), std::get<0>(public_got)); +} + +TEST(JsonPathDump, SerializeDoesNotMutateInputResult) { + Document doc; + doc.Parse(R"({"arr":[null,"x",1]})"); + ASSERT_FALSE(doc.HasParseError()); + + auto result = doc.AtJsonPath("$.arr[*]"); + ASSERT_EQ(result.error, kErrorNone); + ASSERT_EQ(result.nodes.size(), 3U); + ASSERT_TRUE(result.nodes[0]->IsNull()); + + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kErrorNone); + EXPECT_EQ(std::get<0>(dumped), R"(["x",1])"); + + ASSERT_EQ(result.nodes.size(), 3U); + EXPECT_TRUE(result.nodes[0]->IsNull()); + EXPECT_TRUE(result.nodes[1]->IsString()); +} + +TEST(JsonPathDump, SerializeAcceptsConstResult) { + Document doc; + doc.Parse(R"({"s":"abc"})"); + ASSERT_FALSE(doc.HasParseError()); + + const Document& cdoc = doc; + const 
auto result = cdoc.AtJsonPath("$.s"); + ASSERT_EQ(result.error, kErrorNone); + + auto dumped = sonic_json::internal::Serialize(result); + EXPECT_EQ(std::get<1>(dumped), kErrorNone); + EXPECT_EQ(std::get<0>(dumped), "abc"); +} } // namespace diff --git a/tests/node_test.cpp b/tests/node_test.cpp index d3270d8a..b7a6982c 100644 --- a/tests/node_test.cpp +++ b/tests/node_test.cpp @@ -850,6 +850,49 @@ TEST(DNodeTest, OwnedRawAndNumStrFreed) { EXPECT_EQ(CountingAllocator::malloc_cnt, CountingAllocator::free_cnt); } +TEST(DNodeTest, CopyRawOrNumStrWithNullAllocatorDoesNotCrash) { +#if defined(_WIN32) + GTEST_SKIP() << "Subprocess exit assertions are not enabled on Windows here."; +#else + ASSERT_EXIT( + { + using FailingNode = DNode; + InvalidAllocator alloc; + + { + Document doc; + const char* json = "123"; + doc.Parse(json, 3); + if (doc.HasParseError() || !doc.IsRaw()) { + std::_Exit(2); + } + FailingNode copied(doc, alloc); + if (!copied.IsRaw() || copied.Size() != 0 || copied.GetRaw() != "") { + std::_Exit(3); + } + } + + { + Document doc; + const std::string json = "18446744073709551616"; + doc.Parse(json.data(), + json.size()); + if (doc.HasParseError() || !doc.IsStringNumber()) { + std::_Exit(4); + } + FailingNode copied(doc, alloc); + if (!copied.IsStringNumber() || copied.Size() != 0 || + copied.GetStringView() != "") { + std::_Exit(5); + } + } + + std::_Exit(0); + }, + ::testing::ExitedWithCode(0), ""); +#endif +} + TYPED_TEST(NodeTest, SourceAllocator) { using NodeType = TypeParam; using Allocator = typename NodeType::alloc_type; diff --git a/tests/parsenumber_test.cpp b/tests/parsenumber_test.cpp index c4bcbec4..6027ffef 100644 --- a/tests/parsenumber_test.cpp +++ b/tests/parsenumber_test.cpp @@ -362,4 +362,14 @@ TEST(ParserTest, ParseStringNumber_Uint64KeepsType) { EXPECT_EQ(input, doc.Dump()) << input; } +TEST(ParserTest, ParseStringNumber_Uint64OverflowBecomesNumStr) { + Document doc; + std::string input = "18446744073709551616"; // ULLONG_MAX + 1 + 
doc.Parse(input.data(), input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + EXPECT_TRUE(doc.IsStringNumber()) << input; + EXPECT_EQ(input, doc.GetStringView()) << input; + EXPECT_EQ(input, doc.Dump()) << input; +} + } // namespace From 43620286c9a592e934b8876de4dfebeed0bfe7ca Mon Sep 17 00:00:00 2001 From: yangzhengguo Date: Tue, 24 Mar 2026 20:01:00 +0800 Subject: [PATCH 5/6] fix comments --- include/sonic/dom/flags.h | 6 +-- include/sonic/dom/parser.h | 27 ++++++++++--- include/sonic/jsonpath/dom.h | 2 +- include/sonic/jsonpath/dump.h | 2 +- tests/jsonpath_test.cpp | 6 +-- tests/parsenumber_test.cpp | 72 +++++++++++++++++++++++++++++++---- 6 files changed, 94 insertions(+), 21 deletions(-) diff --git a/include/sonic/dom/flags.h b/include/sonic/dom/flags.h index b9279343..a16a4272 100644 --- a/include/sonic/dom/flags.h +++ b/include/sonic/dom/flags.h @@ -25,9 +25,9 @@ enum class ParseFlags : uint32_t { kParseAllowUnescapedControlChars = 1 << 1, // parse all integer as raw number kParseIntegerAsRaw = 1 << 2, - // Parse numbers as number strings (NumStr) when needed. - // When enabled, floating-point numbers are stored as NumStr; integers are - // still stored as int64/uint64 when in range, otherwise stored as NumStr. + // Parse numbers as number strings (NumStr) only when they overflow the + // native numeric representation. In-range floating-point numbers stay + // double; in-range integers stay int64/uint64. kParseOverflowNumAsNumStr = 1 << 3, }; diff --git a/include/sonic/dom/parser.h b/include/sonic/dom/parser.h index 10cee9aa..c1c7458b 100644 --- a/include/sonic/dom/parser.h +++ b/include/sonic/dom/parser.h @@ -219,11 +219,6 @@ class Parser { template sonic_force_inline bool parseNumber(SAX &sax) { - // check flags - if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { - return parseNumberAsString(sax); - } - // These helper macros are used only within this function. // Define/undefine them locally to avoid leaking into includers. 
#undef FLOATING_LONGEST_DIGITS @@ -381,6 +376,10 @@ class Parser { if (sgn == -1) { if (man > ((uint64_t)1 << 63)) { // overflow signed integer + if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { + pos_ = start_idx + 1; + return parseNumberAsString(sax); + } // Assume compiler supports convert uint64 to double SET_DOUBLE_AND_RETURN(-(double)(man)); } else { @@ -396,15 +395,27 @@ class Parser { (man == kUint64Max / 10 && num <= UINT_MAX % 10)) { man = man * 10 + num; if (sgn == -1) { + if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { + pos_ = start_idx + 1; + return parseNumberAsString(sax); + } SET_DOUBLE_AND_RETURN(-(double)(man)); } else { SET_UINT_AND_RETURN(man); } } else { + if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { + pos_ = start_idx + 1; + return parseNumberAsString(sax); + } trunc = 1; goto double_fast; } } else { + if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { + pos_ = start_idx + 1; + return parseNumberAsString(sax); + } trunc = 1; goto double_fast; } @@ -499,6 +510,12 @@ class Parser { double d; SonicError error_code = parseFloatEiselLemire64(d, exp10, man, sgn, trunc, s); + if constexpr (parseFlags & ParseFlags::kParseOverflowNumAsNumStr) { + if (error_code == kParseErrorInfinity) { + pos_ = start_idx + 1; + return parseNumberAsString(sax); + } + } if (!sax.Double(d)) RETURN_SET_ERROR_CODE(kParseErrorInvalidChar); RETURN_SET_ERROR_CODE(error_code); diff --git a/include/sonic/jsonpath/dom.h b/include/sonic/jsonpath/dom.h index f9f934fc..2eea47c0 100644 --- a/include/sonic/jsonpath/dom.h +++ b/include/sonic/jsonpath/dom.h @@ -21,7 +21,7 @@ sonic_force_inline std::tuple GetByJsonPathInternal( result.nodes.end()); if (result.nodes.empty()) { - return std::make_tuple("", result.error); + return std::make_tuple("null", result.error); } WriteBuffer wb; diff --git a/include/sonic/jsonpath/dump.h b/include/sonic/jsonpath/dump.h index d527e7c3..62d50e82 100644 --- 
a/include/sonic/jsonpath/dump.h +++ b/include/sonic/jsonpath/dump.h @@ -21,7 +21,7 @@ sonic_force_inline std::tuple Serialize( local.nodes.end()); if (local.nodes.empty()) { - return std::make_tuple("", kErrorNone); + return std::make_tuple("null", kErrorNone); } WriteBuffer wb; diff --git a/tests/jsonpath_test.cpp b/tests/jsonpath_test.cpp index 78a8dd55..083ffbc8 100644 --- a/tests/jsonpath_test.cpp +++ b/tests/jsonpath_test.cpp @@ -729,10 +729,10 @@ TEST(JsonPathDom, ConstAtJsonPathIsConstCorrect) { EXPECT_EQ(n->GetStringView(), "abc"); } -TEST(JsonPathDom, NullNodeFilteredToEmptyString) { +TEST(JsonPathDom, NullNodeFilteredToNull) { auto got = GetByJsonPath(R"({"a":null})", "$.a"); EXPECT_EQ(std::get<1>(got), kErrorNone); - EXPECT_EQ(std::get<0>(got), ""); + EXPECT_EQ(std::get<0>(got), "null"); } TEST(JsonPathDom, MultiNodesSerializeAndFilterNulls) { @@ -773,7 +773,7 @@ TEST(JsonPathDump, SerializeCoversEmptySingleAndMulti) { ASSERT_EQ(result.error, kErrorNone); auto dumped = sonic_json::internal::Serialize(result); EXPECT_EQ(std::get<1>(dumped), kErrorNone); - EXPECT_EQ(std::get<0>(dumped), ""); + EXPECT_EQ(std::get<0>(dumped), "null"); } { diff --git a/tests/parsenumber_test.cpp b/tests/parsenumber_test.cpp index 6027ffef..614486c9 100644 --- a/tests/parsenumber_test.cpp +++ b/tests/parsenumber_test.cpp @@ -331,16 +331,72 @@ TEST(ParserTest, ParseInvalidNumber) { TEST(ParserTest, ParseStringNumber) { TestStringNumber("-9223372036854775809", "-9223372036854775809"); TestStringNumber("18446744073709551616", "18446744073709551616"); - TestStringNumber("-4.94065645841247E-324", "-4.94065645841247E-324"); - TestStringNumber("4.94065645841247E-324", "4.94065645841247E-324"); - TestStringNumber("1.79769313486231E308", "1.79769313486231E308"); - TestStringNumber("-1.79769313486231E308", "-1.79769313486231E308"); + TestStringNumber("1e309", "1e309"); + TestStringNumber("-1e309", "-1e309"); } -TEST(ParserTest, ParseStringNumber_NormalFloat) { - TestStringNumber("1.0", 
"1.0"); - TestStringNumber("0.1", "0.1"); - TestStringNumber("1e2", "1e2"); +TEST(ParserTest, ParseOverflowNumAsNumStr_NormalFloatStaysDouble) { + { + Document doc; + std::string input = "1.5"; + doc.Parse(input.data(), + input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + ASSERT_TRUE(doc.IsDouble()) << input; + EXPECT_DOUBLE_EQ(1.5, doc.GetDouble()) << input; + } + + { + Document doc; + std::string input = "1.0"; + doc.Parse(input.data(), + input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + ASSERT_TRUE(doc.IsDouble()) << input; + EXPECT_DOUBLE_EQ(1.0, doc.GetDouble()) << input; + } + + { + Document doc; + std::string input = "0.1"; + doc.Parse(input.data(), + input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + ASSERT_TRUE(doc.IsDouble()) << input; + EXPECT_DOUBLE_EQ(0.1, doc.GetDouble()) << input; + } + + { + Document doc; + std::string input = "1e2"; + doc.Parse(input.data(), + input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + ASSERT_TRUE(doc.IsDouble()) << input; + EXPECT_DOUBLE_EQ(100.0, doc.GetDouble()) << input; + } +} + +TEST(ParserTest, ParseOverflowNumAsNumStr_InRangeIntegerKeepsType) { + { + Document doc; + std::string input = "42"; + doc.Parse(input.data(), + input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + ASSERT_TRUE(doc.IsInt64()) << input; + EXPECT_EQ(42, doc.GetInt64()) << input; + } + + { + Document doc; + std::string input = "-42"; + doc.Parse(input.data(), + input.size()); + EXPECT_FALSE(doc.HasParseError()) << input; + ASSERT_TRUE(doc.IsInt64()) << input; + EXPECT_EQ(-42, doc.GetInt64()) << input; + } } TEST(ParserTest, ParseStringNumber_ZeroExponentStaysDouble) { From 61f46c43fe7e1480a5819f6d4b7d4ae736833db3 Mon Sep 17 00:00:00 2001 From: yangzhengguo Date: Wed, 25 Mar 2026 16:58:39 +0800 Subject: [PATCH 6/6] fix comments --- include/sonic/dom/dynamicnode.h | 2 ++ include/sonic/dom/genericnode.h | 5 ++--- include/sonic/dom/schema_handler.h | 5 +++++ 
include/sonic/internal/arch/simd_skip.h | 4 ++++ include/sonic/jsonpath/jsonpath.h | 29 ++++++++++--------------- tests/jsonpath_test.cpp | 18 ++++++++++++++- tests/node_test.cpp | 18 +++++++++++++++ tests/parse_schema_test.cpp | 11 ++++++++++ 8 files changed, 71 insertions(+), 21 deletions(-) diff --git a/include/sonic/dom/dynamicnode.h b/include/sonic/dom/dynamicnode.h index ba3555a3..74cc99aa 100644 --- a/include/sonic/dom/dynamicnode.h +++ b/include/sonic/dom/dynamicnode.h @@ -226,6 +226,8 @@ class DNode : public GenericNode> { case kStringCopy: case kStringFree: case kStringConst: + case kNumStr: + case kRaw: return this->GetStringView() == rhs.GetStringView(); case kReal: diff --git a/include/sonic/dom/genericnode.h b/include/sonic/dom/genericnode.h index 1a20419f..7b3c4e9d 100644 --- a/include/sonic/dom/genericnode.h +++ b/include/sonic/dom/genericnode.h @@ -725,9 +725,8 @@ class GenericNode { /** * @brief get specific nodes by json path - * @param path json pointer - * @retval nullptr get node failed - * @retval others success + * @param jsonpath json path + * @return JsonPathResult containing matched nodes and error state */ JsonPathResult AtJsonPath(const StringView jsonpath) { return AtJsonPathCommon(downCast(), jsonpath); diff --git a/include/sonic/dom/schema_handler.h b/include/sonic/dom/schema_handler.h index 035a1ecc..2b7f5dde 100644 --- a/include/sonic/dom/schema_handler.h +++ b/include/sonic/dom/schema_handler.h @@ -243,6 +243,11 @@ class SchemaHandler { } sonic_force_inline bool NumStr(StringView s) { + if (cur_node_) { + cur_node_->setLength(s.size(), kNumStr); + cur_node_->sv.p = s.data(); + return true; + } SONIC_ADD_NODE(); new (&st_[np_ - 1]) NodeType(); st_[np_ - 1].setLength(s.size(), kNumStr); diff --git a/include/sonic/internal/arch/simd_skip.h b/include/sonic/internal/arch/simd_skip.h index f284db8a..1f4ec258 100644 --- a/include/sonic/internal/arch/simd_skip.h +++ b/include/sonic/internal/arch/simd_skip.h @@ -993,12 +993,14 @@ class 
SkipScanner2 { RETURN_FALSE_IF_PARSE_ERROR(skipOne()); RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); } + RETURN_FALSE_IF_PARSE_ERROR(consume('}')); } else if (c == '[') { RETURN_FALSE_IF_PARSE_ERROR(consume('[')); while (peek() != ']') { RETURN_FALSE_IF_PARSE_ERROR(skipOne()); RETURN_FALSE_IF_PARSE_ERROR(skipIfPresent(',')); } + RETURN_FALSE_IF_PARSE_ERROR(consume(']')); } } return false; @@ -1103,3 +1105,5 @@ class SkipScanner2 { }; } // namespace internal } // namespace sonic_json + +#undef RETURN_FALSE_IF_PARSE_ERROR diff --git a/include/sonic/jsonpath/jsonpath.h b/include/sonic/jsonpath/jsonpath.h index f7a1298f..8f0e9a0b 100644 --- a/include/sonic/jsonpath/jsonpath.h +++ b/include/sonic/jsonpath/jsonpath.h @@ -314,31 +314,26 @@ class JsonPath : public std::vector { const char* end = base + n; size_t len = 0; // normalized path - if (quote == '\"') { - while (src < end && *src != quote) { - if (*src == '\\') { + while (src < end && *src != quote) { + if (*src == '\\') { + if (src[1] == '\'') { + *dst++ = '\''; + src += 2; + } else { if (internal::common::unescape_with_padding( reinterpret_cast(&src), reinterpret_cast(&dst)) == 0) { return false; } - } else { - *dst++ = *src++; } + } else { + *dst++ = *src++; } - if (src >= end) { - return false; - } - len = static_cast(dst - (base + start)); - } else { - while (src < end && *src != quote) { - src++; - } - if (src >= end) { - return false; - } - len = static_cast(src - (base + start)); } + if (src >= end) { + return false; + } + len = static_cast(dst - (base + start)); const size_t quote_pos = static_cast(src - base); node = JsonPathNode(path.substr(start, len)); diff --git a/tests/jsonpath_test.cpp b/tests/jsonpath_test.cpp index 083ffbc8..acf91540 100644 --- a/tests/jsonpath_test.cpp +++ b/tests/jsonpath_test.cpp @@ -388,7 +388,7 @@ TEST(JsonPath, EscapedKeySelector) { "b@": 5 })"; TestOk(json, R"($["a\\"])", "1"); - TestOk(json, R"($['a\'])", "1"); + TestOk(json, R"($['a\\'])", "1"); TestOk(json, 
R"($["b\""])", "2"); TestOk(json, R"($["b\u0041"])", "3"); TestOk(json, "$['b.9']", "4"); @@ -519,6 +519,9 @@ TEST(JsonPath, QuotedNameEscapes) { TestOk(R"({"":1})", R"($[""])", "1"); TestOk(R"({"":1})", R"($[''])", "1"); + // Single-quoted name supports backslash escapes. + TestOk(R"({"a'b":1})", R"($['a\'b'])", "1"); + // Invalid escape sequence should fail parsing and be treated as unsupported. TestUnsupportedPath(R"({})", R"($["a\x"])"); TestNoMatch(R"({})", R"($["\\"])"); @@ -699,6 +702,19 @@ TEST(JsonPath, JsonInfiniteLoop3) { EXPECT_EQ(std::get<0>(got), ""); } +TEST(JsonPath, NestedContainersDesynchronization) { + // Test for getJsonPath fallthrough handler consuming arrays/objects properly. + // We want to ensure that parsing doesn't desynchronize when a key path + // encounters a mismatched array and exits the skip block properly. + auto json = R"([[1], {"x": 2}])"; + auto path = "$[*].x"; + auto got = + GetByJsonPathOnDemand( + json, path); + EXPECT_EQ(std::get<1>(got), kErrorNone); + EXPECT_EQ(std::get<0>(got), "2"); +} + TEST(JsonPath, JsonTuple) { auto json = R"({a:1, b:2c})"; auto got = diff --git a/tests/node_test.cpp b/tests/node_test.cpp index b7a6982c..25f4a490 100644 --- a/tests/node_test.cpp +++ b/tests/node_test.cpp @@ -292,6 +292,24 @@ TYPED_TEST(NodeTest, Equal) { EXPECT_FALSE(node1 == node2); EXPECT_FALSE(node2 == node1); } + + { + sonic_json::Document doc1, doc2; + doc1.Parse("18446744073709551616"); + doc2.Parse("18446744073709551617"); + EXPECT_FALSE(doc1 == doc2); + EXPECT_FALSE(doc2 == doc1); + doc2.Parse("18446744073709551616"); + EXPECT_TRUE(doc1 == doc2); + EXPECT_TRUE(doc2 == doc1); + + sonic_json::Document doc3, doc4; + doc3.Parse( + "123456789012345678901234567890"); + doc4.Parse( + "123456789012345678901234567891"); + EXPECT_FALSE(doc3 == doc4); + } } TYPED_TEST(NodeTest, FindMember) { diff --git a/tests/parse_schema_test.cpp b/tests/parse_schema_test.cpp index cc1c02bb..2860d1c2 100644 --- a/tests/parse_schema_test.cpp +++ 
b/tests/parse_schema_test.cpp @@ -164,4 +164,15 @@ TEST(ParseSchema, FailedBasic) { TestFailed(R"(null)", R"([null,])"); } +TEST(ParseSchema, ParseOverflowNumAsNumStr) { + std::string schema = R"({"val": 1})"; + std::string json = R"({"val": 18446744073709551616})"; + Document doc; + doc.Parse(schema); + doc.ParseSchema(json); + EXPECT_FALSE(doc.HasParseError()); + EXPECT_TRUE(doc["val"].IsStringNumber()); + EXPECT_EQ(doc["val"].GetStringView(), "18446744073709551616"); +} + } // namespace