diff --git a/builtin-functions/kphp-light/stdlib/regex-functions.txt b/builtin-functions/kphp-light/stdlib/regex-functions.txt index 54c35bb468..8eaa7f9d1b 100644 --- a/builtin-functions/kphp-light/stdlib/regex-functions.txt +++ b/builtin-functions/kphp-light/stdlib/regex-functions.txt @@ -17,14 +17,14 @@ define('PREG_SPLIT_DELIM_CAPTURE', 16); define('PREG_SPLIT_OFFSET_CAPTURE', 32); define('PREG_UNMATCHED_AS_NULL', 64); -function preg_match ($regex ::: string, $subject ::: string, &$matches ::: mixed = TODO, $flags ::: int = 0, $offset ::: int = 0): int | false; +function preg_match ($regex ::: regexp, $subject ::: string, &$matches ::: mixed = TODO, $flags ::: int = 0, $offset ::: int = 0): int | false; -function preg_match_all ($regex ::: string, $subject ::: string, &$matches ::: mixed = TODO, $flags ::: int = 0, $offset ::: int = 0): int | false; +function preg_match_all ($regex ::: regexp, $subject ::: string, &$matches ::: mixed = TODO, $flags ::: int = 0, $offset ::: int = 0): int | false; function preg_quote ($str ::: string, $delimiter ::: string = ''): string; function preg_replace( - $regex ::: string | array, + $regex ::: regexp, $replace_val ::: string | array, $subject ::: string | array, $limit ::: int = -1, @@ -32,14 +32,14 @@ function preg_replace( /** @kphp-extern-func-info interruptible */ function preg_replace_callback( - $regex ::: string | array, + $regex ::: regexp, callable(string[] $x):string $callback, $subject ::: string | array, $limit ::: int = -1, &$replace_count ::: int = TODO, $flags ::: int = 0): string | ^3 | null; -function preg_split ($pattern ::: string, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false; +function preg_split ($pattern ::: regexp, $subject ::: string, $limit ::: int = -1, $flags ::: int = 0) ::: mixed[] | false; // ===== UNSUPPORTED ===== diff --git a/compiler/code-gen/files/const-vars-init.cpp b/compiler/code-gen/files/const-vars-init.cpp index 752237fd6e..51c1519ffe 100644 --- 
a/compiler/code-gen/files/const-vars-init.cpp +++ b/compiler/code-gen/files/const-vars-init.cpp @@ -23,12 +23,15 @@ struct InitConstVar { Location save_location = stage::get_location(); VertexPtr init_val = var->init_val; - if (init_val->type() == op_conv_regexp && !G->is_output_mode_k2()) { - const auto &location = init_val->get_location(); - kphp_assert(location.function && location.file); - W << var->name << ".init (" << var->init_val << ", " << RawString(location.function->name) << ", " - << RawString(location.file->relative_file_name + ':' + std::to_string(location.line)) - << ");" << NL; + if (init_val->type() == op_conv_regexp) { + if (!G->is_output_mode_k2()) { + const auto& location = init_val->get_location(); + kphp_assert(location.function && location.file); + W << var->name << ".init (" << var->init_val << ", " << RawString(location.function->name) << ", " + << RawString(location.file->relative_file_name + ':' + std::to_string(location.line)) << ");" << NL; + } else { + W << var->name << ".compile_time_init(" << var->init_val << ");" << NL; + } } else { W << var->name << " = " << var->init_val << ";" << NL; } diff --git a/compiler/inferring/primitive-type.cpp b/compiler/inferring/primitive-type.cpp index f88e82768a..6930a6837f 100644 --- a/compiler/inferring/primitive-type.cpp +++ b/compiler/inferring/primitive-type.cpp @@ -10,6 +10,7 @@ #include "common/algorithms/find.h" #include "compiler/stage.h" +#include "compiler/compiler-core.h" const char *ptype_name(PrimitiveType id) { switch (id) { @@ -27,7 +28,7 @@ const char *ptype_name(PrimitiveType id) { case tp_shape: return "shape"; case tp_future: return "future"; case tp_future_queue: return "future_queue"; - case tp_regexp: return "regexp"; + case tp_regexp: return G->is_output_mode_k2() ? 
"kphp::regex::regexp" : "regexp" ; case tp_Class: return "Class"; case tp_object: return "object"; case tp_void: return "void"; diff --git a/runtime-light/state/image-state.h b/runtime-light/state/image-state.h index e86c79064d..7d3fabae02 100644 --- a/runtime-light/state/image-state.h +++ b/runtime-light/state/image-state.h @@ -24,6 +24,7 @@ #include "runtime-light/stdlib/math/math-state.h" #include "runtime-light/stdlib/rpc/rpc-client-state.h" #include "runtime-light/stdlib/serialization/serialization-state.h" +#include "runtime-light/stdlib/string/regex-state.h" #include "runtime-light/stdlib/string/string-state.h" #include "runtime-light/stdlib/time/time-state.h" #include "runtime-light/stdlib/visitors/shape-visitors.h" @@ -50,6 +51,7 @@ struct ImageState final : private vk::not_copyable { MathImageState math_image_state; RpcImageState rpc_image_state; CurlImageState curl_image_state; + RegexImageState regex_image_state; ImageState() noexcept { if (const int64_t sysconf_max_buffer_size{k2::sysconf(_SC_GETPW_R_SIZE_MAX)}; sysconf_max_buffer_size != -1) { diff --git a/runtime-light/stdlib/string/regex-functions.cpp b/runtime-light/stdlib/string/regex-functions.cpp index 366c3a4901..fbd21fcf26 100644 --- a/runtime-light/stdlib/string/regex-functions.cpp +++ b/runtime-light/stdlib/string/regex-functions.cpp @@ -4,754 +4,57 @@ #include "runtime-light/stdlib/string/regex-functions.h" -#include -#include #include #include #include -#include #include -#include -#include -#include -#include -#include #include #include -#include "common/containers/final_action.h" -#include "runtime-common/core/allocator/script-allocator.h" #include "runtime-common/core/runtime-core.h" -#include "runtime-common/core/std/containers.h" -#include "runtime-common/stdlib/string/mbstring-functions.h" #include "runtime-light/stdlib/diagnostics/logs.h" -#include "runtime-light/stdlib/string/regex-include.h" -#include "runtime-light/stdlib/string/regex-state.h" -namespace { +// === preg_replace 
implementation ================================================================ -using backref = std::string_view; - -bool correct_offset(int64_t& offset, std::string_view subject) noexcept { - if (offset < 0) [[unlikely]] { - offset += subject.size(); - if (offset < 0) [[unlikely]] { - offset = 0; - return true; - } - } - return offset <= subject.size(); -} - -std::optional try_get_backref(std::string_view preg_replacement) noexcept { - if (preg_replacement.empty() || !std::isdigit(preg_replacement[0])) { - return std::nullopt; - } - - if (preg_replacement.size() == 1 || !std::isdigit(preg_replacement[1])) { - return backref{preg_replacement.substr(0, 1)}; - } - - return backref{preg_replacement.substr(0, 2)}; -} - -using replacement_term = std::variant; - -class preg_replacement_parser { - std::string_view preg_replacement; - - replacement_term parse_term_internal() noexcept { - kphp::log::assertion(!preg_replacement.empty()); - auto first_char{preg_replacement.front()}; - preg_replacement = preg_replacement.substr(1); - if (preg_replacement.empty()) { - return first_char; - } - switch (first_char) { - case '$': - // $1, ${1} - if (preg_replacement.front() == '{') { - return try_get_backref(preg_replacement.substr(1)) - .and_then([this](auto br) noexcept -> std::optional { - auto digits_end_pos{1 + br.size()}; - if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { - preg_replacement = preg_replacement.substr(1 + br.size() + 1); - return br; - } - - return std::nullopt; - }) - .value_or('$'); - } - - return try_get_backref(preg_replacement) - .transform([this](auto br) noexcept -> replacement_term { - auto digits_end_pos{br.size()}; - preg_replacement = preg_replacement.substr(digits_end_pos); - return br; - }) - .value_or('$'); - - case '\\': { - // \1 - auto opt_back_reference{try_get_backref(preg_replacement).transform([this](auto br) noexcept -> replacement_term { - auto digits_end_pos{br.size()}; - preg_replacement = 
preg_replacement.substr(digits_end_pos); - return br; - })}; - if (opt_back_reference.has_value()) { - return *opt_back_reference; - } else { - auto c{preg_replacement.front()}; - if (c == '$' || c == '\\') { - preg_replacement = preg_replacement.substr(1); - return c; - } - return '\\'; - } - } - default: - return first_char; - } - } - -public: - explicit preg_replacement_parser(std::string_view preg_replacement) noexcept - : preg_replacement{preg_replacement} {} - - struct iterator { - preg_replacement_parser* parser{nullptr}; - replacement_term current_term{'\0'}; - - using difference_type = std::ptrdiff_t; - using value_type = replacement_term; - using reference = const replacement_term&; - using pointer = const replacement_term*; - using iterator_category = std::input_iterator_tag; - - iterator() noexcept = default; - explicit iterator(preg_replacement_parser* p) noexcept - : parser{p} { - if (parser->preg_replacement.empty()) { - parser = nullptr; - } else { - current_term = parser->parse_term_internal(); - } - } - - reference operator*() const noexcept { - return current_term; - } - pointer operator->() const noexcept { - return std::addressof(current_term); - } - - iterator& operator++() noexcept { - if (!parser->preg_replacement.empty()) { - current_term = parser->parse_term_internal(); - } else { - parser = nullptr; - } - return *this; - } - iterator operator++(int) noexcept { // NOLINT - iterator temp{*this}; - ++(*this); - return temp; - } - - friend bool operator==(const iterator& a, const iterator& b) noexcept { - return a.parser == b.parser; - } - friend bool operator!=(const iterator& a, const iterator& b) noexcept { - return !(a == b); - } - }; - - iterator begin() noexcept { - return iterator{this}; - } - iterator end() noexcept { - return iterator{}; - } -}; - -array to_mixed_array(const kphp::regex::details::match_results_wrapper& wrapper) noexcept { - const bool numeric_only{wrapper.name_count() == 0}; - - array 
result_map{array_size{static_cast(wrapper.max_potential_size()), numeric_only}}; - for (auto [key, value] : wrapper) { - result_map.set_value(key, value); - } - return result_map; -} - -// *** importrant *** -// in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches -void set_all_matches(const kphp::pcre2::regex& re, const kphp::stl::vector& group_names, - const kphp::pcre2::match_view& match_view, int64_t flags, std::optional> opt_all_matches) noexcept { - const auto is_pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; - const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; - const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; - - // early return in case we don't actually need to set matches - if (!opt_all_matches.has_value()) { - return; - } - - auto last_unmatched_policy{is_pattern_order ? kphp::regex::details::trailing_unmatch::include : kphp::regex::details::trailing_unmatch::skip}; - mixed matches{to_mixed_array({match_view, group_names, re.capture_count(), re.name_count(), last_unmatched_policy, is_offset_capture, is_unmatched_as_null})}; - - mixed& all_matches{(*opt_all_matches).get()}; - if (is_pattern_order) [[likely]] { - for (const auto& it : std::as_const(matches)) { - all_matches[it.get_key()].push_back(it.get_value()); - } - } else { - all_matches.push_back(matches); - } -} - -std::optional replace_regex(kphp::regex::details::info& regex_info, const kphp::pcre2::regex& re, uint64_t limit) noexcept { - regex_info.replace_count = 0; - - if (limit == 0) { - return regex_info.subject; - } - - auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.match_context) [[unlikely]] { - return std::nullopt; - } - - auto& runtime_ctx{RuntimeContext::get()}; - PCRE2_SIZE buffer_length{ - std::max({regex_info.subject.size(), static_cast(RegexInstanceState::REPLACE_BUFFER_SIZE), 
runtime_ctx.static_SB.size()})}; - runtime_ctx.static_SB.clean().reserve(buffer_length); - - size_t last_pos{}; - string output_str{}; - - kphp::log::assertion(regex_info.replacement.has_value()); - const auto& replacement{*regex_info.replacement}; - - kphp::pcre2::matcher pcre2_matcher{ - re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.match_data, regex_info.match_options}; - while (regex_info.replace_count < limit) { - auto expected_opt_match_view{pcre2_matcher.next()}; - - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); - return std::nullopt; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } - - auto& match_view{*opt_match_view}; - - output_str.append(std::next(regex_info.subject.c_str(), last_pos), match_view.match_start() - last_pos); - - auto sub_res{match_view.substitute({replacement.c_str(), replacement.size()}, {runtime_ctx.static_SB.buffer(), buffer_length}, regex_state.match_context)}; - if (!sub_res.has_value()) { - auto [needed_size, error]{sub_res.error()}; - if (error.code == PCRE2_ERROR_NOMEMORY) [[unlikely]] { - runtime_ctx.static_SB.reserve(needed_size); - buffer_length = needed_size; - sub_res = match_view.substitute({replacement.c_str(), replacement.size()}, {runtime_ctx.static_SB.buffer(), buffer_length}, regex_state.match_context); - } - if (!sub_res.has_value()) [[unlikely]] { - kphp::log::warning("pcre2_substitute error {}", sub_res.error().second); - return std::nullopt; - } - } - - output_str.append(runtime_ctx.static_SB.buffer(), *sub_res); - - last_pos = match_view.match_end(); - ++regex_info.replace_count; - } - - output_str.append(std::next(regex_info.subject.c_str(), last_pos), regex_info.subject.size() - last_pos); - - return output_str; -} - -std::optional> split_regex(kphp::regex::details::info& regex_info, 
const kphp::pcre2::regex& re, int64_t limit, bool no_empty, bool delim_capture, - bool offset_capture) noexcept { - if (limit == 0) { - limit = kphp::regex::PREG_NOLIMIT; - } - - auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.match_context) [[unlikely]] { - return std::nullopt; - } - - array output{}; - - kphp::pcre2::matcher pcre2_matcher{ - re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.match_data, regex_info.match_options}; - size_t offset{}; - for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { - auto expected_opt_match_view{pcre2_matcher.next()}; - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); - return std::nullopt; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } - - kphp::pcre2::match_view match_view{*opt_match_view}; - - if (const auto size{match_view.match_start() - offset}; !no_empty || size != 0) { - string val{std::next(regex_info.subject.c_str(), offset), static_cast(size)}; - - mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - ++out_parts_count; - } - - if (delim_capture) { - for (size_t i{1}; i < match_view.size(); i++) { - auto opt_submatch{match_view.get_group(i)}; - auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; - const auto size{submatch_string_view.size()}; - if (!no_empty || size != 0) { - string val; - if (opt_submatch.has_value()) [[likely]] { - val = string{submatch_string_view.data(), static_cast(size)}; - } - - mixed output_val; - if (offset_capture) { - output_val = - array::create(std::move(val), opt_submatch - .transform([®ex_info](auto submatch_string_view) noexcept { - 
return static_cast(std::distance(regex_info.subject.c_str(), submatch_string_view.data())); - }) - .value_or(-1)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - } - } - } - - offset = match_view.match_end(); - } - - const auto size{regex_info.subject.size() - offset}; - if (!no_empty || size != 0) { - string val{std::next(regex_info.subject.c_str(), offset), static_cast(size)}; - - mixed output_val; - if (offset_capture) { - output_val = array::create(std::move(val), static_cast(offset)); - } else { - output_val = std::move(val); - } - - output.emplace_back(std::move(output_val)); - } - - return output; -} - -} // namespace - -namespace kphp::regex { - -namespace details { - -match_results_wrapper::iterator::reference match_results_wrapper::iterator::operator*() const noexcept { - auto content_opt{m_parent.m_view.get_group_content(m_group_idx)}; - - mixed val_mixed; - - mixed unmatched_val{m_parent.m_is_unmatched_as_null ? mixed{} : mixed{string{}}}; - - if (m_parent.m_is_offset_capture) { - val_mixed = content_opt ? array::create(string{content_opt->text.data(), static_cast(content_opt->text.size())}, - static_cast(content_opt->offset)) - : array::create(unmatched_val, static_cast(-1)); - } else { - val_mixed = content_opt ? 
string{content_opt->text.data(), static_cast(content_opt->text.size())} : unmatched_val; - } - - mixed key_mixed; - if (m_yield_name) { - auto name{m_parent.m_group_names[m_group_idx].name}; - key_mixed = string{name.data(), static_cast(name.size())}; - } else { - key_mixed = static_cast(m_group_idx); - } - - return {key_mixed, val_mixed}; -} - -std::optional> compile_regex(info& regex_info) noexcept { - auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.compile_context) [[unlikely]] { - return std::nullopt; - } - - // check runtime cache - if (auto opt_ref{regex_state.get_compiled_regex(regex_info.regex)}; opt_ref.has_value()) { - const auto& [compile_options, regex_code]{opt_ref->get()}; - regex_info.compile_options = compile_options; - return regex_code; - } - - if (regex_info.regex.empty()) { - kphp::log::warning("empty regex"); - return std::nullopt; - } - - char end_delim{}; - switch (const char start_delim{regex_info.regex[0]}; start_delim) { - case '(': { - end_delim = ')'; - break; - } - case '[': { - end_delim = ']'; - break; - } - case '{': { - end_delim = '}'; - break; - } - case '<': { - end_delim = '>'; - break; - } - case '>': - case '!' ... '\'': - case '*' ... 
'/': - case ':': - case ';': - case '=': - case '?': - case '@': - case '^': - case '_': - case '`': - case '|': - case '~': { - end_delim = start_delim; - break; - } - default: { - kphp::log::warning("wrong regex delimiter {}", start_delim); - return std::nullopt; - } - } - - uint32_t compile_options{}; - std::string_view regex_body{regex_info.regex.c_str(), regex_info.regex.size()}; - - // remove start delimiter - regex_body.remove_prefix(1); - // parse compile options and skip all symbols until the end delimiter - for (; !regex_body.empty() && regex_body.back() != end_delim; regex_body.remove_suffix(1)) { - // spaces and newlines are ignored - if (regex_body.back() == ' ' || regex_body.back() == '\n') { - continue; - } - - switch (regex_body.back()) { - case 'i': { - compile_options |= PCRE2_CASELESS; - break; - } - case 'm': { - compile_options |= PCRE2_MULTILINE; - break; - } - case 's': { - compile_options |= PCRE2_DOTALL; - break; - } - case 'x': { - compile_options |= PCRE2_EXTENDED; - break; - } - case 'A': { - compile_options |= PCRE2_ANCHORED; - break; - } - case 'D': { - compile_options |= PCRE2_DOLLAR_ENDONLY; - break; - } - case 'U': { - compile_options |= PCRE2_UNGREEDY; - break; - } - case 'X': { - compile_options |= PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL; - break; - } - case 'J': { - compile_options |= PCRE2_INFO_JCHANGED; - break; - } - case 'u': { - compile_options |= PCRE2_UTF | PCRE2_UCP; - break; - } - default: { - kphp::log::warning("unsupported regex modifier {}", regex_body.back()); - break; - } - } - } - - if (regex_body.empty()) { - kphp::log::warning("no ending regex delimiter: {}", regex_info.regex.c_str()); - return std::nullopt; - } - // UTF-8 validation - if (static_cast(compile_options & PCRE2_UTF)) { - if (!mb_UTF8_check(regex_info.regex.c_str())) [[unlikely]] { - kphp::log::warning("invalid UTF-8 pattern: {}", regex_info.regex.c_str()); - return std::nullopt; - } - if (!mb_UTF8_check(regex_info.subject.c_str())) [[unlikely]] { - 
kphp::log::warning("invalid UTF-8 subject: {}", regex_info.subject.c_str()); - return std::nullopt; - } - } - - // remove the end delimiter - regex_body.remove_suffix(1); - regex_info.compile_options = compile_options; - - // compile pcre2_code - auto expected_re{pcre2::regex::compile(regex_body, regex_state.compile_context, regex_info.compile_options)}; - if (!expected_re.has_value()) [[unlikely]] { - const auto& err{expected_re.error()}; - kphp::log::warning("can't compile pcre2 regex due to error: {}", static_cast(err)); - return std::nullopt; - } - - auto& re{*expected_re}; - // add compiled code to runtime cache - return regex_state.add_compiled_regex(regex_info.regex, compile_options, std::move(re))->get().regex_code; -} - -kphp::stl::vector collect_group_names(const pcre2::regex& re) noexcept { - kphp::stl::vector names; - // initialize an array of strings to hold group names - names.resize(re.capture_count() + 1); - - if (re.name_count() == 0) { - return names; - } - - for (const auto& entry : re.group_names()) { - names[entry.index] = entry; - } - - return names; -} - -} // namespace details - -} // namespace kphp::regex - -Optional f$preg_match(const string& pattern, const string& subject, Optional>> opt_matches, - int64_t flags, int64_t offset) noexcept { - kphp::regex::details::info regex_info{pattern, subject, {}}; - - if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) - [[unlikely]] { - return false; - } - if (!correct_offset(offset, {regex_info.subject.c_str(), regex_info.subject.size()})) [[unlikely]] { - return false; - } - auto opt_re{kphp::regex::details::compile_regex(regex_info)}; - if (!opt_re.has_value()) [[unlikely]] { - return false; - } - const kphp::pcre2::regex& re{opt_re->get()}; - auto group_names{kphp::regex::details::collect_group_names(re)}; - - auto& regex_state{RegexInstanceState::get()}; - 
kphp::log::assertion(regex_state.match_context != nullptr); - - auto expected_opt_match_view{kphp::pcre2::matcher{re, - {regex_info.subject.c_str(), regex_info.subject.size()}, - static_cast(offset), - regex_state.match_context, - regex_state.match_data, - regex_info.match_options} - .next()}; - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't match by pcre2 regex due to error: {}", expected_opt_match_view.error()); - return false; - } - auto opt_match_view{*expected_opt_match_view}; - - if (opt_matches.has_value()) { - const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; - const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; - - kphp::log::assertion(std::holds_alternative>(opt_matches.val())); - auto& inner_ref{std::get>(opt_matches.val()).get()}; - inner_ref = array{}; - opt_match_view.transform([is_offset_capture, is_unmatched_as_null, &inner_ref, &group_names, &re](const auto& match_view) { - inner_ref = to_mixed_array({match_view, group_names, re.capture_count(), re.name_count(), kphp::regex::details::trailing_unmatch::skip, is_offset_capture, - is_unmatched_as_null}); - return 0; - }); - } - return opt_match_view.has_value() ? 
1 : 0; -} - -Optional f$preg_match_all(const string& pattern, const string& subject, - Optional>> opt_matches, int64_t flags, int64_t offset) noexcept { - int64_t entire_match_count{}; - kphp::regex::details::info regex_info{pattern, subject, {}}; - - if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, - kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { - return false; - } - if (!correct_offset(offset, {regex_info.subject.c_str(), regex_info.subject.size()})) [[unlikely]] { - return false; - } - auto opt_re{kphp::regex::details::compile_regex(regex_info)}; - if (!opt_re.has_value()) [[unlikely]] { - return false; - } - const auto& re{*opt_re}; - auto group_names{kphp::regex::details::collect_group_names(re)}; - - std::optional> matches{}; - if (opt_matches.has_value()) { - kphp::log::assertion(std::holds_alternative>(opt_matches.val())); - auto& inner_ref{std::get>(opt_matches.val()).get()}; - inner_ref = array{}; - matches.emplace(inner_ref); - } +mixed f$preg_replace(const kphp::regex::regexp& regex, const mixed& replacement, const mixed& subject, int64_t limit, + Optional>> opt_count) noexcept { + int64_t count{}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; - // pre-init matches in case of pattern order - if (matches.has_value() && !static_cast(flags & kphp::regex::PREG_SET_ORDER)) [[likely]] { - auto& inner_ref{(*matches).get()}; - const array init_val{}; - for (const auto [name, index] : group_names) { - if (!name.empty()) { - inner_ref.set_value(string{name.data(), static_cast(name.size())}, init_val); - } - inner_ref.push_back(init_val); - } + if (replacement.is_object()) [[unlikely]] { + kphp::log::warning("invalid replacement: object could not be converted to string"); + return {}; } - - auto& regex_state{RegexInstanceState::get()}; - kphp::log::assertion(regex_state.match_context != 
nullptr); - - kphp::pcre2::matcher pcre2_matcher{re, - {regex_info.subject.c_str(), regex_info.subject.size()}, - static_cast(offset), - regex_state.match_context, - regex_state.match_data, - regex_info.match_options}; - - while (true) { - auto expected_opt_match_view{pcre2_matcher.next()}; - - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't find all matches due to match error: {}", expected_opt_match_view.error()); - return false; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } - - kphp::pcre2::match_view match_view{*opt_match_view}; - set_all_matches(re, group_names, match_view, flags, matches); - ++entire_match_count; + if (subject.is_object()) [[unlikely]] { + kphp::log::warning("invalid subject: object could not be converted to string"); + return {}; } - return entire_match_count; -} - -Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, int64_t limit, - Optional>> opt_count) noexcept { - int64_t count{}; - vk::final_action count_finalizer{[&count, &opt_count]() noexcept { - if (opt_count.has_value()) { - kphp::log::assertion(std::holds_alternative>(opt_count.val())); - auto& inner_ref{std::get>(opt_count.val()).get()}; - inner_ref = count; - } - }}; - - if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { - kphp::log::warning("invalid limit {} in preg_replace", limit); - return {}; + if (!subject.is_array()) { + return f$preg_replace(regex, replacement, subject.to_string(), limit, count); } - // we need to replace PHP's back references with PCRE2 ones - auto parser{preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; - string pcre2_replacement{}; - for (const auto& term : parser) { - if (std::holds_alternative(term)) { - auto c{std::get(term)}; - pcre2_replacement.push_back(c); - if (c == '$') { - pcre2_replacement.push_back('$'); - } + const auto& subject_arr{subject.as_array()}; + array 
result{subject_arr.size()}; + for (const auto& it : subject_arr) { + int64_t replace_one_count{}; + if (Optional replace_result{f$preg_replace(regex, replacement, it.get_value().to_string(), limit, replace_one_count)}; replace_result.has_value()) + [[likely]] { + count += replace_one_count; + result.set_value(it.get_key(), std::move(replace_result.val())); } else { - auto backreference{std::get(term)}; - pcre2_replacement.reserve_at_least(pcre2_replacement.size() + backreference.size() + 3); - pcre2_replacement.append("${"); - pcre2_replacement.append(backreference.data(), backreference.size()); - pcre2_replacement.append("}"); + count = 0; + return {}; } } - kphp::regex::details::info regex_info{pattern, subject, pcre2_replacement}; - - auto opt_re{kphp::regex::details::compile_regex(regex_info)}; - if (!opt_re.has_value()) [[unlikely]] { - return {}; - } - const auto& re{opt_re->get()}; - auto opt_replace_result{ - replace_regex(regex_info, re, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit))}; - if (!opt_replace_result.has_value()) { - return {}; - } - count = regex_info.replace_count; - return std::move(*opt_replace_result); + return std::move(result); } Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit, Optional>> opt_count) noexcept { int64_t count{}; - vk::final_action count_finalizer{[&count, &opt_count]() noexcept { - if (opt_count.has_value()) { - kphp::log::assertion(std::holds_alternative>(opt_count.val())); - auto& inner_ref{std::get>(opt_count.val()).get()}; - inner_ref = count; - } - }}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; if (pattern.is_object()) [[unlikely]] { kphp::log::warning("invalid pattern: object could not be converted to string"); @@ -782,13 +85,7 @@ Optional f$preg_replace(const mixed& pattern, const string& replacement, Optional f$preg_replace(const mixed& pattern, const mixed& replacement, 
const string& subject, int64_t limit, Optional>> opt_count) noexcept { int64_t count{}; - vk::final_action count_finalizer{[&count, &opt_count]() noexcept { - if (opt_count.has_value()) { - kphp::log::assertion(std::holds_alternative>(opt_count.val())); - auto& inner_ref{std::get>(opt_count.val()).get()}; - inner_ref = count; - } - }}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; if (pattern.is_object()) [[unlikely]] { kphp::log::warning("invalid pattern: object could not be converted to string"); @@ -835,13 +132,7 @@ Optional f$preg_replace(const mixed& pattern, const mixed& replacement, mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed& subject, int64_t limit, Optional>> opt_count) noexcept { int64_t count{}; - vk::final_action count_finalizer{[&count, &opt_count]() noexcept { - if (opt_count.has_value()) { - kphp::log::assertion(std::holds_alternative>(opt_count.val())); - auto& inner_ref{std::get>(opt_count.val()).get()}; - inner_ref = count; - } - }}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; if (pattern.is_object()) [[unlikely]] { kphp::log::warning("invalid pattern: object could not be converted to string"); @@ -876,25 +167,3 @@ mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed return std::move(result); } - -Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { - kphp::regex::details::info regex_info{pattern, subject, {}}; - - if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, - kphp::regex::PREG_SPLIT_OFFSET_CAPTURE)) { - return false; - } - auto opt_re{kphp::regex::details::compile_regex(regex_info)}; - if (!opt_re.has_value()) [[unlikely]] { - return {}; - } - const auto& re{opt_re->get()}; - auto opt_output{split_regex(regex_info, re, limit, (flags & 
kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, // - (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, // - (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; - if (!opt_output.has_value()) [[unlikely]] { - return false; - } - - return std::move(*opt_output); -} diff --git a/runtime-light/stdlib/string/regex-functions.h b/runtime-light/stdlib/string/regex-functions.h index da1a2b03c0..69fe73486a 100644 --- a/runtime-light/stdlib/string/regex-functions.h +++ b/runtime-light/stdlib/string/regex-functions.h @@ -8,47 +8,536 @@ #include #include #include +#include #include #include #include #include "common/containers/final_action.h" +#include "runtime-common/core/allocator/script-allocator.h" #include "runtime-common/core/runtime-core.h" +#include "runtime-common/core/std/containers.h" +#include "runtime-common/stdlib/string/mbstring-functions.h" #include "runtime-light/coroutine/task.h" #include "runtime-light/coroutine/type-traits.h" #include "runtime-light/stdlib/diagnostics/logs.h" // correctly include PCRE2 lib -#include "runtime-light/stdlib/string/regex-include.h" #include "runtime-light/stdlib/string/regex-state.h" namespace kphp::regex { +class regexp final { +private: + void compile_regex(kphp::regex::details::RegexCoreState& regex_state, string pattern, const string& subject = {}) noexcept { + if (!should_compile(regex_state, pattern)) { + return; + } + if (pattern.empty()) { + kphp::log::warning("empty regex"); + return; + } + + char end_delim{}; + switch (const char start_delim{pattern[0]}; start_delim) { + case '(': { + end_delim = ')'; + break; + } + case '[': { + end_delim = ']'; + break; + } + case '{': { + end_delim = '}'; + break; + } + case '<': { + end_delim = '>'; + break; + } + case '>': + case '!' ... '\'': + case '*' ... 
'/': + case ':': + case ';': + case '=': + case '?': + case '@': + case '^': + case '_': + case '`': + case '|': + case '~': { + end_delim = start_delim; + break; + } + default: { + kphp::log::warning("wrong regex delimiter {}", start_delim); + return; + } + } + + uint32_t compile_options{}; + std::string_view regex_body{pattern.c_str(), pattern.size()}; + + // remove start delimiter + regex_body.remove_prefix(1); + // parse compile options and skip all symbols until the end delimiter + for (; !regex_body.empty() && regex_body.back() != end_delim; regex_body.remove_suffix(1)) { + // spaces and newlines are ignored + if (regex_body.back() == ' ' || regex_body.back() == '\n') { + continue; + } + + switch (regex_body.back()) { + case 'i': { + compile_options |= PCRE2_CASELESS; + break; + } + case 'm': { + compile_options |= PCRE2_MULTILINE; + break; + } + case 's': { + compile_options |= PCRE2_DOTALL; + break; + } + case 'x': { + compile_options |= PCRE2_EXTENDED; + break; + } + case 'A': { + compile_options |= PCRE2_ANCHORED; + break; + } + case 'D': { + compile_options |= PCRE2_DOLLAR_ENDONLY; + break; + } + case 'U': { + compile_options |= PCRE2_UNGREEDY; + break; + } + case 'X': { + compile_options |= PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL; // NOTE(review): PCRE2 documents this as an "extra" option for pcre2_set_compile_extra_options() - confirm it is valid in this option word + break; + } + case 'J': { + compile_options |= PCRE2_DUPNAMES; // 'J' allows duplicate group names; PCRE2_INFO_JCHANGED is a pcre2_pattern_info() request code, not a compile option + break; + } + case 'u': { + compile_options |= PCRE2_UTF | PCRE2_UCP; + break; + } + default: { + kphp::log::warning("unsupported regex modifier {}", regex_body.back()); + break; + } + } + } + + if (regex_body.empty()) { + kphp::log::warning("no ending regex delimiter: {}", pattern.c_str()); + return; + } + // UTF-8 validation + if (static_cast(compile_options & PCRE2_UTF)) { + if (!mb_UTF8_check(pattern.c_str())) [[unlikely]] { + kphp::log::warning("invalid UTF-8 pattern: {}", pattern.c_str()); + return; + } + if (!mb_UTF8_check(subject.c_str())) [[unlikely]] { + kphp::log::warning("invalid UTF-8 subject: {}", subject.c_str()); + return; + } + } + + // remove the 
end delimiter + regex_body.remove_suffix(1); + this->compile_options = compile_options; + + // compile pcre2_code + auto expected_re{kphp::pcre2::regex::compile(regex_body, regex_state.compile_context, this->compile_options)}; + if (!expected_re.has_value()) [[unlikely]] { + const auto& err{expected_re.error()}; + kphp::log::warning("can't compile pcre2 regex due to error: {}", static_cast(err)); + return; + } + + auto& re{*expected_re}; + // add compiled code to cache + m_re = regex_state.add_compiled_regex(pattern, this->compile_options, std::move(re))->get().regex_code; + } + + bool should_compile(const kphp::regex::details::RegexCoreState& regex_state, const string& pattern) noexcept { + if (!regex_state.compile_context) [[unlikely]] { + return false; + } + if (auto opt_ref{regex_state.get_compiled_regex(pattern)}; opt_ref.has_value()) { + const auto& [compile_options, regex_code]{opt_ref->get()}; + this->compile_options = compile_options; + m_re = regex_code; + return false; + } + return true; + } + + std::optional> m_re; + +public: + regexp() noexcept = default; + + explicit regexp(const string& pattern, const string& subject) noexcept { + if (!should_compile(RegexImageState::get(), pattern)) { + return; + } + compile_regex(RegexInstanceState::get(), pattern, subject); + } + + // DO NOT USE. 
For code-gen purposes only + void compile_time_init(const string& pattern) noexcept { + compile_regex(RegexImageState::get_mutable(), pattern); + } + + auto get_regex() const noexcept { + return m_re; + } + + // PCRE compile options of the regex + uint32_t compile_options{}; + uint32_t match_options{PCRE2_NO_UTF_CHECK}; + uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; +}; + +inline constexpr int64_t PREG_NOLIMIT = -1; inline constexpr int64_t PREG_NO_FLAGS = 0; +inline constexpr int64_t PREG_NO_ERROR = 0; +inline constexpr int64_t PREG_INTERNAL_ERROR = 1; +inline constexpr int64_t PREG_BACKTRACK_LIMIT_ERROR = 2; +inline constexpr int64_t PREG_RECURSION_LIMIT = 3; +inline constexpr int64_t PREG_BAD_UTF8_ERROR = 4; +inline constexpr int64_t PREG_BAD_UTF8_OFFSET_ERROR = 5; + +inline constexpr auto PREG_PATTERN_ORDER = static_cast(1U << 0U); +inline constexpr auto PREG_SET_ORDER = static_cast(1U << 1U); +inline constexpr auto PREG_OFFSET_CAPTURE = static_cast(1U << 2U); +inline constexpr auto PREG_SPLIT_NO_EMPTY = static_cast(1U << 3U); +inline constexpr auto PREG_SPLIT_DELIM_CAPTURE = static_cast(1U << 4U); +inline constexpr auto PREG_SPLIT_OFFSET_CAPTURE = static_cast(1U << 5U); +inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); namespace details { enum class trailing_unmatch : uint8_t { skip, include }; -struct info final { - const string& regex; - const string& subject; - std::optional replacement; +using backref = std::string_view; +using replacement_term = std::variant; - // PCRE compile options of the regex - uint32_t compile_options{}; +template +requires((std::is_same_v && ...) && sizeof...(Args) > 0) +bool valid_regex_flags(int64_t flags, Args... 
supported_flags) noexcept { + const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; + if (!valid) [[unlikely]] { + kphp::log::warning("invalid flags: {}", flags); + } + return valid; +} - int64_t match_count{}; - uint32_t match_options{PCRE2_NO_UTF_CHECK}; +inline bool correct_offset(int64_t& offset, std::string_view subject) noexcept { + if (offset < 0) [[unlikely]] { + offset += subject.size(); + if (offset < 0) [[unlikely]] { + offset = 0; + return true; + } + } + return offset <= subject.size(); +} - int64_t replace_count{}; - uint32_t replace_options{PCRE2_SUBSTITUTE_UNKNOWN_UNSET | PCRE2_SUBSTITUTE_UNSET_EMPTY}; +inline std::optional try_get_backref(std::string_view preg_replacement) noexcept { + if (preg_replacement.empty() || !std::isdigit(preg_replacement[0])) { + return std::nullopt; + } + + if (preg_replacement.size() == 1 || !std::isdigit(preg_replacement[1])) { + return kphp::regex::details::backref{preg_replacement.substr(0, 1)}; + } + + return kphp::regex::details::backref{preg_replacement.substr(0, 2)}; +} + +inline std::optional replace_regex(const kphp::pcre2::regex& re, const string& subject, const std::optional& replacement, uint64_t limit, + uint32_t match_options, int64_t& replace_count) noexcept { + replace_count = 0; + + if (limit == 0) { + return subject; + } + + auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + + auto& runtime_ctx{RuntimeContext::get()}; + PCRE2_SIZE buffer_length{std::max({subject.size(), static_cast(RegexInstanceState::REPLACE_BUFFER_SIZE), runtime_ctx.static_SB.size()})}; + runtime_ctx.static_SB.clean().reserve(buffer_length); + + size_t last_pos{}; + string output_str{}; + + kphp::log::assertion(replacement.has_value()); + std::string_view sv_replacement{replacement->c_str(), replacement->size()}; + + kphp::pcre2::matcher pcre2_matcher{re, {subject.c_str(), subject.size()}, {}, regex_state.match_context, 
regex_state.match_data, match_options}; + while (replace_count < limit) { + auto expected_opt_match_view{pcre2_matcher.next()}; + + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't replace by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } - info() = delete; + auto& match_view{*opt_match_view}; + + output_str.append(std::next(subject.c_str(), last_pos), match_view.match_start() - last_pos); + + auto sub_res{match_view.substitute(sv_replacement, {runtime_ctx.static_SB.buffer(), buffer_length}, regex_state.match_context)}; + if (!sub_res.has_value()) { + auto [needed_size, error]{sub_res.error()}; + if (error.code == PCRE2_ERROR_NOMEMORY) [[unlikely]] { + runtime_ctx.static_SB.reserve(needed_size); + buffer_length = needed_size; + sub_res = match_view.substitute(sv_replacement, {runtime_ctx.static_SB.buffer(), buffer_length}, regex_state.match_context); + } + if (!sub_res.has_value()) [[unlikely]] { + kphp::log::warning("pcre2_substitute error {}", sub_res.error().second); + return std::nullopt; + } + } + + output_str.append(runtime_ctx.static_SB.buffer(), *sub_res); + + last_pos = match_view.match_end(); + ++replace_count; + } - info(const string& regex_, const string& subject_, std::optional replacement_) noexcept - : regex(regex_), - subject(subject_), - replacement(std::move(replacement_)) {} + output_str.append(std::next(subject.c_str(), last_pos), subject.size() - last_pos); + + return output_str; +} + +inline std::optional> split_regex(const kphp::pcre2::regex& re, const string& subject, int64_t limit, uint32_t match_options, bool no_empty, + bool delim_capture, bool offset_capture) noexcept { + if (limit == 0) { + limit = kphp::regex::PREG_NOLIMIT; + } + + auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + return std::nullopt; + } + 
+ array output{}; + + kphp::pcre2::matcher pcre2_matcher{re, {subject.c_str(), subject.size()}, {}, regex_state.match_context, regex_state.match_data, match_options}; + size_t offset{}; + for (size_t out_parts_count{1}; limit == kphp::regex::PREG_NOLIMIT || out_parts_count < limit;) { + auto expected_opt_match_view{pcre2_matcher.next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't split by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + return std::nullopt; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + kphp::pcre2::match_view match_view{*opt_match_view}; + + if (const auto size{match_view.match_start() - offset}; !no_empty || size != 0) { + string val{std::next(subject.c_str(), offset), static_cast(size)}; + + mixed output_val{}; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + ++out_parts_count; + } + + if (delim_capture) { + for (size_t i{1}; i < match_view.size(); i++) { + auto opt_submatch{match_view.get_group(i)}; + auto submatch_string_view{opt_submatch.value_or(std::string_view{})}; + const auto size{submatch_string_view.size()}; + if (!no_empty || size != 0) { + string val{}; + if (opt_submatch.has_value()) [[likely]] { + val = string{submatch_string_view.data(), static_cast(size)}; + } + + mixed output_val{}; + if (offset_capture) { + output_val = array::create(std::move(val), opt_submatch + .transform([&subject](auto submatch_string_view) noexcept { + return static_cast(std::distance(subject.c_str(), submatch_string_view.data())); + }) + .value_or(-1)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + } + } + + offset = match_view.match_end(); + } + + const auto size{subject.size() - offset}; + if (!no_empty || size != 0) { + string 
val{std::next(subject.c_str(), offset), static_cast(size)}; + + mixed output_val{}; + if (offset_capture) { + output_val = array::create(std::move(val), static_cast(offset)); + } else { + output_val = std::move(val); + } + + output.emplace_back(std::move(output_val)); + } + + return output; +} + +class preg_replacement_parser { + std::string_view preg_replacement; + + replacement_term parse_term_internal() noexcept { + kphp::log::assertion(!preg_replacement.empty()); + auto first_char{preg_replacement.front()}; + preg_replacement = preg_replacement.substr(1); + if (preg_replacement.empty()) { + return first_char; + } + switch (first_char) { + case '$': + // $1, ${1} + if (preg_replacement.front() == '{') { + return try_get_backref(preg_replacement.substr(1)) + .and_then([this](auto br) noexcept -> std::optional { + auto digits_end_pos{1 + br.size()}; + if (digits_end_pos < preg_replacement.size() && preg_replacement[digits_end_pos] == '}') { + preg_replacement = preg_replacement.substr(1 + br.size() + 1); + return br; + } + + return std::nullopt; + }) + .value_or('$'); + } + + return try_get_backref(preg_replacement) + .transform([this](auto br) noexcept -> replacement_term { + auto digits_end_pos{br.size()}; + preg_replacement = preg_replacement.substr(digits_end_pos); + return br; + }) + .value_or('$'); + + case '\\': { + // \1 + auto opt_back_reference{try_get_backref(preg_replacement).transform([this](auto br) noexcept -> replacement_term { + auto digits_end_pos{br.size()}; + preg_replacement = preg_replacement.substr(digits_end_pos); + return br; + })}; + if (opt_back_reference.has_value()) { + return *opt_back_reference; + } else { + auto c{preg_replacement.front()}; + if (c == '$' || c == '\\') { + preg_replacement = preg_replacement.substr(1); + return c; + } + return '\\'; + } + } + default: + return first_char; + } + } + +public: + explicit preg_replacement_parser(std::string_view preg_replacement) noexcept + : preg_replacement{preg_replacement} {} + + 
struct iterator { + preg_replacement_parser* parser{nullptr}; + replacement_term current_term{'\0'}; + + using difference_type = std::ptrdiff_t; + using value_type = replacement_term; + using reference = const replacement_term&; + using pointer = const replacement_term*; + using iterator_category = std::input_iterator_tag; + + iterator() noexcept = default; + explicit iterator(preg_replacement_parser* p) noexcept + : parser{p} { + if (parser->preg_replacement.empty()) { + parser = nullptr; + } else { + current_term = parser->parse_term_internal(); + } + } + + reference operator*() const noexcept { + return current_term; + } + pointer operator->() const noexcept { + return std::addressof(current_term); + } + + iterator& operator++() noexcept { + if (!parser->preg_replacement.empty()) { + current_term = parser->parse_term_internal(); + } else { + parser = nullptr; + } + return *this; + } + iterator operator++(int) noexcept { // NOLINT + iterator temp{*this}; + ++(*this); + return temp; + } + + friend bool operator==(const iterator& a, const iterator& b) noexcept { + return a.parser == b.parser; + } + friend bool operator!=(const iterator& a, const iterator& b) noexcept { + return !(a == b); + } + }; + + iterator begin() noexcept { + return iterator{this}; + } + iterator end() noexcept { + return iterator{}; + } }; class match_results_wrapper { @@ -122,95 +611,450 @@ class match_results_wrapper { return *this; } - bool operator==(const iterator& other) const noexcept { - return m_group_idx == other.m_group_idx && m_yield_name == other.m_yield_name; + bool operator==(const iterator& other) const noexcept { + return m_group_idx == other.m_group_idx && m_yield_name == other.m_yield_name; + } + bool operator!=(const iterator& other) const noexcept { + return !(*this == other); + } + }; + + iterator begin() const noexcept { + return iterator{*this, 0}; + } + + iterator end() const noexcept { + return iterator{*this, match_count()}; + } +}; + +inline array 
to_mixed_array(const kphp::regex::details::match_results_wrapper& wrapper) noexcept { + const bool numeric_only{wrapper.name_count() == 0}; + + array result_map{array_size{static_cast(wrapper.max_potential_size()), numeric_only}}; + for (auto [key, value] : wrapper) { + result_map.set_value(key, value); + } + return result_map; +} + +inline match_results_wrapper::iterator::reference match_results_wrapper::iterator::operator*() const noexcept { + auto content_opt{m_parent.m_view.get_group_content(m_group_idx)}; + + mixed val_mixed; + + mixed unmatched_val{m_parent.m_is_unmatched_as_null ? mixed{} : mixed{string{}}}; + + if (m_parent.m_is_offset_capture) { + val_mixed = content_opt ? array::create(string{content_opt->text.data(), static_cast(content_opt->text.size())}, + static_cast(content_opt->offset)) + : array::create(unmatched_val, static_cast(-1)); + } else { + val_mixed = content_opt ? string{content_opt->text.data(), static_cast(content_opt->text.size())} : unmatched_val; + } + + mixed key_mixed; + if (m_yield_name) { + auto name{m_parent.m_group_names[m_group_idx].name}; + key_mixed = string{name.data(), static_cast(name.size())}; + } else { + key_mixed = static_cast(m_group_idx); + } + + return {key_mixed, val_mixed}; +} + +// *** importrant *** +// in case of a pattern order all_matches must already contain all groups as empty arrays before the first call to set_all_matches +inline void set_all_matches(const kphp::pcre2::regex& re, const kphp::stl::vector& group_names, + const kphp::pcre2::match_view& match_view, int64_t flags, std::optional> opt_all_matches) noexcept { + const auto is_pattern_order{!static_cast(flags & kphp::regex::PREG_SET_ORDER)}; + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + + // early return in case we don't actually need to set matches + if (!opt_all_matches.has_value()) { + return; + } + + auto 
last_unmatched_policy{is_pattern_order ? kphp::regex::details::trailing_unmatch::include : kphp::regex::details::trailing_unmatch::skip}; + mixed matches{kphp::regex::details::to_mixed_array( + {match_view, group_names, re.capture_count(), re.name_count(), last_unmatched_policy, is_offset_capture, is_unmatched_as_null})}; + + mixed& all_matches{opt_all_matches->get()}; + if (is_pattern_order) [[likely]] { + for (const auto& it : std::as_const(matches)) { + all_matches[it.get_key()].push_back(it.get_value()); + } + } else { + all_matches.push_back(matches); + } +} + +/** + * Collects all named capture groups and maps them to their group numbers. + * + * This function extracts the identifier for each named capture group and places + * it into a vector where the index exactly matches the group's capture number. + * + * @param re The compiled PCRE2 regular expression to inspect. + * @return A vector of group_name objects indexed by their group number. + * Index 0 (the whole match) and any unnamed group numbers will + * contain default/empty group_name values. 
+ * @noexcept + */ +inline kphp::stl::vector collect_group_names(const pcre2::regex& re) noexcept { + kphp::stl::vector names; + // initialize an array of strings to hold group names + names.resize(re.capture_count() + 1); + + if (re.name_count() == 0) { + return names; + } + + for (const auto& entry : re.group_names()) { + names[entry.index] = entry; + } + + return names; +} + +inline auto get_count_finalizer(int64_t& count, Optional>>& opt_count) noexcept { + return vk::final_action{[&count, &opt_count]() noexcept { + if (opt_count.has_value()) { + kphp::log::assertion(std::holds_alternative>(opt_count.val())); + auto& inner_ref{std::get>(opt_count.val()).get()}; + inner_ref = count; + } + }}; +} + +inline bool preg_match_check_args(const string& subject, int64_t flags, int64_t& offset) noexcept { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) + [[unlikely]] { + return false; + } + + if (!kphp::regex::details::correct_offset(offset, {subject.c_str(), subject.size()})) [[unlikely]] { + return false; + } + + return true; +} + +inline Optional preg_match_impl(const regexp& regex, const string& subject, + Optional>> opt_matches, int64_t flags, + int64_t offset) noexcept { + const auto opt_re{regex.get_regex()}; + if (!opt_re.has_value()) [[unlikely]] { + return false; + } + const kphp::pcre2::regex& re{opt_re->get()}; + auto group_names{kphp::regex::details::collect_group_names(re)}; + + auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(regex_state.match_context != nullptr); + + auto expected_opt_match_view{kphp::pcre2::matcher{ + re, {subject.c_str(), subject.size()}, static_cast(offset), regex_state.match_context, regex_state.match_data, regex.match_options} + .next()}; + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't match by pcre2 regex due to error: {}", expected_opt_match_view.error()); + return 
false; + } + auto opt_match_view{*expected_opt_match_view}; + + if (opt_matches.has_value()) { + const auto is_offset_capture{static_cast(flags & kphp::regex::PREG_OFFSET_CAPTURE)}; + const auto is_unmatched_as_null{static_cast(flags & kphp::regex::PREG_UNMATCHED_AS_NULL)}; + + kphp::log::assertion(std::holds_alternative>(opt_matches.val())); + auto& inner_ref{std::get>(opt_matches.val()).get()}; + inner_ref = array{}; + opt_match_view.transform([is_offset_capture, is_unmatched_as_null, &inner_ref, &group_names, &re](const auto& match_view) { + inner_ref = kphp::regex::details::to_mixed_array({match_view, group_names, re.capture_count(), re.name_count(), + kphp::regex::details::trailing_unmatch::skip, is_offset_capture, is_unmatched_as_null}); + return 0; + }); + } + return opt_match_view.has_value() ? 1 : 0; +} + +inline bool preg_match_all_check_args(const string& subject, int64_t flags, int64_t& offset) noexcept { + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_PATTERN_ORDER, kphp::regex::PREG_SET_ORDER, + kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) [[unlikely]] { + return false; + } + if (!kphp::regex::details::correct_offset(offset, {subject.c_str(), subject.size()})) [[unlikely]] { + return false; + } + + return true; +} + +inline Optional preg_match_all_impl(const regexp& regex, const string& subject, + Optional>> opt_matches, int64_t flags, + int64_t offset) noexcept { + auto opt_re{regex.get_regex()}; + if (!opt_re.has_value()) [[unlikely]] { + return false; + } + + int64_t entire_match_count{}; + const auto& re{*opt_re}; + auto group_names{kphp::regex::details::collect_group_names(re)}; + + std::optional> matches{}; + if (opt_matches.has_value()) { + kphp::log::assertion(std::holds_alternative>(opt_matches.val())); + auto& inner_ref{std::get>(opt_matches.val()).get()}; + inner_ref = array{}; + matches.emplace(inner_ref); + } + + // pre-init matches in case of pattern order + 
if (matches.has_value() && !static_cast(flags & kphp::regex::PREG_SET_ORDER)) [[likely]] { + auto& inner_ref{(*matches).get()}; + const array init_val{}; + for (const auto [name, index] : group_names) { + if (!name.empty()) { + inner_ref.set_value(string{name.data(), static_cast(name.size())}, init_val); + } + inner_ref.push_back(init_val); + } + } + + auto& regex_state{RegexInstanceState::get()}; + kphp::log::assertion(regex_state.match_context != nullptr); + + kphp::pcre2::matcher pcre2_matcher{ + re, {subject.c_str(), subject.size()}, static_cast(offset), regex_state.match_context, regex_state.match_data, regex.match_options}; + + while (true) { + auto expected_opt_match_view{pcre2_matcher.next()}; + + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't find all matches due to match error: {}", expected_opt_match_view.error()); + return false; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + kphp::pcre2::match_view match_view{*opt_match_view}; + kphp::regex::details::set_all_matches(re, group_names, match_view, flags, matches); + ++entire_match_count; + } + + return entire_match_count; +} + +inline std::optional preg_replace_preparing(const string& replacement, int64_t limit) noexcept { + if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { + kphp::log::warning("invalid limit {} in preg_replace", limit); + return std::nullopt; + } + + // we need to replace PHP's back references with PCRE2 ones + auto parser{kphp::regex::details::preg_replacement_parser{{replacement.c_str(), replacement.size()}}}; + string pcre2_replacement{}; + for (const auto& term : parser) { + if (std::holds_alternative(term)) { + auto c{std::get(term)}; + pcre2_replacement.push_back(c); + if (c == '$') { + pcre2_replacement.push_back('$'); + } + } else { + auto backreference{std::get(term)}; + pcre2_replacement.reserve_at_least(pcre2_replacement.size() + backreference.size() + 3); + 
pcre2_replacement.append("${"); + pcre2_replacement.append(backreference.data(), backreference.size()); + pcre2_replacement.append("}"); + } + } + return pcre2_replacement; +} + +inline Optional preg_replace_impl(const regexp& regex, const string& subject, const string& replacement, int64_t limit, int64_t& count) noexcept { + + auto opt_re{regex.get_regex()}; + if (!opt_re.has_value()) [[unlikely]] { + return {}; + } + + const auto& re{opt_re->get()}; + int64_t replace_count{}; + auto opt_replace_result{kphp::regex::details::replace_regex( + re, subject, std::optional{replacement}, limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit), + regex.match_options, replace_count)}; + if (!opt_replace_result.has_value()) { + return {}; + } + count = replace_count; + return std::move(*opt_replace_result); +} + +inline bool preg_replace_callback_check_args(int64_t limit = kphp::regex::PREG_NOLIMIT, int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { + if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { + kphp::log::warning("invalid limit {} in preg_replace_callback", limit); + return false; + } + if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) + [[unlikely]] { + return false; + } + return true; +} + +template> F> +kphp::coro::task> preg_replace_callback_impl(const regexp& regex, F callback, string subject, int64_t& count, + int64_t limit = kphp::regex::PREG_NOLIMIT) noexcept { + static_assert(std::same_as>, string>); + + const auto opt_re{regex.get_regex()}; + if (!opt_re.has_value()) [[unlikely]] { + co_return Optional{}; + } + const auto& re{opt_re->get()}; + auto group_names{kphp::regex::details::collect_group_names(re)}; + auto unsigned_limit{limit == kphp::regex::PREG_NOLIMIT ? 
std::numeric_limits::max() : static_cast(limit)}; + int64_t replace_count{}; + + if (limit == 0) { + count = replace_count; + co_return subject; + } + + auto& regex_state{RegexInstanceState::get()}; + if (!regex_state.match_context) [[unlikely]] { + co_return Optional{}; + } + + size_t last_pos{}; + string output_str{}; + + kphp::pcre2::matcher pcre2_matcher{re, {subject.c_str(), subject.size()}, {}, regex_state.match_context, regex_state.match_data, regex.match_options}; + while (replace_count < unsigned_limit) { + auto expected_opt_match_view{pcre2_matcher.next()}; + + if (!expected_opt_match_view.has_value()) [[unlikely]] { + kphp::log::warning("can't replace with callback by pcre2 regex due to match error: {}", expected_opt_match_view.error()); + co_return Optional{}; + } + auto opt_match_view{*expected_opt_match_view}; + if (!opt_match_view.has_value()) { + break; + } + + auto& match_view{*opt_match_view}; + + output_str.append(std::next(subject.c_str(), last_pos), match_view.match_start() - last_pos); + + last_pos = match_view.match_end(); + + // retrieve the named groups count + uint32_t named_groups_count{re.name_count()}; + + array matches{array_size{static_cast(match_view.size() + named_groups_count), named_groups_count == 0}}; + for (auto [key, value] : kphp::regex::details::match_results_wrapper{match_view, group_names, re.capture_count(), re.name_count(), + kphp::regex::details::trailing_unmatch::skip, false, false}) { + matches.set_value(key, value.to_string()); } - bool operator!=(const iterator& other) const noexcept { - return !(*this == other); + string replacement{}; + if constexpr (kphp::coro::is_async_function_v>) { + replacement = co_await std::invoke(callback, std::move(matches)); + } else { + replacement = std::invoke(callback, std::move(matches)); } - }; - iterator begin() const noexcept { - return iterator{*this, 0}; - } + output_str.append(replacement); - iterator end() const noexcept { - return iterator{*this, match_count()}; + 
++replace_count; } -}; -template -requires((std::is_same_v && ...) && sizeof...(Args) > 0) -bool valid_regex_flags(int64_t flags, Args... supported_flags) noexcept { - const bool valid{(flags & ~(supported_flags | ...)) == kphp::regex::PREG_NO_FLAGS}; - if (!valid) [[unlikely]] { - kphp::log::warning("invalid flags: {}", flags); - } - return valid; + output_str.append(std::next(subject.c_str(), last_pos), subject.size() - last_pos); + + count = replace_count; + co_return output_str; } -std::optional> compile_regex(info& regex_info) noexcept; +inline bool preg_split_check_args(int64_t flags) noexcept { + return kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_SPLIT_NO_EMPTY, kphp::regex::PREG_SPLIT_DELIM_CAPTURE, + kphp::regex::PREG_SPLIT_OFFSET_CAPTURE); +} -/** - * Collects all named capture groups and maps them to their group numbers. - * - * This function extracts the identifier for each named capture group and places - * it into a vector where the index exactly matches the group's capture number. - * - * @param re The compiled PCRE2 regular expression to inspect. - * @return A vector of group_name objects indexed by their group number. - * Index 0 (the whole match) and any unnamed group numbers will - * contain default/empty group_name values. 
- * @noexcept - */ -kphp::stl::vector collect_group_names(const pcre2::regex& re) noexcept; +inline Optional> preg_split_impl(const regexp& regex, const string& subject, int64_t limit, int64_t flags) noexcept { -} // namespace details + auto opt_re{regex.get_regex()}; + if (!opt_re.has_value()) [[unlikely]] { + return {}; + } -inline constexpr int64_t PREG_NO_ERROR = 0; -inline constexpr int64_t PREG_INTERNAL_ERROR = 1; -inline constexpr int64_t PREG_BACKTRACK_LIMIT_ERROR = 2; -inline constexpr int64_t PREG_RECURSION_LIMIT = 3; -inline constexpr int64_t PREG_BAD_UTF8_ERROR = 4; -inline constexpr int64_t PREG_BAD_UTF8_OFFSET_ERROR = 5; + const auto& re{opt_re->get()}; + auto opt_output{kphp::regex::details::split_regex(re, subject, limit, regex.match_options, (flags & kphp::regex::PREG_SPLIT_NO_EMPTY) != 0, + (flags & kphp::regex::PREG_SPLIT_DELIM_CAPTURE) != 0, + (flags & kphp::regex::PREG_SPLIT_OFFSET_CAPTURE) != 0)}; + if (!opt_output.has_value()) [[unlikely]] { + return false; + } -inline constexpr auto PREG_PATTERN_ORDER = static_cast(1U << 0U); -inline constexpr auto PREG_SET_ORDER = static_cast(1U << 1U); -inline constexpr auto PREG_OFFSET_CAPTURE = static_cast(1U << 2U); -inline constexpr auto PREG_SPLIT_NO_EMPTY = static_cast(1U << 3U); -inline constexpr auto PREG_SPLIT_DELIM_CAPTURE = static_cast(1U << 4U); -inline constexpr auto PREG_SPLIT_OFFSET_CAPTURE = static_cast(1U << 5U); -inline constexpr auto PREG_UNMATCHED_AS_NULL = static_cast(1U << 6U); + return std::move(*opt_output); +} -inline constexpr int64_t PREG_NOLIMIT = -1; +} // namespace details } // namespace kphp::regex -using regexp = string; - // === preg_match ================================================================================= +Optional f$preg_match(const kphp::regex::regexp& regex, const string& subject, + const Optional>>& opt_matches = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + Optional f$preg_match(const string& pattern, const string& 
subject, - Optional>> opt_matches = {}, + const Optional>>& opt_matches = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + +Optional f$preg_match(const mixed& pattern, const string& subject, + const Optional>>& opt_matches = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; // === preg_match_all ============================================================================= +Optional f$preg_match_all(const kphp::regex::regexp& regex, const string& subject, + const Optional>>& opt_matches = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + Optional f$preg_match_all(const string& pattern, const string& subject, - Optional>> opt_matches = {}, + const Optional>>& opt_matches = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; + +Optional f$preg_match_all(const mixed& pattern, const string& subject, + const Optional>>& opt_matches = {}, int64_t flags = kphp::regex::PREG_NO_FLAGS, int64_t offset = 0) noexcept; // === preg_replace =============================================================================== +Optional f$preg_replace(const kphp::regex::regexp& regex, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}) noexcept; +Optional f$preg_replace(const kphp::regex::regexp& regex, const mixed& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}) noexcept; +mixed f$preg_replace(const kphp::regex::regexp& regex, const string& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + const Optional>>& opt_count = {}) noexcept; +mixed f$preg_replace(const kphp::regex::regexp& regex, const mixed& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}) noexcept; + Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, 
int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; Optional f$preg_replace(const mixed& pattern, const string& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; - +mixed f$preg_replace(const mixed& pattern, const string& replace_val, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + const Optional>>& opt_count = {}) noexcept; Optional f$preg_replace(const mixed& pattern, const mixed& replacement, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; - mixed f$preg_replace(const mixed& pattern, const mixed& replacement, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}) noexcept; @@ -222,104 +1066,224 @@ auto f$preg_replace(const T1& regex, const T2& replace_val, const T3& subject, i // === preg_replace_callback ====================================================================== +template> F> +kphp::coro::task> f$preg_replace_callback(const kphp::regex::regexp& regex, F callback, string subject, + int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; + +template +kphp::coro::task f$preg_replace_callback(const kphp::regex::regexp& regex, F callback, const mixed& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; + template> F> kphp::coro::task> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, Optional>> opt_count = {}, - int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { - static_assert(std::same_as>, string>); + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; - int64_t count{}; - vk::final_action count_finalizer{[&count, &opt_count]() noexcept { - if (opt_count.has_value()) { - 
kphp::log::assertion(std::holds_alternative>(opt_count.val())); - auto& inner_ref{std::get>(opt_count.val()).get()}; - inner_ref = count; - } - }}; +template +kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; - if (limit < 0 && limit != kphp::regex::PREG_NOLIMIT) [[unlikely]] { - kphp::log::warning("invalid limit {} in preg_replace_callback", limit); - co_return Optional{}; +template +kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; + +template> +auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + Optional>> opt_count = {}, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept + -> decltype(f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, opt_count, flags)); + +// === preg_split ================================================================================= + +Optional> f$preg_split(const kphp::regex::regexp& regex, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; + +Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; + +Optional> f$preg_split(const mixed& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, + int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; + +// ================================================================================================================================================== + +// === preg_match implementation ================================================================== + 
+inline Optional f$preg_match(const kphp::regex::regexp& regex, const string& subject, + const Optional>>& opt_matches, int64_t flags, + int64_t offset) noexcept { + if (!kphp::regex::details::preg_match_check_args(subject, flags, offset)) [[unlikely]] { + return false; } + return kphp::regex::details::preg_match_impl(regex, subject, opt_matches, flags, offset); +} - kphp::regex::details::info regex_info{pattern, subject, {}}; +inline Optional f$preg_match(const string& pattern, const string& subject, + const Optional>>& opt_matches, int64_t flags, + int64_t offset) noexcept { + if (!kphp::regex::details::preg_match_check_args(subject, flags, offset)) [[unlikely]] { + return false; + } + return kphp::regex::details::preg_match_impl(kphp::regex::regexp{pattern, subject}, subject, opt_matches, flags, offset); +} - if (!kphp::regex::details::valid_regex_flags(flags, kphp::regex::PREG_NO_FLAGS, kphp::regex::PREG_OFFSET_CAPTURE, kphp::regex::PREG_UNMATCHED_AS_NULL)) - [[unlikely]] { - co_return Optional{}; +inline Optional f$preg_match(const mixed& pattern, const string& subject, + const Optional>>& opt_matches, int64_t flags, + int64_t offset) noexcept { + return f$preg_match(pattern.to_string(), subject, opt_matches, flags, offset); +} + +// === preg_match_all implementation ============================================================== + +inline Optional f$preg_match_all(const kphp::regex::regexp& regex, const string& subject, + const Optional>>& opt_matches, int64_t flags, + int64_t offset) noexcept { + if (!kphp::regex::details::preg_match_all_check_args(subject, flags, offset)) [[unlikely]] { + return false; } - auto opt_re{kphp::regex::details::compile_regex(regex_info)}; - if (!opt_re.has_value()) [[unlikely]] { - co_return Optional{}; + return kphp::regex::details::preg_match_all_impl(regex, subject, opt_matches, flags, offset); +} + +inline Optional f$preg_match_all(const string& pattern, const string& subject, + const Optional>>& opt_matches, int64_t flags, + 
int64_t offset) noexcept { + if (!kphp::regex::details::preg_match_all_check_args(subject, flags, offset)) [[unlikely]] { + return false; } - const auto& re{opt_re->get()}; - auto group_names{kphp::regex::details::collect_group_names(re)}; - auto unsigned_limit{limit == kphp::regex::PREG_NOLIMIT ? std::numeric_limits::max() : static_cast(limit)}; - regex_info.replace_count = 0; + return kphp::regex::details::preg_match_all_impl(kphp::regex::regexp{pattern, subject}, subject, opt_matches, flags, offset); +} - if (limit == 0) { - count = regex_info.replace_count; - co_return regex_info.subject; +inline Optional f$preg_match_all(const mixed& pattern, const string& subject, + const Optional>>& opt_matches, int64_t flags, + int64_t offset) noexcept { + return f$preg_match_all(pattern.to_string(), subject, opt_matches, flags, offset); +} + +// === preg_replace part of implementation ======================================================== + +inline Optional f$preg_replace(const kphp::regex::regexp& regex, const string& replacement, const string& subject, int64_t limit, + Optional>> opt_count) noexcept { + int64_t count{}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; + const auto& pcre2_replacement{kphp::regex::details::preg_replace_preparing(replacement, limit)}; + if (!pcre2_replacement.has_value()) [[unlikely]] { + return false; } + return kphp::regex::details::preg_replace_impl(regex, subject, pcre2_replacement.value(), limit, count); +} - auto& regex_state{RegexInstanceState::get()}; - if (!regex_state.match_context) [[unlikely]] { - co_return Optional{}; +inline Optional f$preg_replace(const kphp::regex::regexp& regex, const mixed& replacement, const string& subject, int64_t limit, + Optional>> opt_count) noexcept { + int64_t count{}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; + + if (replacement.is_array()) { + kphp::log::warning("parameter mismatch, pattern is a string while 
replacement is an array"); + return false; } - size_t last_pos{}; - string output_str{}; + return f$preg_replace(regex, replacement.to_string(), subject, limit, opt_count); +} - kphp::pcre2::matcher pcre2_matcher{ - re, {regex_info.subject.c_str(), regex_info.subject.size()}, {}, regex_state.match_context, regex_state.match_data, regex_info.match_options}; - while (regex_info.replace_count < unsigned_limit) { - auto expected_opt_match_view{pcre2_matcher.next()}; +inline mixed f$preg_replace(const kphp::regex::regexp& regex, const string& replacement, const mixed& subject, int64_t limit, + const Optional>>& opt_count) noexcept { + return f$preg_replace(regex, mixed{replacement}, subject, limit, opt_count); +} - if (!expected_opt_match_view.has_value()) [[unlikely]] { - kphp::log::warning("can't replace with callback by pcre2 regex due to match error: {}", expected_opt_match_view.error()); - co_return Optional{}; - } - auto opt_match_view{*expected_opt_match_view}; - if (!opt_match_view.has_value()) { - break; - } +inline Optional f$preg_replace(const string& pattern, const string& replacement, const string& subject, int64_t limit, + Optional>> opt_count) noexcept { + int64_t count{}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; + const auto& pcre2_replacement{kphp::regex::details::preg_replace_preparing(replacement, limit)}; + if (!pcre2_replacement.has_value()) [[unlikely]] { + return false; + } + const kphp::regex::regexp regex{pattern, subject}; + return kphp::regex::details::preg_replace_impl(regex, subject, pcre2_replacement.value(), limit, count); +} - auto& match_view{*opt_match_view}; +inline mixed f$preg_replace(const mixed& pattern, const string& replace_val, const mixed& subject, int64_t limit, + const Optional>>& opt_count) noexcept { + return f$preg_replace(pattern, mixed{replace_val}, subject, limit, opt_count); +} - output_str.append(std::next(regex_info.subject.c_str(), last_pos), match_view.match_start() - 
last_pos); +// === preg_replace_callback implementation ======================================================= - last_pos = match_view.match_end(); +template> F> +inline kphp::coro::task> f$preg_replace_callback(const kphp::regex::regexp& regex, F callback, string subject, int64_t limit, + Optional>> opt_count, + int64_t flags) noexcept { + static_assert(std::same_as>, string>); + int64_t count{}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; + if (!kphp::regex::details::preg_replace_callback_check_args(limit, flags)) [[unlikely]] { + co_return Optional{}; + } - // retrieve the named groups count - uint32_t named_groups_count{re.name_count()}; + co_return co_await kphp::regex::details::preg_replace_callback_impl(regex, callback, subject, count, limit); +} - array matches{array_size{static_cast(match_view.size() + named_groups_count), named_groups_count == 0}}; - for (auto [key, value] : kphp::regex::details::match_results_wrapper{match_view, group_names, re.capture_count(), re.name_count(), - kphp::regex::details::trailing_unmatch::skip, false, false}) { - matches.set_value(key, value.to_string()); +template +kphp::coro::task f$preg_replace_callback(const kphp::regex::regexp& regex, F callback, const mixed& subject, int64_t limit, + Optional>> opt_count, int64_t flags) noexcept { + if (subject.is_object()) [[unlikely]] { + kphp::log::warning("invalid subject: object could not be converted to string"); + co_return mixed{}; + } + + if (!subject.is_array()) { + co_return co_await f$preg_replace_callback(regex, std::move(callback), subject.to_string(), limit, opt_count, flags); + } + + int64_t count{}; + vk::final_action count_finalizer{[&count, &opt_count]() noexcept { + if (!opt_count.has_value()) { + return; } - string replacement{}; - if constexpr (kphp::coro::is_async_function_v>) { - replacement = co_await std::invoke(callback, std::move(matches)); + kphp::log::assertion(std::holds_alternative>(opt_count.val())); + auto& 
inner_ref{std::get>(opt_count.val()).get()}; + inner_ref = count; + }}; + + const auto& subject_arr{subject.as_array()}; + array result{subject_arr.size()}; + for (const auto& it : subject_arr) { + int64_t replace_one_count{}; + if (auto replace_result{co_await f$preg_replace_callback(regex, callback, it.get_value().to_string(), limit, replace_one_count, flags)}; + replace_result.has_value()) [[likely]] { + count += replace_one_count; + result.set_value(it.get_key(), std::move(replace_result.val())); } else { - replacement = std::invoke(callback, std::move(matches)); + count = 0; + co_return mixed{}; } - - output_str.append(replacement); - - ++regex_info.replace_count; } - output_str.append(std::next(regex_info.subject.c_str(), last_pos), regex_info.subject.size() - last_pos); + co_return std::move(result); +} - count = regex_info.replace_count; - co_return output_str; +template> F> +kphp::coro::task> f$preg_replace_callback(string pattern, F callback, string subject, int64_t limit, + Optional>> opt_count, + int64_t flags) noexcept { + static_assert(std::same_as>, string>); + int64_t count{}; + auto count_finalizer{kphp::regex::details::get_count_finalizer(count, opt_count)}; + if (!kphp::regex::details::preg_replace_callback_check_args(limit, flags)) [[unlikely]] { + co_return Optional{}; + } + const kphp::regex::regexp regex{pattern, subject}; + co_return co_await kphp::regex::details::preg_replace_callback_impl(regex, callback, subject, count, limit); } template -kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit = kphp::regex::PREG_NOLIMIT, - Optional>> opt_count = {}, - int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { +kphp::coro::task> f$preg_replace_callback(mixed pattern, F callback, string subject, int64_t limit, + Optional>> opt_count, + int64_t flags) noexcept { if (pattern.is_object()) [[unlikely]] { kphp::log::warning("invalid pattern: object could not be converted to string"); co_return Optional{}; 
@@ -357,9 +1321,8 @@ kphp::coro::task> f$preg_replace_callback(mixed pattern, F call } template -kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit = kphp::regex::PREG_NOLIMIT, - Optional>> opt_count = {}, - int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { +kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed subject, int64_t limit, + Optional>> opt_count, int64_t flags) noexcept { if (pattern.is_object()) [[unlikely]] { kphp::log::warning("invalid pattern: object could not be converted to string"); co_return mixed{}; @@ -400,21 +1363,31 @@ kphp::coro::task f$preg_replace_callback(mixed pattern, F callback, mixed co_return std::move(result); } -template> -auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, - Optional>> opt_count = {}, - int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept - -> decltype(f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, opt_count, flags)) { +template +auto f$preg_replace_callback(T1&& pattern, T2&& callback, T3&& subject, int64_t limit, + Optional>> opt_count, + int64_t flags) noexcept -> decltype(f$preg_replace_callback(std::forward(pattern), std::forward(callback), + std::forward(subject).val(), limit, opt_count, flags)) { co_return co_await f$preg_replace_callback(std::forward(pattern), std::forward(callback), std::forward(subject).val(), limit, opt_count, flags); } -// === preg_split ================================================================================= +// === preg_split implementation ================================================================== -Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, - int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept; +inline Optional> f$preg_split(const kphp::regex::regexp& regex, const string& subject, int64_t limit, int64_t 
flags) noexcept { + if (!kphp::regex::details::preg_split_check_args(flags)) { + return false; + } + return kphp::regex::details::preg_split_impl(regex, subject, limit, flags); +} + +inline Optional> f$preg_split(const string& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { + if (!kphp::regex::details::preg_split_check_args(flags)) { + return false; + } + return kphp::regex::details::preg_split_impl(kphp::regex::regexp{pattern, subject}, subject, limit, flags); +} -inline Optional> f$preg_split(const mixed& pattern, const string& subject, int64_t limit = kphp::regex::PREG_NOLIMIT, - int64_t flags = kphp::regex::PREG_NO_FLAGS) noexcept { +inline Optional> f$preg_split(const mixed& pattern, const string& subject, int64_t limit, int64_t flags) noexcept { if (!pattern.is_string()) [[unlikely]] { kphp::log::warning("preg_split() expects parameter 1 to be string, {} given", pattern.get_type_or_class_name()); return false; diff --git a/runtime-light/stdlib/string/regex-state.cpp b/runtime-light/stdlib/string/regex-state.cpp index b88d7535fd..39e14265af 100644 --- a/runtime-light/stdlib/string/regex-state.cpp +++ b/runtime-light/stdlib/string/regex-state.cpp @@ -9,3 +9,11 @@ RegexInstanceState& RegexInstanceState::get() noexcept { return InstanceState::get().regex_instance_state; } + +const RegexImageState& RegexImageState::get() noexcept { + return ImageState::get().regex_image_state; +} + +RegexImageState& RegexImageState::get_mutable() noexcept { + return ImageState::get_mutable().regex_image_state; +} diff --git a/runtime-light/stdlib/string/regex-state.h b/runtime-light/stdlib/string/regex-state.h index 0e3b8727e3..3d5cb0f876 100644 --- a/runtime-light/stdlib/string/regex-state.h +++ b/runtime-light/stdlib/string/regex-state.h @@ -15,12 +15,14 @@ #include "runtime-common/core/allocator/script-malloc-interface.h" #include "runtime-common/core/runtime-core.h" #include "runtime-common/core/std/containers.h" +#include 
"runtime-light/core/reference-counter/reference-counter-functions.h" #include "runtime-light/stdlib/diagnostics/logs.h" #include "runtime-light/stdlib/string/pcre2-functions.h" // correctly include PCRE2 lib #include "runtime-light/stdlib/string/regex-include.h" -struct RegexInstanceState final : private vk::not_copyable { +namespace kphp::regex::details { +struct RegexCoreState : private vk::not_copyable { struct compiled_regex { // PCRE compile options of the regex uint32_t compile_options{}; @@ -28,12 +30,14 @@ struct RegexInstanceState final : private vk::not_copyable { kphp::pcre2::regex regex_code; }; -private: +protected: using hasher_type = decltype([](const string& s) noexcept { return static_cast(s.hash()); }); + kphp::stl::unordered_map regex_pcre2_code_cache; - static constexpr size_t MAX_SUBPATTERNS_COUNT{512}; + virtual void set_global_const_refcnt(string& /*unused*/) noexcept {}; - kphp::stl::unordered_map regex_pcre2_code_cache; +private: + static constexpr size_t MAX_SUBPATTERNS_COUNT{512}; static void* regex_malloc(PCRE2_SIZE size, [[maybe_unused]] void* memory_data) noexcept { auto* mem{kphp::memory::script::alloc(size)}; @@ -56,27 +60,18 @@ struct RegexInstanceState final : private vk::not_copyable { kphp::pcre2::general_context general_context; kphp::pcre2::compile_context compile_context; - kphp::pcre2::match_context match_context; - kphp::pcre2::match_data match_data; - RegexInstanceState() noexcept + RegexCoreState() noexcept : general_context(pcre2_general_context_create_8(regex_malloc, regex_free, nullptr), pcre2_general_context_free_8), - compile_context(pcre2_compile_context_create_8(general_context.get()), pcre2_compile_context_free_8), - match_context(pcre2_match_context_create_8(general_context.get()), pcre2_match_context_free_8), - match_data(pcre2_match_data_create_8(OVECTOR_SIZE, general_context.get()), pcre2_match_data_free_8) { + compile_context(pcre2_compile_context_create_8(general_context.get()), pcre2_compile_context_free_8) 
{ if (!general_context) [[unlikely]] { kphp::log::error("can't create pcre2_general_context"); } if (!compile_context) [[unlikely]] { kphp::log::error("can't create pcre2_compile_context"); } - if (!match_context) [[unlikely]] { - kphp::log::error("can't create pcre2_match_context"); - } - if (!match_data) [[unlikely]] { - kphp::log::error("can't create match_data"); - } } + virtual ~RegexCoreState() = default; std::optional> get_compiled_regex(const string& regex) const noexcept { if (const auto it{regex_pcre2_code_cache.find(regex)}; it != regex_pcre2_code_cache.end()) { @@ -87,9 +82,38 @@ struct RegexInstanceState final : private vk::not_copyable { std::optional> add_compiled_regex(string regex, uint32_t compile_options, kphp::pcre2::regex regex_code) noexcept { + set_global_const_refcnt(regex); return regex_pcre2_code_cache.emplace(std::move(regex), compiled_regex{.compile_options = compile_options, .regex_code = std::move(regex_code)}) .first->second; } +}; +} // namespace kphp::regex::details + +struct RegexInstanceState final : public kphp::regex::details::RegexCoreState { + kphp::pcre2::match_context match_context; + kphp::pcre2::match_data match_data; + + RegexInstanceState() noexcept + : match_context(pcre2_match_context_create_8(general_context.get()), pcre2_match_context_free_8), + match_data(pcre2_match_data_create_8(OVECTOR_SIZE, general_context.get()), pcre2_match_data_free_8) { + if (!match_context) [[unlikely]] { + kphp::log::error("can't create pcre2_match_context"); + } + if (!match_data) [[unlikely]] { + kphp::log::error("can't create match_data"); + } + } static RegexInstanceState& get() noexcept; }; + +struct RegexImageState final : public kphp::regex::details::RegexCoreState { + static const RegexImageState& get() noexcept; + static RegexImageState& get_mutable() noexcept; + +private: + void set_global_const_refcnt(string& obj) noexcept override { + kphp::log::assertion((kphp::core::set_reference_counter_recursive(obj, 
ExtraRefCnt::for_global_const), + kphp::core::is_reference_counter_recursive(obj, ExtraRefCnt::for_global_const))); + } +};