apache · zzzxl1993 · Nov 13, 2025 · Nov 20, 2025 · Nov 20, 2025 · Nov 21, 2025
diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
@@ -167,13 +167,19 @@ std::string get_parser_dict_compression_from_properties(
     }
 }
 
-std::string get_custom_analyzer_string_from_properties(
+// Resolves the analyzer name configured in the index properties. The analyzer
+// key is consulted first and the normalizer key second; a key that is missing
+// or maps to an empty value is skipped. Returns "" when neither yields a name.
+std::string get_analyzer_name_from_properties(
         const std::map<std::string, std::string>& properties) {
-    if (properties.find(INVERTED_INDEX_CUSTOM_ANALYZER_KEY) != properties.end()) {
-        return properties.at(INVERTED_INDEX_CUSTOM_ANALYZER_KEY);
-    } else {
-        return "";
+    // Candidate keys, highest priority first.
+    for (const std::string* key :
+         {&INVERTED_INDEX_ANALYZER_NAME_KEY, &INVERTED_INDEX_NORMALIZER_NAME_KEY}) {
+        const auto entry = properties.find(*key);
+        if (entry != properties.end() && !entry->second.empty()) {
+            return entry->second;
+        }
     }
+
+    return "";
 }
 
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
@@ -46,7 +46,7 @@ enum class InvertedIndexParserType {
 using CharFilterMap = std::map<std::string, std::string>;
 
 struct InvertedIndexCtx {
-    std::string custom_analyzer;
+    std::string analyzer_name;
     InvertedIndexParserType parser_type;
     std::string parser_mode;
     std::string support_phrase;
@@ -97,7 +97,8 @@ const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
 
 const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = "dict_compression";
 
-const std::string INVERTED_INDEX_CUSTOM_ANALYZER_KEY = "analyzer";
+const std::string INVERTED_INDEX_ANALYZER_NAME_KEY = "analyzer"; // property key naming the analyzer; looked up first
+const std::string INVERTED_INDEX_NORMALIZER_NAME_KEY = "normalizer"; // property key naming the normalizer; used when no analyzer name is set
 
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
 
@@ -138,7 +139,6 @@ std::string get_parser_stopwords_from_properties(
 std::string get_parser_dict_compression_from_properties(
         const std::map<std::string, std::string>& properties);
 
-std::string get_custom_analyzer_string_from_properties(
-        const std::map<std::string, std::string>& properties);
+std::string get_analyzer_name_from_properties(const std::map<std::string, std::string>& properties); // non-empty "analyzer" value, else non-empty "normalizer" value, else ""
 
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
@@ -19,8 +19,10 @@
 
 #include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/char_filter/empty_char_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/empty_token_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
@@ -43,6 +45,9 @@ void AnalysisFactoryMgr::initialise() {
                 "empty", []() { return std::make_shared<EmptyCharFilterFactory>(); });
         registerFactory<CharFilterFactory>(
                 "char_replace", []() { return std::make_shared<CharReplaceCharFilterFactory>(); });
+        registerFactory<CharFilterFactory>("icu_normalizer", []() {
+            return std::make_shared<ICUNormalizerCharFilterFactory>();
+        });
 
         // tokenizer
         registerFactory<TokenizerFactory>(
@@ -75,6 +80,8 @@ void AnalysisFactoryMgr::initialise() {
                 "word_delimiter", []() { return std::make_shared<WordDelimiterFilterFactory>(); });
         registerFactory<TokenFilterFactory>(
                 "pinyin", []() { return std::make_shared<PinyinFilterFactory>(); });
+        registerFactory<TokenFilterFactory>(
+                "icu_normalizer", []() { return std::make_shared<ICUNormalizerFilterFactory>(); });
     });
 }
 

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -125,7 +125,7 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy
 
 std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
         const InvertedIndexCtx* inverted_index_ctx) {
-    const std::string& analyzer_name = inverted_index_ctx->custom_analyzer;
+    const std::string& analyzer_name = inverted_index_ctx->analyzer_name;
     if (analyzer_name.empty()) {
         return create_builtin_analyzer(
                 inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
@@ -177,7 +177,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
 std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
         const std::string& search_str, const std::map<std::string, std::string>& properties) {
     InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared<InvertedIndexCtx>(
-            get_custom_analyzer_string_from_properties(properties),
+            get_analyzer_name_from_properties(properties),
             get_inverted_index_parser_type_from_string(
                     get_parser_string_from_properties(properties)),
             get_parser_mode_string_from_properties(properties),
@@ -195,7 +195,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
 bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
     auto parser_type = get_inverted_index_parser_type_from_string(
             get_parser_string_from_properties(properties));
-    auto analyzer_name = get_custom_analyzer_string_from_properties(properties);
+    auto analyzer_name = get_analyzer_name_from_properties(properties);
     if (!analyzer_name.empty()) {
         return true;
     }

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h
@@ -19,7 +19,6 @@
 
 #include <memory>
 #include <string>
-#include <unordered_map>
 
 #include "olap/rowset/segment_v2/inverted_index/setting.h"
 

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.cpp
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "icu_normalizer_char_filter.h"
+
+#include <unicode/normalizer2.h>
+#include <unicode/unistr.h>
+
+#include "common/exception.h"
+#include "common/logging.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Wraps `reader` and keeps a shared handle to the ICU normalizer that will be
+// applied to its content. Throws Exception(INVALID_ARGUMENT) when `normalizer`
+// is null.
+ICUNormalizerCharFilter::ICUNormalizerCharFilter(ReaderPtr reader,
+                                                 std::shared_ptr<const icu::Normalizer2> normalizer)
+        : DorisCharFilter(std::move(reader)), _normalizer(std::move(normalizer)) {
+    const bool has_normalizer = static_cast<bool>(_normalizer);
+    if (has_normalizer) {
+        return;
+    }
+    throw Exception(ErrorCode::INVALID_ARGUMENT,
+                    "ICUNormalizerCharFilter: normalizer cannot be null");
+}
+
+// Populates the normalized buffer on demand: fill() runs only while the
+// transformed reader still reports a size of zero.
+void ICUNormalizerCharFilter::initialize() {
+    const bool needs_fill = (_transformed_input.size() == 0);
+    if (needs_fill) {
+        fill();
+    }
+}
+
+// Hands a new input buffer to the wrapped reader and normalizes it right away.
+void ICUNormalizerCharFilter::init(const void* _value, int32_t _length, bool copyData) {
+    // The wrapped reader must be reset first: fill() reads its content back out.
+    _reader->init(_value, _length, copyData);
+    this->fill();
+}
+
+// Serves the read from the reader over the normalized text rather than from
+// the wrapped reader; the result of that call is returned unchanged.
+int32_t ICUNormalizerCharFilter::read(const void** start, int32_t min, int32_t max) {
+    const int32_t outcome = _transformed_input.read(start, min, max);
+    return outcome;
+}
+
+// Copying counterpart of read(): forwards to the reader over the normalized
+// text and returns that call's result unchanged.
+int32_t ICUNormalizerCharFilter::readCopy(void* start, int32_t off, int32_t len) {
+    const int32_t outcome = _transformed_input.readCopy(start, off, len);
+    return outcome;
+}
+
+// Copies the wrapped reader's content into a temporary string, normalizes it
+// into `_buf`, and re-initializes `_transformed_input` over `_buf`.
+void ICUNormalizerCharFilter::fill() {
+    // Zero-filled scratch buffer sized to the wrapped reader's reported size.
+    std::string raw(_reader->size(), '\0');
+    const auto raw_len = static_cast<int32_t>(raw.size());
+    _reader->readCopy(raw.data(), 0, raw_len);
+
+    normalize_text(raw, _buf);
+
+    // `false` is passed for the copy flag, so the reader presumably aliases
+    // `_buf`, which lives as long as this filter.
+    const auto buf_len = static_cast<int32_t>(_buf.size());
+    _transformed_input.init(_buf.data(), buf_len, false);
+}
+
+// Normalizes the UTF-8 text in `input` with `_normalizer` and stores the UTF-8
+// result in `output`, replacing whatever `output` held before.
+//   - Empty input produces empty output.
+//   - Input that quickCheck() reports as already normalized is copied verbatim.
+//   - If normalize() fails, a warning is logged and the input is copied verbatim.
+void ICUNormalizerCharFilter::normalize_text(const std::string& input, std::string& output) {
+    if (input.empty()) {
+        output.clear();
+        return;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(input);
+    UNormalizationCheckResult quick_result = _normalizer->quickCheck(src16, status);
+    if (U_SUCCESS(status) && quick_result == UNORM_YES) {
+        output = input;
+        return;
+    }
+
+    icu::UnicodeString result16;
+    status = U_ZERO_ERROR;
+    _normalizer->normalize(src16, result16, status);
+    if (U_FAILURE(status)) {
+        LOG(WARNING) << "ICU normalize failed: " << u_errorName(status) << ", using original text";
+        output = input;
+        return;
+    }
+
+    // toUTF8String() appends to its argument. fill() passes the long-lived
+    // `_buf`, so without this reset a re-initialized filter would return the
+    // previous text followed by the new one.
+    output.clear();
+    result16.toUTF8String(output);
+}
+
+} // namespace doris::segment_v2::inverted_index
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/normalizer2.h>
+
+#include <string>
+
+#include "char_filter.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Char filter that serves the ICU-normalized form of a wrapped reader's text.
+// The normalized UTF-8 bytes are kept in `_buf`; read()/readCopy() are answered
+// by `_transformed_input`, which is initialized over that buffer.
+class ICUNormalizerCharFilter : public DorisCharFilter {
+public:
+    // Throws Exception(INVALID_ARGUMENT) when `normalizer` is null.
+    ICUNormalizerCharFilter(ReaderPtr reader, std::shared_ptr<const icu::Normalizer2> normalizer);
+    ~ICUNormalizerCharFilter() override = default;
+
+    // Normalizes the wrapped reader's content unless the transformed reader is
+    // already non-empty.
+    void initialize() override;
+    // Re-initializes the wrapped reader with new input and normalizes it.
+    void init(const void* _value, int32_t _length, bool copyData) override;
+
+    // Size in bytes of the normalized text.
+    size_t size() override {
+        return _buf.size();
+    }
+
+    // Both read paths delegate to the reader over the normalized buffer.
+    int32_t read(const void** start, int32_t min, int32_t max) override;
+    int32_t readCopy(void* start, int32_t off, int32_t len) override;
+
+private:
+    // Reads everything from the wrapped reader and rebuilds `_buf`.
+    void fill();
+    // Writes the normalized form of `input` into `output`.
+    void normalize_text(const std::string& input, std::string& output);
+
+    std::shared_ptr<const icu::Normalizer2> _normalizer;  // non-null after construction
+    std::string _buf;                                     // normalized UTF-8 text
+    lucene::util::SStringReader<char> _transformed_input; // reader over `_buf`
+};
+using ICUNormalizerCharFilterPtr = std::shared_ptr<ICUNormalizerCharFilter>;
+
+} // namespace doris::segment_v2::inverted_index
diff --git a/...rc/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h b/...rc/olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/filteredbrk.h>
+#include <unicode/normalizer2.h>
+#include <unicode/uniset.h>
+#include <unicode/unistr.h>
+
+#include <memory>
+#include <string>
+
+#include "char_filter_factory.h"
+#include "common/exception.h"
+#include "icu_normalizer_char_filter.h"
+
+namespace doris::segment_v2::inverted_index {
+
+// Factory for ICUNormalizerCharFilter.
+//
+// Settings read by initialize():
+//   name               - "nfc", "nfkc", "nfkc_cf" (default), "nfd" or "nfkd"
+//   mode               - "compose" (default) or "decompose"; validated for every
+//                        name but not consulted for "nfd"/"nfkd"
+//   unicode_set_filter - optional ICU UnicodeSet pattern; when it parses to a
+//                        non-empty set the normalizer is wrapped in an
+//                        icu::FilteredNormalizer2 over that set
+//
+// initialize() throws Exception(INVALID_ARGUMENT) for an invalid mode, an
+// unknown name, or a pattern ICU cannot parse. create() throws the same error
+// type when called before a successful initialize().
+class ICUNormalizerCharFilterFactory : public CharFilterFactory {
+public:
+    ICUNormalizerCharFilterFactory() = default;
+    ~ICUNormalizerCharFilterFactory() override = default;
+
+    void initialize(const Settings& settings) override {
+        std::string name = settings.get_string("name", "nfkc_cf");
+        std::string mode = settings.get_string("mode", "compose");
+        std::string unicode_set_filter = settings.get_string("unicode_set_filter", "");
+        if (mode != "compose" && mode != "decompose") {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "ICUNormalizerCharFilterFactory: mode must be 'compose' or "
+                            "'decompose', got: " +
+                                    mode);
+        }
+
+        UErrorCode status = U_ZERO_ERROR;
+        const icu::Normalizer2* base = get_normalizer(name, mode, status);
+        if (U_FAILURE(status) || base == nullptr) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "Failed to get normalizer instance for '" + name + "' with mode '" +
+                                    mode + "': " + std::string(u_errorName(status)));
+        }
+
+        if (unicode_set_filter.empty()) {
+            _normalizer = non_owning(base);
+            return;
+        }
+
+        icu::UnicodeSet unicode_set(icu::UnicodeString::fromUTF8(unicode_set_filter), status);
+        if (U_FAILURE(status)) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse unicode_set_filter: " +
+                                                                 std::string(u_errorName(status)));
+        }
+        if (unicode_set.isEmpty()) {
+            _normalizer = non_owning(base);
+            return;
+        }
+
+        // FilteredNormalizer2 keeps a reference to its filter set, and
+        // `unicode_set` dies when this function returns. Own a copy of the set
+        // next to the normalizer, and publish the normalizer through an aliasing
+        // shared_ptr so both live as long as any filter holds `_normalizer`.
+        auto owner = std::make_shared<OwnedFilteredNormalizer>(*base, unicode_set);
+        _normalizer = std::shared_ptr<const icu::Normalizer2>(owner, &owner->normalizer);
+    }
+
+    ReaderPtr create(const ReaderPtr& in) override {
+        if (!_normalizer) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "ICUNormalizerCharFilterFactory not initialized. Call initialize() "
+                            "first.");
+        }
+        return std::make_shared<ICUNormalizerCharFilter>(in, _normalizer);
+    }
+
+private:
+    // Bundles a filter set with the FilteredNormalizer2 that refers to it, so
+    // the two share one lifetime.
+    struct OwnedFilteredNormalizer {
+        OwnedFilteredNormalizer(const icu::Normalizer2& base, const icu::UnicodeSet& filter)
+                : set(filter), normalizer(base, set) {
+            // Freeze before first use, as the original code did with its local set.
+            set.freeze();
+        }
+        OwnedFilteredNormalizer(const OwnedFilteredNormalizer&) = delete;
+        OwnedFilteredNormalizer& operator=(const OwnedFilteredNormalizer&) = delete;
+
+        icu::UnicodeSet set; // declared first: `normalizer` binds a reference to it
+        icu::FilteredNormalizer2 normalizer;
+    };
+
+    // Wraps a normalizer returned by ICU in a shared_ptr whose deleter does
+    // nothing, because the factory does not own that instance.
+    static std::shared_ptr<const icu::Normalizer2> non_owning(const icu::Normalizer2* base) {
+        return std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
+    }
+
+    // Maps (name, mode) to an ICU normalizer. On an unknown name sets `status`
+    // to U_ILLEGAL_ARGUMENT_ERROR and returns nullptr.
+    static const icu::Normalizer2* get_normalizer(const std::string& name, const std::string& mode,
+                                                  UErrorCode& status) {
+        UNormalization2Mode icu_mode = (mode == "compose" ? UNORM2_COMPOSE : UNORM2_DECOMPOSE);
+        if (name == "nfc" || name == "nfkc" || name == "nfkc_cf") {
+            return icu::Normalizer2::getInstance(nullptr, name.c_str(), icu_mode, status);
+        }
+
+        // The dedicated decomposition getters take no mode argument.
+        if (name == "nfd") {
+            return icu::Normalizer2::getNFDInstance(status);
+        } else if (name == "nfkd") {
+            return icu::Normalizer2::getNFKDInstance(status);
+        }
+
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+    std::shared_ptr<const icu::Normalizer2> _normalizer;
+};
+using ICUNormalizerCharFilterFactoryPtr = std::shared_ptr<ICUNormalizerCharFilterFactory>;
+
+} // namespace doris::segment_v2::inverted_index