Skip to content

Commit b1ba21f

Browse files
committed
[Feature] Add inverted index NORMALIZER support
1 parent 09fc3fd commit b1ba21f

File tree

58 files changed

+2621
-122
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+2621
-122
lines changed

be/src/olap/inverted_index_parser.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,13 +167,19 @@ std::string get_parser_dict_compression_from_properties(
167167
}
168168
}
169169

170-
std::string get_custom_analyzer_string_from_properties(
170+
// Resolves the analyzer name stored in the index properties.
// The value under INVERTED_INDEX_ANALYZER_NAME_KEY is preferred; when that
// entry is absent or empty, the value under INVERTED_INDEX_NORMALIZER_NAME_KEY
// is used instead. Returns an empty string when neither entry holds a
// non-empty value.
std::string get_analyzer_name_from_properties(
171171
const std::map<std::string, std::string>& properties) {
172-
if (properties.find(INVERTED_INDEX_CUSTOM_ANALYZER_KEY) != properties.end()) {
173-
return properties.at(INVERTED_INDEX_CUSTOM_ANALYZER_KEY);
174-
} else {
175-
return "";
172+
// First choice: an explicitly configured, non-empty analyzer name.
auto it = properties.find(INVERTED_INDEX_ANALYZER_NAME_KEY);
173+
if (it != properties.end() && !it->second.empty()) {
174+
return it->second;
176175
}
176+
177+
// Fallback: a non-empty normalizer name is returned through the same accessor.
it = properties.find(INVERTED_INDEX_NORMALIZER_NAME_KEY);
178+
if (it != properties.end() && !it->second.empty()) {
179+
return it->second;
180+
}
181+
182+
return "";
177183
}
178184

179185
} // namespace doris

be/src/olap/inverted_index_parser.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ enum class InvertedIndexParserType {
4646
using CharFilterMap = std::map<std::string, std::string>;
4747

4848
struct InvertedIndexCtx {
49-
std::string custom_analyzer;
49+
std::string analyzer_name;
5050
InvertedIndexParserType parser_type;
5151
std::string parser_mode;
5252
std::string support_phrase;
@@ -97,7 +97,8 @@ const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
9797

9898
const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = "dict_compression";
9999

100-
const std::string INVERTED_INDEX_CUSTOM_ANALYZER_KEY = "analyzer";
100+
const std::string INVERTED_INDEX_ANALYZER_NAME_KEY = "analyzer";
101+
const std::string INVERTED_INDEX_NORMALIZER_NAME_KEY = "normalizer";
101102

102103
std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
103104

@@ -138,7 +139,6 @@ std::string get_parser_stopwords_from_properties(
138139
std::string get_parser_dict_compression_from_properties(
139140
const std::map<std::string, std::string>& properties);
140141

141-
std::string get_custom_analyzer_string_from_properties(
142-
const std::map<std::string, std::string>& properties);
142+
std::string get_analyzer_name_from_properties(const std::map<std::string, std::string>& properties);
143143

144144
} // namespace doris

be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919

2020
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
2121
#include "olap/rowset/segment_v2/inverted_index/char_filter/empty_char_filter_factory.h"
22+
#include "olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h"
2223
#include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
2324
#include "olap/rowset/segment_v2/inverted_index/token_filter/empty_token_filter_factory.h"
25+
#include "olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h"
2426
#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
2527
#include "olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.h"
2628
#include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
@@ -43,6 +45,9 @@ void AnalysisFactoryMgr::initialise() {
4345
"empty", []() { return std::make_shared<EmptyCharFilterFactory>(); });
4446
registerFactory<CharFilterFactory>(
4547
"char_replace", []() { return std::make_shared<CharReplaceCharFilterFactory>(); });
48+
registerFactory<CharFilterFactory>("icu_normalizer", []() {
49+
return std::make_shared<ICUNormalizerCharFilterFactory>();
50+
});
4651

4752
// tokenizer
4853
registerFactory<TokenizerFactory>(
@@ -75,6 +80,8 @@ void AnalysisFactoryMgr::initialise() {
7580
"word_delimiter", []() { return std::make_shared<WordDelimiterFilterFactory>(); });
7681
registerFactory<TokenFilterFactory>(
7782
"pinyin", []() { return std::make_shared<PinyinFilterFactory>(); });
83+
registerFactory<TokenFilterFactory>(
84+
"icu_normalizer", []() { return std::make_shared<ICUNormalizerFilterFactory>(); });
7885
});
7986
}
8087

be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy
125125

126126
std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
127127
const InvertedIndexCtx* inverted_index_ctx) {
128-
const std::string& analyzer_name = inverted_index_ctx->custom_analyzer;
128+
const std::string& analyzer_name = inverted_index_ctx->analyzer_name;
129129
if (analyzer_name.empty()) {
130130
return create_builtin_analyzer(
131131
inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
@@ -177,7 +177,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
177177
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
178178
const std::string& search_str, const std::map<std::string, std::string>& properties) {
179179
InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared<InvertedIndexCtx>(
180-
get_custom_analyzer_string_from_properties(properties),
180+
get_analyzer_name_from_properties(properties),
181181
get_inverted_index_parser_type_from_string(
182182
get_parser_string_from_properties(properties)),
183183
get_parser_mode_string_from_properties(properties),
@@ -195,7 +195,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
195195
bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
196196
auto parser_type = get_inverted_index_parser_type_from_string(
197197
get_parser_string_from_properties(properties));
198-
auto analyzer_name = get_custom_analyzer_string_from_properties(properties);
198+
auto analyzer_name = get_analyzer_name_from_properties(properties);
199199
if (!analyzer_name.empty()) {
200200
return true;
201201
}

be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
#include <memory>
2121
#include <string>
22-
#include <unordered_map>
2322

2423
#include "olap/rowset/segment_v2/inverted_index/setting.h"
2524

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "icu_normalizer_char_filter.h"
19+
20+
#include <unicode/normalizer2.h>
21+
#include <unicode/unistr.h>
22+
23+
#include "common/exception.h"
24+
#include "common/logging.h"
25+
26+
namespace doris::segment_v2::inverted_index {
27+
28+
// Builds a char filter over `reader` that normalizes text with `normalizer`.
// Throws Exception(ErrorCode::INVALID_ARGUMENT) when `normalizer` is null.
ICUNormalizerCharFilter::ICUNormalizerCharFilter(ReaderPtr reader,
                                                 std::shared_ptr<const icu::Normalizer2> normalizer)
        : DorisCharFilter(std::move(reader)), _normalizer(std::move(normalizer)) {
    // The argument has been moved into the member, so validate the member.
    const bool has_normalizer = static_cast<bool>(_normalizer);
    if (!has_normalizer) {
        throw Exception(ErrorCode::INVALID_ARGUMENT,
                        "ICUNormalizerCharFilter: normalizer cannot be null");
    }
}
36+
37+
void ICUNormalizerCharFilter::initialize() {
38+
if (_transformed_input.size() != 0) {
39+
return;
40+
}
41+
fill();
42+
}
43+
44+
// Forwards the new input to the wrapped reader and then rebuilds the
// normalized buffer from it. The order matters: fill() reads from _reader.
void ICUNormalizerCharFilter::init(const void* _value, int32_t _length, bool copyData) {
45+
_reader->init(_value, _length, copyData);
46+
fill();
47+
}
48+
49+
// Delegates to the reader over the normalized text (_transformed_input),
// not to the wrapped source reader, and returns its result unchanged.
int32_t ICUNormalizerCharFilter::read(const void** start, int32_t min, int32_t max) {
50+
return _transformed_input.read(start, min, max);
51+
}
52+
53+
// Delegates to the reader over the normalized text (_transformed_input),
// not to the wrapped source reader, and returns its result unchanged.
int32_t ICUNormalizerCharFilter::readCopy(void* start, int32_t off, int32_t len) {
54+
return _transformed_input.readCopy(start, off, len);
55+
}
56+
57+
// Reads the whole content of the wrapped reader, normalizes it into _buf and
// re-initializes _transformed_input over _buf.
void ICUNormalizerCharFilter::fill() {
    // Zero-filled scratch buffer sized to the reader's reported size.
    std::string raw(static_cast<std::string::size_type>(_reader->size()), '\0');
    _reader->readCopy(raw.data(), 0, static_cast<int32_t>(raw.size()));

    normalize_text(raw, _buf);

    // NOTE(review): copyData=false presumably leaves _transformed_input
    // pointing at _buf's storage, so _buf must stay untouched while it is
    // being read; confirm against SStringReader.
    const auto normalized_len = static_cast<int32_t>(_buf.size());
    _transformed_input.init(_buf.data(), normalized_len, false);
}
64+
65+
void ICUNormalizerCharFilter::normalize_text(const std::string& input, std::string& output) {
66+
if (input.empty()) {
67+
output.clear();
68+
return;
69+
}
70+
71+
UErrorCode status = U_ZERO_ERROR;
72+
icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(input);
73+
UNormalizationCheckResult quick_result = _normalizer->quickCheck(src16, status);
74+
if (U_SUCCESS(status) && quick_result == UNORM_YES) {
75+
output = input;
76+
return;
77+
}
78+
79+
icu::UnicodeString result16;
80+
status = U_ZERO_ERROR;
81+
_normalizer->normalize(src16, result16, status);
82+
if (U_FAILURE(status)) {
83+
LOG(WARNING) << "ICU normalize failed: " << u_errorName(status) << ", using original text";
84+
output = input;
85+
return;
86+
}
87+
88+
result16.toUTF8String(output);
89+
}
90+
91+
} // namespace doris::segment_v2::inverted_index
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include <unicode/normalizer2.h>
21+
22+
#include <string>
23+
24+
#include "char_filter.h"
25+
26+
namespace doris::segment_v2::inverted_index {
27+
28+
// Char filter that exposes an ICU-normalized view of the text provided by a
// wrapped reader. The normalized text is kept in _buf and served through
// _transformed_input.
class ICUNormalizerCharFilter : public DorisCharFilter {
29+
public:
30+
// `reader` is the wrapped source; `normalizer` is the ICU normalizer applied to its text.
ICUNormalizerCharFilter(ReaderPtr reader, std::shared_ptr<const icu::Normalizer2> normalizer);
31+
~ICUNormalizerCharFilter() override = default;
32+
33+
void initialize() override;
34+
35+
void init(const void* _value, int32_t _length, bool copyData) override;
36+
int32_t read(const void** start, int32_t min, int32_t max) override;
37+
int32_t readCopy(void* start, int32_t off, int32_t len) override;
38+
39+
// Size of the normalized text in bytes (the length of _buf).
size_t size() override { return _buf.size(); }
40+
41+
private:
42+
// Rebuilds _buf and _transformed_input from the wrapped reader.
void fill();
43+
// Normalizes `input` into `output`.
void normalize_text(const std::string& input, std::string& output);
44+
45+
// ICU normalizer applied to the text.
std::shared_ptr<const icu::Normalizer2> _normalizer;
46+
// Holds the normalized UTF-8 text.
std::string _buf;
47+
// Reader over the normalized text; read() and readCopy() are served from it.
lucene::util::SStringReader<char> _transformed_input;
48+
};
49+
using ICUNormalizerCharFilterPtr = std::shared_ptr<ICUNormalizerCharFilter>;
50+
51+
} // namespace doris::segment_v2::inverted_index
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include <unicode/filteredbrk.h>
21+
#include <unicode/normalizer2.h>
22+
#include <unicode/uniset.h>
23+
#include <unicode/unistr.h>
24+
25+
#include <memory>
26+
#include <string>
27+
28+
#include "char_filter_factory.h"
29+
#include "common/exception.h"
30+
#include "icu_normalizer_char_filter.h"
31+
32+
namespace doris::segment_v2::inverted_index {
33+
34+
class ICUNormalizerCharFilterFactory : public CharFilterFactory {
35+
public:
36+
ICUNormalizerCharFilterFactory() = default;
37+
~ICUNormalizerCharFilterFactory() override = default;
38+
39+
void initialize(const Settings& settings) override {
40+
std::string name = settings.get_string("name", "nfkc_cf");
41+
std::string mode = settings.get_string("mode", "compose");
42+
std::string unicode_set_filter = settings.get_string("unicode_set_filter", "");
43+
if (mode != "compose" && mode != "decompose") {
44+
throw Exception(ErrorCode::INVALID_ARGUMENT,
45+
"ICUNormalizerCharFilterFactory: mode must be 'compose' or "
46+
"'decompose', got: " +
47+
mode);
48+
}
49+
50+
UErrorCode status = U_ZERO_ERROR;
51+
const icu::Normalizer2* base = get_normalizer(name, mode, status);
52+
if (U_FAILURE(status) || base == nullptr) {
53+
throw Exception(ErrorCode::INVALID_ARGUMENT,
54+
"Failed to get normalizer instance for '" + name + "' with mode '" +
55+
mode + "': " + std::string(u_errorName(status)));
56+
}
57+
58+
if (unicode_set_filter.empty()) {
59+
_normalizer =
60+
std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
61+
return;
62+
}
63+
64+
icu::UnicodeSet unicode_set(icu::UnicodeString::fromUTF8(unicode_set_filter), status);
65+
if (U_FAILURE(status)) {
66+
throw Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse unicode_set_filter: " +
67+
std::string(u_errorName(status)));
68+
}
69+
if (unicode_set.isEmpty()) {
70+
_normalizer =
71+
std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
72+
return;
73+
}
74+
unicode_set.freeze();
75+
76+
_normalizer = std::make_shared<icu::FilteredNormalizer2>(*base, unicode_set);
77+
}
78+
79+
ReaderPtr create(const ReaderPtr& in) override {
80+
if (!_normalizer) {
81+
throw Exception(ErrorCode::INVALID_ARGUMENT,
82+
"ICUNormalizerCharFilterFactory not initialized. Call initialize() "
83+
"first.");
84+
}
85+
return std::make_shared<ICUNormalizerCharFilter>(in, _normalizer);
86+
}
87+
88+
private:
89+
static const icu::Normalizer2* get_normalizer(const std::string& name, const std::string& mode,
90+
UErrorCode& status) {
91+
UNormalization2Mode icu_mode = (mode == "compose" ? UNORM2_COMPOSE : UNORM2_DECOMPOSE);
92+
if (name == "nfc" || name == "nfkc" || name == "nfkc_cf") {
93+
return icu::Normalizer2::getInstance(nullptr, name.c_str(), icu_mode, status);
94+
}
95+
96+
if (name == "nfd") {
97+
return icu::Normalizer2::getNFDInstance(status);
98+
} else if (name == "nfkd") {
99+
return icu::Normalizer2::getNFKDInstance(status);
100+
}
101+
102+
status = U_ILLEGAL_ARGUMENT_ERROR;
103+
return nullptr;
104+
}
105+
106+
std::shared_ptr<const icu::Normalizer2> _normalizer;
107+
};
108+
using ICUNormalizerCharFilterFactoryPtr = std::shared_ptr<ICUNormalizerCharFilterFactory>;
109+
110+
} // namespace doris::segment_v2::inverted_index

0 commit comments

Comments
 (0)