Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,19 @@ std::string get_parser_dict_compression_from_properties(
}
}

std::string get_custom_analyzer_string_from_properties(
std::string get_analyzer_name_from_properties(
const std::map<std::string, std::string>& properties) {
if (properties.find(INVERTED_INDEX_CUSTOM_ANALYZER_KEY) != properties.end()) {
return properties.at(INVERTED_INDEX_CUSTOM_ANALYZER_KEY);
} else {
return "";
auto it = properties.find(INVERTED_INDEX_ANALYZER_NAME_KEY);
if (it != properties.end() && !it->second.empty()) {
return it->second;
}

it = properties.find(INVERTED_INDEX_NORMALIZER_NAME_KEY);
if (it != properties.end() && !it->second.empty()) {
return it->second;
}

return "";
}

} // namespace doris
8 changes: 4 additions & 4 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ enum class InvertedIndexParserType {
using CharFilterMap = std::map<std::string, std::string>;

struct InvertedIndexCtx {
std::string custom_analyzer;
std::string analyzer_name;
InvertedIndexParserType parser_type;
std::string parser_mode;
std::string support_phrase;
Expand Down Expand Up @@ -97,7 +97,8 @@ const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";

const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = "dict_compression";

const std::string INVERTED_INDEX_CUSTOM_ANALYZER_KEY = "analyzer";
const std::string INVERTED_INDEX_ANALYZER_NAME_KEY = "analyzer";
const std::string INVERTED_INDEX_NORMALIZER_NAME_KEY = "normalizer";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

Expand Down Expand Up @@ -138,7 +139,6 @@ std::string get_parser_stopwords_from_properties(
std::string get_parser_dict_compression_from_properties(
const std::map<std::string, std::string>& properties);

std::string get_custom_analyzer_string_from_properties(
const std::map<std::string, std::string>& properties);
std::string get_analyzer_name_from_properties(const std::map<std::string, std::string>& properties);

} // namespace doris
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@

#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/char_filter/empty_char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/char_filter/icu_normalizer_char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/empty_token_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/icu_normalizer_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
Expand All @@ -43,6 +45,9 @@ void AnalysisFactoryMgr::initialise() {
"empty", []() { return std::make_shared<EmptyCharFilterFactory>(); });
registerFactory<CharFilterFactory>(
"char_replace", []() { return std::make_shared<CharReplaceCharFilterFactory>(); });
registerFactory<CharFilterFactory>("icu_normalizer", []() {
return std::make_shared<ICUNormalizerCharFilterFactory>();
});

// tokenizer
registerFactory<TokenizerFactory>(
Expand Down Expand Up @@ -75,6 +80,8 @@ void AnalysisFactoryMgr::initialise() {
"word_delimiter", []() { return std::make_shared<WordDelimiterFilterFactory>(); });
registerFactory<TokenFilterFactory>(
"pinyin", []() { return std::make_shared<PinyinFilterFactory>(); });
registerFactory<TokenFilterFactory>(
"icu_normalizer", []() { return std::make_shared<ICUNormalizerFilterFactory>(); });
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy

std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
const InvertedIndexCtx* inverted_index_ctx) {
const std::string& analyzer_name = inverted_index_ctx->custom_analyzer;
const std::string& analyzer_name = inverted_index_ctx->analyzer_name;
if (analyzer_name.empty()) {
return create_builtin_analyzer(
inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
Expand Down Expand Up @@ -177,7 +177,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
const std::string& search_str, const std::map<std::string, std::string>& properties) {
InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared<InvertedIndexCtx>(
get_custom_analyzer_string_from_properties(properties),
get_analyzer_name_from_properties(properties),
get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(properties)),
get_parser_mode_string_from_properties(properties),
Expand All @@ -195,7 +195,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {
auto parser_type = get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(properties));
auto analyzer_name = get_custom_analyzer_string_from_properties(properties);
auto analyzer_name = get_analyzer_name_from_properties(properties);
if (!analyzer_name.empty()) {
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <memory>
#include <string>
#include <unordered_map>

#include "olap/rowset/segment_v2/inverted_index/setting.h"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "icu_normalizer_char_filter.h"

#include <unicode/normalizer2.h>
#include <unicode/unistr.h>

#include "common/exception.h"
#include "common/logging.h"

namespace doris::segment_v2::inverted_index {

// Wraps `reader` and remembers the normalizer that fill() will apply to its content.
// Throws Exception(INVALID_ARGUMENT) when no normalizer is supplied.
ICUNormalizerCharFilter::ICUNormalizerCharFilter(ReaderPtr reader,
                                                 std::shared_ptr<const icu::Normalizer2> normalizer)
        : DorisCharFilter(std::move(reader)), _normalizer(std::move(normalizer)) {
    // The check runs after the member has been moved into place, so it inspects
    // `_normalizer` rather than the (now moved-from) parameter.
    const bool has_normalizer = static_cast<bool>(_normalizer);
    if (!has_normalizer) {
        throw Exception(ErrorCode::INVALID_ARGUMENT,
                        "ICUNormalizerCharFilter: normalizer cannot be null");
    }
}

// Populates the normalized buffer unless the internal reader already reports
// a non-zero size (i.e. a previous fill() produced content).
void ICUNormalizerCharFilter::initialize() {
    const bool already_filled = _transformed_input.size() != 0;
    if (!already_filled) {
        fill();
    }
}

// Re-targets the wrapped reader at a new input and immediately rebuilds the
// normalized buffer from it. All three arguments are forwarded unchanged.
void ICUNormalizerCharFilter::init(const void* _value, int32_t _length, bool copyData) {
    // Step 1: hand the raw input to the underlying reader.
    _reader->init(_value, _length, copyData);

    // Step 2: regenerate `_buf` / `_transformed_input` from the new content.
    fill();
}

// Delegates to the reader over the normalized buffer; the result of that call
// is returned as-is.
int32_t ICUNormalizerCharFilter::read(const void** start, int32_t min, int32_t max) {
    const int32_t result = _transformed_input.read(start, min, max);
    return result;
}

// Delegates to the reader over the normalized buffer; the result of that call
// is returned as-is.
int32_t ICUNormalizerCharFilter::readCopy(void* start, int32_t off, int32_t len) {
    const int32_t result = _transformed_input.readCopy(start, off, len);
    return result;
}

// Pulls the full content of the wrapped reader, normalizes it into `_buf`, and
// points `_transformed_input` at `_buf` (third init argument is `false`).
void ICUNormalizerCharFilter::fill() {
    // Size the scratch string up front so readCopy() can write straight into it.
    std::string raw(_reader->size(), '\0');
    const auto raw_len = static_cast<int32_t>(raw.size());
    _reader->readCopy(raw.data(), 0, raw_len);

    normalize_text(raw, _buf);

    const auto normalized_len = static_cast<int32_t>(_buf.size());
    _transformed_input.init(_buf.data(), normalized_len, false);
}

// Normalizes UTF-8 `input` with `_normalizer` and stores the UTF-8 result in `output`.
//
// On every exit path `output` is overwritten (never appended to):
//   - empty input                       -> `output` is cleared;
//   - quick check says already normal   -> `output` is a copy of `input`;
//   - ICU normalize() reports a failure -> a warning is logged and `output` is a copy of `input`;
//   - otherwise                         -> `output` holds the normalized text.
void ICUNormalizerCharFilter::normalize_text(const std::string& input, std::string& output) {
    if (input.empty()) {
        output.clear();
        return;
    }

    UErrorCode status = U_ZERO_ERROR;
    icu::UnicodeString src16 = icu::UnicodeString::fromUTF8(input);

    // Fast path: skip the normalization pass when ICU can tell the text is already normalized.
    UNormalizationCheckResult quick_result = _normalizer->quickCheck(src16, status);
    if (U_SUCCESS(status) && quick_result == UNORM_YES) {
        output = input;
        return;
    }

    icu::UnicodeString result16;
    // Reset: a failed quick check must not poison the real normalization call.
    status = U_ZERO_ERROR;
    _normalizer->normalize(src16, result16, status);
    if (U_FAILURE(status)) {
        LOG(WARNING) << "ICU normalize failed: " << u_errorName(status) << ", using original text";
        output = input;
        return;
    }

    // toUTF8String() APPENDS to its destination. `output` is typically the filter's
    // long-lived buffer, which still holds the previous text when the filter is
    // re-initialised, so it must be emptied first to avoid concatenating inputs.
    output.clear();
    result16.toUTF8String(output);
}

} // namespace doris::segment_v2::inverted_index
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <unicode/normalizer2.h>

#include <string>

#include "char_filter.h"

namespace doris::segment_v2::inverted_index {

// Char filter that reads the whole content of a wrapped reader, runs it through
// an ICU Normalizer2, and then serves the normalized bytes from an internal buffer.
class ICUNormalizerCharFilter : public DorisCharFilter {
public:
// `reader` is the upstream input; `normalizer` is the ICU normalizer to apply.
// Throws Exception(INVALID_ARGUMENT) when `normalizer` is null.
ICUNormalizerCharFilter(ReaderPtr reader, std::shared_ptr<const icu::Normalizer2> normalizer);
~ICUNormalizerCharFilter() override = default;

// Fills the normalized buffer unless the internal reader already has content.
void initialize() override;

// Re-initialises the wrapped reader with new input and rebuilds the normalized buffer.
void init(const void* _value, int32_t _length, bool copyData) override;
// read()/readCopy() forward to the reader over the normalized buffer.
int32_t read(const void** start, int32_t min, int32_t max) override;
int32_t readCopy(void* start, int32_t off, int32_t len) override;

// Size of the normalized buffer (std::string::size of `_buf`), not of the raw input.
size_t size() override { return _buf.size(); }

private:
// Reads everything from the wrapped reader, normalizes it into `_buf`, and
// points `_transformed_input` at `_buf`.
void fill();
// Writes the normalized form of `input` into `output`.
void normalize_text(const std::string& input, std::string& output);

// Normalizer applied to the input; checked non-null in the constructor.
std::shared_ptr<const icu::Normalizer2> _normalizer;
// Holds the normalized text that `_transformed_input` is initialised over.
std::string _buf;
// Reader over `_buf`; backs read() and readCopy().
lucene::util::SStringReader<char> _transformed_input;
};
using ICUNormalizerCharFilterPtr = std::shared_ptr<ICUNormalizerCharFilter>;

} // namespace doris::segment_v2::inverted_index
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <unicode/filteredbrk.h>
#include <unicode/normalizer2.h>
#include <unicode/uniset.h>
#include <unicode/unistr.h>

#include <memory>
#include <string>

#include "char_filter_factory.h"
#include "common/exception.h"
#include "icu_normalizer_char_filter.h"

namespace doris::segment_v2::inverted_index {

class ICUNormalizerCharFilterFactory : public CharFilterFactory {
public:
ICUNormalizerCharFilterFactory() = default;
~ICUNormalizerCharFilterFactory() override = default;

void initialize(const Settings& settings) override {
std::string name = settings.get_string("name", "nfkc_cf");
std::string mode = settings.get_string("mode", "compose");
std::string unicode_set_filter = settings.get_string("unicode_set_filter", "");
if (mode != "compose" && mode != "decompose") {
throw Exception(ErrorCode::INVALID_ARGUMENT,
"ICUNormalizerCharFilterFactory: mode must be 'compose' or "
"'decompose', got: " +
mode);
}

UErrorCode status = U_ZERO_ERROR;
const icu::Normalizer2* base = get_normalizer(name, mode, status);
if (U_FAILURE(status) || base == nullptr) {
throw Exception(ErrorCode::INVALID_ARGUMENT,
"Failed to get normalizer instance for '" + name + "' with mode '" +
mode + "': " + std::string(u_errorName(status)));
}

if (unicode_set_filter.empty()) {
_normalizer =
std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
return;
}

icu::UnicodeSet unicode_set(icu::UnicodeString::fromUTF8(unicode_set_filter), status);
if (U_FAILURE(status)) {
throw Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse unicode_set_filter: " +
std::string(u_errorName(status)));
}
if (unicode_set.isEmpty()) {
_normalizer =
std::shared_ptr<const icu::Normalizer2>(base, [](const icu::Normalizer2*) {});
return;
}
unicode_set.freeze();

_normalizer = std::make_shared<icu::FilteredNormalizer2>(*base, unicode_set);
}

ReaderPtr create(const ReaderPtr& in) override {
if (!_normalizer) {
throw Exception(ErrorCode::INVALID_ARGUMENT,
"ICUNormalizerCharFilterFactory not initialized. Call initialize() "
"first.");
}
return std::make_shared<ICUNormalizerCharFilter>(in, _normalizer);
}

private:
static const icu::Normalizer2* get_normalizer(const std::string& name, const std::string& mode,
UErrorCode& status) {
UNormalization2Mode icu_mode = (mode == "compose" ? UNORM2_COMPOSE : UNORM2_DECOMPOSE);
if (name == "nfc" || name == "nfkc" || name == "nfkc_cf") {
return icu::Normalizer2::getInstance(nullptr, name.c_str(), icu_mode, status);
}

if (name == "nfd") {
return icu::Normalizer2::getNFDInstance(status);
} else if (name == "nfkd") {
return icu::Normalizer2::getNFKDInstance(status);
}

status = U_ILLEGAL_ARGUMENT_ERROR;
return nullptr;
}

std::shared_ptr<const icu::Normalizer2> _normalizer;
};
using ICUNormalizerCharFilterFactoryPtr = std::shared_ptr<ICUNormalizerCharFilterFactory>;

} // namespace doris::segment_v2::inverted_index
Loading
Loading