Skip to content

Commit bf90bfb

Browse files
committed
add regex
1 parent bf45cd3 commit bf90bfb

3 files changed

Lines changed: 42 additions & 2 deletions

File tree

answer/routes/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from llm.llm import get_answer
3737
from search.filter import length_filter
3838
from search.nn import FilteredEnsembleRetriever, init_embedder
39-
from search.preprocess import preprocess_stem
39+
from search.preprocess import preprocess_stem, TextPreprocessor
4040
from search.search import generate_keywords_dict, get_context, get_documents_from_qdrant
4141

4242

@@ -160,6 +160,8 @@ async def init_resources():
160160
vector_store=app.state.vector_store,
161161
output_json_path="file/key_words_dict.json"
162162
)
163+
164+
app.state.text_preprocessor = TextPreprocessor.from_file()
163165

164166

165167
app_state_dict = {
@@ -189,9 +191,11 @@ async def generate_response(user_input: UserInput):
189191
ensemble_retriever = app.state.ensemble_retriever
190192
else:
191193
ensemble_retriever = app.state.filtered_ensemble_retriever
194+
195+
processed_text = app.state.text_preprocessor.preprocess(user_input.text)
192196

193197
results, combined_text = get_context(
194-
query=user_input.text,
198+
query=processed_text,
195199
key_words_dict=app.state.keywords_dict,
196200
ensemble_retriever=ensemble_retriever,
197201
vector_store=app.state.vector_store,

search/preprocess.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import re
22

3+
import json
4+
from typing import Dict, Pattern
35
from nltk.corpus import stopwords
46
from nltk.stem.snowball import SnowballStemmer
57
from nltk.tokenize import word_tokenize
68
from pymystem3 import Mystem
9+
from pathlib import Path
710

811

912
_MYSTEM = Mystem()
@@ -14,6 +17,9 @@
1417
_STEMMED_BANNED_WORDS = {_STEMMER.stem(w) for w in _BANNED_WORDS}
1518
_LEMMATIZED_BANNED_WORDS = {lemma.strip() for w in _BANNED_WORDS for lemma in _MYSTEM.lemmatize(w)}
1619

20+
_REGEX_PATH = Path(__file__).parent / "regex.json"
21+
22+
1723

1824
def preprocess_stem(text, filter_stopwords=True, filter_stemmed_banned_words=True):
1925
"""
@@ -63,3 +69,30 @@ def preprocess_lemma(text, filter_stopwords=False, filter_lemmatized_banned_word
6369
if filter_lemmatized_banned_words:
6470
return [w for w in lemmas if w not in _LEMMATIZED_BANNED_WORDS]
6571
return lemmas
72+
73+
74+
class TextPreprocessor:
    """Apply regex-based rewrite rules to query text before retrieval.

    Rules are ``pattern -> replacement`` pairs; every pattern is compiled
    once at construction with ``re.IGNORECASE | re.UNICODE`` so matching is
    case-insensitive for Cyrillic and Latin text alike.
    """

    def __init__(self, patterns: Dict[str, str], path=None):
        """
        :param patterns: mapping {regular_expression: replacement};
            insertion order defines the order substitutions are applied in.
        :param path: source file the rules came from (kept for reference
            only); defaults to the bundled ``regex.json`` next to this module.
        """
        # Compile each rule exactly once so preprocess() runs at C speed.
        self.compiled_patterns: Dict[Pattern[str], str] = {
            re.compile(pattern, re.IGNORECASE | re.UNICODE): replacement
            for pattern, replacement in patterns.items()
        }
        # Resolve the default lazily so constructing the object does not
        # depend on the module-level constant unless it is actually needed.
        self.path = _REGEX_PATH if path is None else path

    @classmethod
    def from_file(cls, file_path=None):
        """Load rules from a JSON file and build a preprocessor.

        :param file_path: path to a JSON file shaped
            ``{pattern: replacement}``; defaults to the bundled
            ``regex.json`` next to this module.
        :raises FileNotFoundError: if the rules file does not exist.
        :raises json.JSONDecodeError: if the file is not valid JSON.
        """
        if file_path is None:
            file_path = _REGEX_PATH
        with open(file_path, 'r', encoding='utf-8') as f:
            patterns = json.load(f)
        return cls(patterns, path=file_path)

    def preprocess(self, text: str) -> str:
        """Apply every substitution rule to ``text`` and return the result."""
        for pattern, replacement in self.compiled_patterns.items():
            text = pattern.sub(replacement, text)
        return text

search/regex.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"\\bакадем\\b": "академический отпуск"
3+
}

0 commit comments

Comments (0)