11import re
22
3+ import json
4+ from typing import Dict , Pattern
35from nltk .corpus import stopwords
46from nltk .stem .snowball import SnowballStemmer
57from nltk .tokenize import word_tokenize
68from pymystem3 import Mystem
9+ from pathlib import Path
710
811
# Single Mystem analyzer instance, created once at module import and shared
# by the preprocessing functions below.
_MYSTEM = Mystem ()
# Banned words normalized two ways so filtering matches either pipeline:
# stemmed forms (for stem-based preprocessing) and lemmatized forms (for
# lemma-based preprocessing).
# NOTE(review): _STEMMER and _BANNED_WORDS are defined outside this excerpt
# (elided by the diff) — confirm they are a SnowballStemmer and a word set.
_STEMMED_BANNED_WORDS = {_STEMMER .stem (w ) for w in _BANNED_WORDS }
# .strip() drops whitespace tokens — presumably Mystem.lemmatize also yields
# separators; verify against pymystem3 output.
_LEMMATIZED_BANNED_WORDS = {lemma .strip () for w in _BANNED_WORDS for lemma in _MYSTEM .lemmatize (w )}

# Default location of the JSON regex-substitution rules, next to this module.
_REGEX_PATH = Path (__file__ ).parent / "regex.json"
21+
22+
1723
1824def preprocess_stem (text , filter_stopwords = True , filter_stemmed_banned_words = True ):
1925 """
@@ -63,3 +69,30 @@ def preprocess_lemma(text, filter_stopwords=False, filter_lemmatized_banned_word
6369 if filter_lemmatized_banned_words :
6470 return [w for w in lemmas if w not in _LEMMATIZED_BANNED_WORDS ]
6571 return lemmas
72+
73+
class TextPreprocessor:
    """Preprocess query text by applying a set of regex substitution rules."""

    def __init__(self, patterns, path=None):
        """Build a preprocessor from an in-memory rule mapping.

        :param patterns: dict mapping regex pattern strings to their
            replacement strings.
        :param path: origin of the rules (str or Path), stored for reference
            only; defaults to the module-level ``_REGEX_PATH`` when omitted.
        """
        # Compile every pattern once up front; matching is case-insensitive
        # and Unicode-aware, as the rules target natural-language queries.
        self.compiled_patterns = {
            re.compile(pattern, re.IGNORECASE | re.UNICODE): replacement
            for pattern, replacement in patterns.items()
        }
        # Late-bind the default so the module constant is only looked up
        # when actually needed (instead of being frozen at class-def time).
        self.path = _REGEX_PATH if path is None else path

    @classmethod
    def from_file(cls, file_path=None):
        """Load substitution rules from a JSON file and build an instance.

        :param file_path: path to a JSON object of {pattern: replacement};
            defaults to the module-level ``_REGEX_PATH``.
        :raises OSError: if the file cannot be read.
        :raises json.JSONDecodeError: if the file is not valid JSON.
        """
        if file_path is None:
            file_path = _REGEX_PATH
        with open(file_path, 'r', encoding='utf-8') as f:
            patterns = json.load(f)
        return cls(patterns, path=file_path)

    def preprocess(self, text: str) -> str:
        """Apply every substitution rule to *text* and return the result."""
        for pattern, replacement in self.compiled_patterns.items():
            text = pattern.sub(replacement, text)
        return text