
Commit ba5185e

Work Area: Add Dependency Parser - Search Settings - Search term position
1 parent 362bbf2 commit ba5185e

File tree (6 files changed: +67 −43)

- CHANGELOG.md
- tests/tests_nlp/test_nlp_utils.py
- wordless/wl_dependency_parser.py
- wordless/wl_nlp/wl_nlp_utils.py
- wordless/wl_settings/wl_settings_default.py
- wordless/wl_widgets/wl_tables.py


CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -25,6 +25,7 @@
 - Settings: Add Settings - Tables - Miscellaneous Settings
 - Utils: Add Wordless's character tokenizer
 - Work Area: Add Profiler - Export all tables
+- Work Area: Add Dependency Parser - Search Settings - Search term position
 - Work Area: Add Work Area - Table Settings - Show total
 - Work Area: Add Work Area - Sample
```

tests/tests_nlp/test_nlp_utils.py

Lines changed: 3 additions & 11 deletions

```diff
@@ -105,7 +105,7 @@ def test_wl_worker_download_model_spacy():
     ).run()

 def test_wl_worker_download_model_stanza():
-    for lang in ('zho_cn', 'zho_tw', 'other', 'ara'):
+    for lang in ('ara', 'eng_us'):
         wl_nlp_utils.Wl_Worker_Download_Model_Stanza(
             main,
             dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Download_Model(main),
@@ -130,14 +130,10 @@ def test_init_model_spacy():
     assert 'spacy_nlp_sentencizer' in main.__dict__

 def test_init_model_stanza():
-    wl_nlp_utils.init_model_stanza(main, lang = 'zho_cn', lang_util = 'sentence_tokenizer')
-    wl_nlp_utils.init_model_stanza(main, lang = 'zho_tw', lang_util = 'sentence_tokenizer')
-    wl_nlp_utils.init_model_stanza(main, lang = 'eng_gb', lang_util = 'sentence_tokenizer')
-    wl_nlp_utils.init_model_stanza(main, lang = 'eng_us', lang_util = 'sentence_tokenizer')
+    wl_nlp_utils.init_model_stanza(main, lang = 'eng_gb', lang_util = 'sentence_tokenizer', tokenized = False)
+    wl_nlp_utils.init_model_stanza(main, lang = 'eng_us', lang_util = 'sentence_tokenizer', tokenized = True)
     wl_nlp_utils.init_model_stanza(main, lang = 'other', lang_util = 'sentence_tokenizer')

-    assert 'stanza_nlp_zho_cn' in main.__dict__
-    assert 'stanza_nlp_zho_tw' in main.__dict__
     assert 'stanza_nlp_eng' in main.__dict__
     assert 'stanza_nlp_eng_gb' not in main.__dict__
     assert 'stanza_nlp_eng_us' not in main.__dict__
@@ -182,7 +178,6 @@ def test_init_word_detokenizers():
 def test_init_pos_taggers():
     wl_nlp_utils.init_pos_taggers(main, 'eng_us', 'sapcy_eng')
     wl_nlp_utils.init_pos_taggers(main, 'eng_us', 'stanza_eng')
-    wl_nlp_utils.init_pos_taggers(main, 'eng_us', 'stanza_eng', tokenized = True)

     wl_nlp_utils.init_pos_taggers(main, 'jpn', 'sudachipy_jpn')
     wl_nlp_utils.init_pos_taggers(main, 'kor', 'python_mecab_ko_mecab')
@@ -193,7 +188,6 @@ def test_init_pos_taggers():
 def test_init_lemmatizers():
     wl_nlp_utils.init_lemmatizers(main, 'eng_us', 'sapcy_eng')
     wl_nlp_utils.init_lemmatizers(main, 'eng_us', 'stanza_eng')
-    wl_nlp_utils.init_lemmatizers(main, 'eng_us', 'stanza_eng', tokenized = True)

     wl_nlp_utils.init_lemmatizers(main, 'jpn', 'sudachipy_jpn')

@@ -203,11 +197,9 @@ def test_init_lemmatizers():
 def test_init_dependency_parsers():
     wl_nlp_utils.init_dependency_parsers(main, 'eng_us', 'spacy_eng')
     wl_nlp_utils.init_dependency_parsers(main, 'eng_us', 'stanza_eng')
-    wl_nlp_utils.init_dependency_parsers(main, 'eng_us', 'stanza_eng', tokenized = True)

 def test_init_sentiment_analyzers():
     wl_nlp_utils.init_sentiment_analyzers(main, 'eng_us', 'stanza_eng')
-    wl_nlp_utils.init_sentiment_analyzers(main, 'eng_us', 'stanza_eng', tokenized = True)

 def test_align_tokens():
     assert wl_nlp_utils.align_tokens(['a', 'b'], ['a', 'b'], ['1', '2']) == ['1', '2']
```
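For context on the `tokenized` argument exercised in these tests: in Stanza itself, pretokenized input is controlled by the pipeline's `tokenize_pretokenized` option, which is also what the updated reload check in `init_model_stanza` inspects (see the wl_nlp_utils.py diff below). A minimal standalone example of that Stanza behavior, assuming the English model is already downloaded:

```python
import stanza

# With tokenize_pretokenized = True, each inner list is treated as an
# already-tokenized sentence and Stanza's own tokenizer is skipped
nlp = stanza.Pipeline('en', processors = 'tokenize,pos', tokenize_pretokenized = True)
doc = nlp([['This', 'is', 'a', 'test', '.']])

for word in doc.sentences[0].words:
    print(word.text, word.upos)
```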

wordless/wl_dependency_parser.py

Lines changed: 37 additions & 2 deletions

```diff
@@ -41,6 +41,7 @@
     wl_threading
 )
 from wordless.wl_widgets import (
+    wl_boxes,
     wl_labels,
     wl_layouts,
     wl_tables,
@@ -121,11 +122,20 @@ def __init__(self, main):
         ) = wl_widgets.wl_widgets_search_settings_tokens(self, tab = self.tab)
         self.checkbox_match_dependency_relations = QtWidgets.QCheckBox(self.tr('Match dependency relations'), self)

+        self.label_search_term_position = QtWidgets.QLabel(self.tr('Search term position:'), self)
+        self.combo_box_search_term_position = wl_boxes.Wl_Combo_Box(self)
+
         (
             self.label_context_settings,
             self.button_context_settings
         ) = wl_widgets.wl_widgets_context_settings(self, tab = self.tab)

+        self.combo_box_search_term_position.addItems([
+            self.tr('Head/dependent'),
+            self.tr('Head'),
+            self.tr('Dependent')
+        ])
+
         self.checkbox_multi_search_mode.stateChanged.connect(self.search_settings_changed)
         self.line_edit_search_term.textChanged.connect(self.search_settings_changed)
         self.line_edit_search_term.returnPressed.connect(self.table_dependency_parser.button_generate_table.click)
@@ -140,6 +150,13 @@ def __init__(self, main):
         self.checkbox_match_without_tags.stateChanged.connect(self.search_settings_changed)
         self.checkbox_match_tags.stateChanged.connect(self.search_settings_changed)
         self.checkbox_match_dependency_relations.stateChanged.connect(self.search_settings_changed)
+        self.combo_box_search_term_position.currentTextChanged.connect(self.search_settings_changed)
+
+        layout_search_term_position = wl_layouts.Wl_Layout()
+        layout_search_term_position.addWidget(self.label_search_term_position, 0, 0)
+        layout_search_term_position.addWidget(self.combo_box_search_term_position, 0, 1)
+
+        layout_search_term_position.setColumnStretch(1, 1)

         layout_context_settings = wl_layouts.Wl_Layout()
         layout_context_settings.addWidget(self.label_context_settings, 0, 0)
@@ -163,7 +180,11 @@ def __init__(self, main):

         self.group_box_search_settings.layout().addWidget(wl_layouts.Wl_Separator(self), 10, 0, 1, 2)

-        self.group_box_search_settings.layout().addLayout(layout_context_settings, 11, 0, 1, 2)
+        self.group_box_search_settings.layout().addLayout(layout_search_term_position, 11, 0, 1, 2)
+
+        self.group_box_search_settings.layout().addWidget(wl_layouts.Wl_Separator(self), 12, 0, 1, 2)
+
+        self.group_box_search_settings.layout().addLayout(layout_context_settings, 13, 0, 1, 2)

         # Table Settings
         self.group_box_table_settings = QtWidgets.QGroupBox(self.tr('Table Settings'), self)
@@ -243,6 +264,7 @@ def load_settings(self, defaults = False):
         self.checkbox_match_without_tags.setChecked(settings['search_settings']['match_without_tags'])
         self.checkbox_match_tags.setChecked(settings['search_settings']['match_tags'])
         self.checkbox_match_dependency_relations.setChecked(settings['search_settings']['match_dependency_relations'])
+        self.combo_box_search_term_position.setCurrentText(settings['search_settings']['search_term_position'])

         # Context Settings
         if defaults:
@@ -295,6 +317,7 @@ def search_settings_changed(self):
         settings['match_without_tags'] = self.checkbox_match_without_tags.isChecked()
         settings['match_tags'] = self.checkbox_match_tags.isChecked()
         settings['match_dependency_relations'] = self.checkbox_match_dependency_relations.isChecked()
+        settings['search_term_position'] = self.combo_box_search_term_position.currentText()

         # Match dependency relations
         if settings['match_dependency_relations']:
@@ -551,7 +574,19 @@ def run(self):
                 (
                     (
                         not settings['search_settings']['match_dependency_relations']
-                        and (token in search_terms or token.head in search_terms)
+                        and settings['search_settings']['search_term_position'] == self.tr('Head/dependent')
+                        and (
+                            token in search_terms
+                            or token.head in search_terms
+                        )
+                    ) or (
+                        not settings['search_settings']['match_dependency_relations']
+                        and settings['search_settings']['search_term_position'] == self.tr('Head')
+                        and token.head in search_terms
+                    ) or (
+                        not settings['search_settings']['match_dependency_relations']
+                        and settings['search_settings']['search_term_position'] == self.tr('Dependent')
+                        and token in search_terms
                     ) or (
                         settings['search_settings']['match_dependency_relations']
                         and token.dependency_relation in wl_texts.to_display_texts(search_terms)
```
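Distilled from the new condition in `run()` above, the three position options reduce to a simple branch. A minimal sketch outside the Qt/worker context (`matches_position` is a hypothetical helper; `token`, `token.head`, and `search_terms` are assumed to behave as in the diff):

```python
def matches_position(token, search_terms, position):
    # Mirrors the condition added above: when 'Match dependency relations'
    # is unchecked, a token is kept depending on where the search term is
    # allowed to occur in the head-dependent pair
    if position == 'Head/dependent':
        return token in search_terms or token.head in search_terms
    elif position == 'Head':
        return token.head in search_terms
    elif position == 'Dependent':
        return token in search_terms

    return False
```

When 'Match dependency relations' is checked, the position setting is bypassed and matching runs against `token.dependency_relation` instead, as in the final branch of the condition.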

wordless/wl_nlp/wl_nlp_utils.py

Lines changed: 21 additions & 30 deletions

```diff
@@ -86,6 +86,15 @@ def to_lang_util_texts(main, util_type, util_codes):
         for util_code in util_codes
     )

+def get_langs_stanza(main, util_type):
+    langs_stanza = set()
+
+    for lang_code, lang_utils in main.settings_global[util_type].items():
+        if any((lang_util.startswith('stanza_') for lang_util in lang_utils)):
+            langs_stanza.add(lang_code)
+
+    return langs_stanza
+
 LANGS_SPACY = {
     'cat': 'ca_core_news_trf',
     'zho': 'zh_core_web_trf',
@@ -115,14 +124,17 @@
     'other': 'en_core_web_trf'
 }

-def get_langs_stanza(main, util_type):
-    langs_stanza = set()
+LANGS_SPACY_LEMMATIZERS = (
+    'ben', 'ces', 'grc', 'hun', 'ind', 'gle', 'ltz', 'fas', 'srp', 'tgl',
+    'tur', 'urd'
+)

-    for lang_code, lang_utils in main.settings_global[util_type].items():
-        if any((lang_util.startswith('stanza_') for lang_util in lang_utils)):
-            langs_stanza.add(lang_code)
+LANGS_STANZA = {
+    'zho_cn': 'zh-hans',
+    'zho_tw': 'zh-hant',

-    return langs_stanza
+    'other': 'en'
+}

 @wl_misc.log_time
 def check_models(parent, langs, lang_utils = None):
@@ -333,15 +345,7 @@ def run(self):
         if self.lang in get_langs_stanza(self.main, util_type = 'sentiment_analyzers'):
             processors.append('sentiment')

-        match self.lang:
-            case 'zho_cn':
-                lang_stanza = 'zh-hans'
-            case 'zho_tw':
-                lang_stanza = 'zh-hant'
-            case 'other':
-                lang_stanza = 'en'
-            case _:
-                lang_stanza = wl_conversion.to_iso_639_1(self.main, self.lang, no_suffix = True)
+        lang_stanza = LANGS_STANZA.get(self.lang, wl_conversion.to_iso_639_1(self.main, self.lang, no_suffix = True))

         # Using existing resources.json if network error occurs
         try:
@@ -368,11 +372,6 @@ def run(self):
         self.progress_updated.emit(self.tr('Download completed successfully.'))
         self.finished.emit(err_msg)

-LANGS_SPACY_LEMMATIZERS = (
-    'ben', 'ces', 'grc', 'hun', 'ind', 'gle', 'ltz', 'fas', 'srp', 'tgl',
-    'tur', 'urd'
-)
-
 def init_model_spacy(main, lang, sentencizer_only = False):
     sentencizer_config = {'punct_chars': wl_sentence_tokenization.SENTENCE_TERMINATORS}

@@ -448,15 +447,7 @@ def init_model_stanza(main, lang, lang_util, tokenized = False):
         or set(processors) | {'mwt'} != set(main.__dict__[f'stanza_nlp_{lang}'].processors) | {'mwt'}
         or tokenized != main.__dict__[f'stanza_nlp_{lang}'].kwargs.get('tokenize_pretokenized', False)
     ):
-        match lang:
-            case 'zho_cn':
-                lang_stanza = 'zh-hans'
-            case 'zho_tw':
-                lang_stanza = 'zh-hant'
-            case 'other':
-                lang_stanza = 'en'
-            case _:
-                lang_stanza = wl_conversion.to_iso_639_1(main, lang, no_suffix = True)
+        lang_stanza = LANGS_STANZA.get(lang, wl_conversion.to_iso_639_1(main, lang, no_suffix = True))

         if getattr(sys, '_MEIPASS', False):
             model_dir = wl_paths.get_path_file('stanza_resources')
@@ -569,7 +560,7 @@ def init_syl_tokenizers(main, lang, syl_tokenizer):
         main.__dict__[f'pyphen_syl_tokenizer_{lang}'] = pyphen.Pyphen(lang = lang_pyphen)

 def init_word_detokenizers(main, lang):
-    if lang not in ('zho_cn', 'zho_tw', 'jpn', 'tha', 'bod'):
+    if lang not in LANGS_WITHOUT_SPACES:
         # Sacremoses
         lang_sacremoses = wl_conversion.remove_lang_code_suffixes(wl_conversion.to_iso_639_1(main, lang))
         lang = wl_conversion.remove_lang_code_suffixes(lang)
```
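The net effect of this refactor is to replace two duplicated `match` statements with a module-level table plus a `dict.get()` fallback. A standalone sketch of the pattern (the stub is a hypothetical stand-in for `wl_conversion.to_iso_639_1(main, lang, no_suffix = True)`, which is not reproduced here):

```python
# Special cases that do not follow the general ISO 639-1 conversion
LANGS_STANZA = {
    'zho_cn': 'zh-hans',
    'zho_tw': 'zh-hant',
    'other': 'en'
}

def to_iso_639_1_stub(lang):
    # Hypothetical stand-in for wl_conversion.to_iso_639_1(main, lang, no_suffix = True)
    return {'eng_us': 'en', 'eng_gb': 'en', 'ara': 'ar'}.get(lang, lang[:2])

def to_lang_stanza(lang):
    # dict.get() falls back to the general conversion for all other languages
    return LANGS_STANZA.get(lang, to_iso_639_1_stub(lang))

assert to_lang_stanza('zho_tw') == 'zh-hant'
assert to_lang_stanza('eng_us') == 'en'
```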

wordless/wl_settings/wl_settings_default.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -404,6 +404,7 @@ def init_settings_default(main):
                 'match_without_tags': False,
                 'match_tags': False,
                 'match_dependency_relations': False,
+                'search_term_position': _tr('wl_settings_default', 'Head/dependent'),

                 'context_settings': {
                     'incl': {
```

wordless/wl_widgets/wl_tables.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -686,6 +686,10 @@ def run(self):
             for i, _ in enumerate(worksheet.rows):
                 worksheet.row_dimensions[2 + i].height = self.table.verticalHeader().sectionSize(0) / dpi_vertical * 72

+            # Filter
+            if self.table.results_filter:
+                worksheet.auto_filter.ref = worksheet.dimensions
+
             self.progress_updated.emit(self.tr('Saving file...'))

             workbook.save(self.file_path)
```
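The new block enables Excel's filter dropdowns on exported tables. A minimal openpyxl sketch of the same call, with illustrative sample data:

```python
from openpyxl import Workbook

workbook = Workbook()
worksheet = workbook.active

worksheet.append(['Token', 'Frequency'])
worksheet.append(['the', 100])
worksheet.append(['of', 60])

# worksheet.dimensions is the used range (here 'A1:B3'); assigning it to
# auto_filter.ref adds filter dropdowns to the header row in Excel
worksheet.auto_filter.ref = worksheet.dimensions

workbook.save('filtered.xlsx')
```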
