Skip to content

Commit 27f5fb1

Browse files
committed
Work Area: Add Keyword Extractor - Generation Settings - N-gram size
1 parent 16c196f commit 27f5fb1

File tree

14 files changed

+378
-225
lines changed

14 files changed

+378
-225
lines changed

.github/workflows/pylint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
- uses: actions/checkout@v5
2929

3030
- name: Setup Python
31-
uses: actions/setup-python@v5
31+
uses: actions/setup-python@v6
3232
with:
3333
python-version: '3.11'
3434
architecture: 'x64'

.github/workflows/tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
- uses: actions/checkout@v5
3030

3131
- name: Setup Python
32-
uses: actions/setup-python@v5
32+
uses: actions/setup-python@v6
3333
with:
3434
python-version: '3.11'
3535
architecture: 'x64'
@@ -90,7 +90,7 @@ jobs:
9090
- uses: actions/checkout@v5
9191

9292
- name: Setup Python
93-
uses: actions/setup-python@v5
93+
uses: actions/setup-python@v6
9494
with:
9595
python-version: '3.11'
9696
architecture: 'x64'
@@ -144,7 +144,7 @@ jobs:
144144
- uses: actions/checkout@v5
145145

146146
- name: Setup Python
147-
uses: actions/setup-python@v5
147+
uses: actions/setup-python@v6
148148
with:
149149
python-version: '3.11'
150150
architecture: 'x64'

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
- Menu: Add Edit - Sample
2525
- Settings: Add Settings - Tables - Miscellaneous Settings
2626
- Utils: Add Wordless's character tokenizer
27+
- Work Area: Add Keyword Extractor - Generation Settings - N-gram size
2728
- Work Area: Add Profiler - Export all tables
2829
- Work Area: Add Dependency Parser - Search Settings - Search term position
2930
- Work Area: Add Work Area - Table Settings - Show total

tests/test_keyword_extractor.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,5 +137,41 @@ def update_gui(err_msg, keywords_freq_files, keywords_stats_files):
137137
# Number of Files Found
138138
assert len([freq for freq in freq_files[1:-1] if freq]) >= 1
139139

140+
# Check that the results for n-gram sizes 1 to 2 are exactly the union of the unigram-only results and the bigram-only results
141+
def test_keyword_extractor_ngram_size():
142+
settings = main_global.settings_custom['keyword_extractor']
143+
144+
wl_test_init.select_test_files(main_global, no_files = (0,))
145+
146+
for ngram_size_min, ngram_size_max in (
147+
(1, 1),
148+
(2, 2),
149+
(1, 2)
150+
):
151+
settings['generation_settings']['ngram_size_min'] = ngram_size_min
152+
settings['generation_settings']['ngram_size_max'] = ngram_size_max
153+
154+
worker_keyword_extractor = wl_keyword_extractor.Wl_Worker_Keyword_Extractor_Table(
155+
main_global,
156+
dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(main_global),
157+
)
158+
159+
worker_keyword_extractor.finished.connect(update_gui_ngram_size(ngram_size_min, ngram_size_max))
160+
worker_keyword_extractor.run()
161+
162+
# pylint: disable=undefined-variable
163+
assert keywords_freq_files_1_2 == keywords_freq_files_1_1 | keywords_freq_files_2_2
164+
assert keywords_stats_files_1_2 == keywords_stats_files_1_1 | keywords_stats_files_2_2
165+
166+
def update_gui_ngram_size(ngram_size_min, ngram_size_max):
167+
def update_gui(err_msg, keywords_freq_files, keywords_stats_files):
168+
assert not err_msg
169+
170+
globals()[f'keywords_freq_files_{ngram_size_min}_{ngram_size_max}'] = keywords_freq_files
171+
globals()[f'keywords_stats_files_{ngram_size_min}_{ngram_size_max}'] = keywords_stats_files
172+
173+
return update_gui
174+
140175
if __name__ == '__main__':
141176
test_keyword_extractor()
177+
test_keyword_extractor_ngram_size()

tests/test_ngram_generator.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
)
2525
from wordless import wl_ngram_generator
2626
from wordless.wl_dialogs import wl_dialogs_misc
27+
from wordless.wl_nlp import wl_texts
2728

2829
main_global = None
2930

@@ -35,6 +36,8 @@ def test_ngram_generator():
3536

3637
settings['search_settings']['multi_search_mode'] = True
3738
settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
39+
settings['generation_settings']['ngram_size_min'] = 1
40+
settings['generation_settings']['ngram_size_max'] = 1
3841

3942
measures_dispersion = list(main.settings_global['measures_dispersion'])
4043
measures_adjusted_freq = list(main.settings_global['measures_adjusted_freq'])
@@ -127,5 +130,54 @@ def update_gui(err_msg, ngrams_freq_files, ngrams_stats_files):
127130
# Number of Files Found
128131
assert len([freq for freq in freq_files[:-1] if freq]) >= 1
129132

133+
# Check that the results for n-gram sizes 1 to 2 are exactly the union of the unigram-only results and the bigram-only results
134+
def test_ngram_generator_ngram_size():
135+
settings = main_global.settings_custom['ngram_generator']
136+
137+
settings['search_settings']['search_term'] = wl_test_init.SEARCH_TERMS[0]
138+
settings['generation_settings']['measure_dispersion'] = 'juillands_d'
139+
settings['generation_settings']['measure_adjusted_freq'] = 'juillands_u'
140+
141+
wl_test_init.select_test_files(main_global, no_files = (0,))
142+
143+
for ngram_size_min, ngram_size_max in (
144+
(1, 1),
145+
(2, 2),
146+
(1, 2)
147+
):
148+
settings['generation_settings']['ngram_size_min'] = ngram_size_min
149+
settings['generation_settings']['ngram_size_max'] = ngram_size_max
150+
151+
worker_ngram_generator = wl_ngram_generator.Wl_Worker_Ngram_Generator_Table(
152+
main_global,
153+
dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(main_global),
154+
)
155+
156+
worker_ngram_generator.finished.connect(update_gui_ngram_size(ngram_size_min, ngram_size_max))
157+
worker_ngram_generator.run()
158+
159+
# pylint: disable=undefined-variable
160+
assert ngrams_freq_files_1_2 == ngrams_freq_files_1_1 | ngrams_freq_files_2_2
161+
assert ngrams_stats_files_1_2 == ngrams_stats_files_1_1 | ngrams_stats_files_2_2
162+
163+
def update_gui_ngram_size(ngram_size_min, ngram_size_max):
164+
def update_gui(err_msg, ngrams_freq_files, ngrams_stats_files):
165+
assert not err_msg
166+
167+
globals()[f'ngrams_freq_files_{ngram_size_min}_{ngram_size_max}'] = ngrams_freq_files
168+
globals()[f'ngrams_stats_files_{ngram_size_min}_{ngram_size_max}'] = ngrams_stats_files
169+
170+
return update_gui
171+
172+
def test_get_ngrams_is():
173+
tokens = wl_texts.to_tokens(('1', '2', '1'))
174+
175+
assert wl_ngram_generator.get_ngrams_is(
176+
ngrams = [(tokens[0],), (tokens[1],), (tokens[2],)],
177+
tokens = tokens
178+
) == [((tokens[0],), 0), ((tokens[1],), 1), ((tokens[2],), 2)]
179+
130180
if __name__ == '__main__':
131181
test_ngram_generator()
182+
test_ngram_generator_ngram_size()
183+
test_get_ngrams_is()

tests/tests_measures/test_measures_effect_size.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def test_log_ratio():
156156
numpy.array([0, 0, 0]),
157157
numpy.array([1, 1, 0])
158158
),
159-
numpy.array([float('-inf'), float('inf'), 0])
159+
numpy.array([-math.inf, math.inf, 0])
160160
)
161161

162162
def test_mi_log_f():
@@ -246,7 +246,7 @@ def test_odds_ratio():
246246
numpy.array([0, 0, 0]),
247247
numpy.array([1, 1, 0])
248248
),
249-
numpy.array([float('-inf'), float('inf'), 0])
249+
numpy.array([-math.inf, math.inf, 0])
250250
)
251251

252252
# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18
@@ -270,7 +270,7 @@ def test_pct_diff():
270270
numpy.array([0, 0, 0]),
271271
numpy.array([1, 1, 0])
272272
),
273-
numpy.array([float('-inf'), float('inf'), 0])
273+
numpy.array([-math.inf, math.inf, 0])
274274
)
275275

276276
# Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. | p. 24

tests/wl_test_init.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
from wordless.wl_utils import wl_misc
4343
from wordless.wl_widgets import wl_tables
4444

45-
# English
45+
# English / Tibetan
4646
SEARCH_TERMS = ['take', 'ལ་']
4747

4848
# An instance of QApplication must be created before any instance of QWidget

wordless/wl_colligation_extractor.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,7 +1030,7 @@ def run(self):
10301030
if not self._running:
10311031
raise wl_excs.Wl_Exc_Aborted(self.main)
10321032

1033-
# Limit Searching
1033+
# Limit searching
10341034
if settings_limit_searching != _tr('Wl_Worker_Colligation_Extractor', 'None'):
10351035
if settings_limit_searching == _tr('Wl_Worker_Colligation_Extractor', 'Within sentence segments'):
10361036
offsets_unit = offsets_sentence_segs
@@ -1052,7 +1052,7 @@ def run(self):
10521052
tags_right = []
10531053

10541054
if window_left < 0 < window_right:
1055-
# Limit Searching
1055+
# Limit searching
10561056
if settings_limit_searching == _tr('Wl_Worker_Colligation_Extractor', 'None'):
10571057
tags_left = text.tags[max(0, i + window_left) : i]
10581058
tags_right = text.tags[i + ngram_size : i + ngram_size + window_right]
@@ -1095,7 +1095,7 @@ def run(self):
10951095

10961096
colligations_freqs_file_all[ngram_size][(ngram, collocate)] += 1
10971097
elif window_left < 0 and window_right < 0:
1098-
# Limit Searching
1098+
# Limit searching
10991099
if settings_limit_searching == _tr('Wl_Worker_Colligation_Extractor', 'None'):
11001100
tags_left = text.tags[max(0, i + window_left) : max(0, i + window_right + 1)]
11011101
else:
@@ -1118,7 +1118,7 @@ def run(self):
11181118

11191119
colligations_freqs_file_all[ngram_size][(ngram, collocate)] += 1
11201120
elif window_left > 0 and window_right > 0:
1121-
# Limit Searching
1121+
# Limit searching
11221122
if settings_limit_searching == _tr('Wl_Worker_Colligation_Extractor', 'None'):
11231123
tags_right = text.tags[i + ngram_size + window_left - 1 : i + ngram_size + window_right]
11241124
else:

wordless/wl_collocation_extractor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1028,7 +1028,7 @@ def run(self):
10281028
if not self._running:
10291029
raise wl_excs.Wl_Exc_Aborted(self.main)
10301030

1031-
# Limit Searching
1031+
# Limit searching
10321032
if settings_limit_searching != _tr('Wl_Worker_Collocation_Extractor', 'None'):
10331033
if settings_limit_searching == _tr('Wl_Worker_Collocation_Extractor', 'Within sentence segments'):
10341034
offsets_unit = offsets_sentence_segs
@@ -1050,7 +1050,7 @@ def run(self):
10501050
tokens_right = []
10511051

10521052
if window_left < 0 < window_right:
1053-
# Limit Searching
1053+
# Limit searching
10541054
if settings_limit_searching == _tr('Wl_Worker_Collocation_Extractor', 'None'):
10551055
tokens_left = tokens[max(0, i + window_left) : i]
10561056
tokens_right = tokens[i + ngram_size : i + ngram_size + window_right]
@@ -1093,7 +1093,7 @@ def run(self):
10931093

10941094
collocations_freqs_file_all[ngram_size][(ngram, collocate)] += 1
10951095
elif window_left < 0 and window_right < 0:
1096-
# Limit Searching
1096+
# Limit searching
10971097
if settings_limit_searching == _tr('Wl_Worker_Collocation_Extractor', 'None'):
10981098
tokens_left = tokens[max(0, i + window_left) : max(0, i + window_right + 1)]
10991099
else:

0 commit comments

Comments
 (0)