Commit 9c42d4f

committed
Work Area: Add Keyword Extractor - Generation Settings - N-gram size
1 parent 16c196f commit 9c42d4f

17 files changed, +383 -228 lines changed

.github/workflows/pylint.yml

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ jobs:
       - uses: actions/checkout@v5

       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           architecture: 'x64'

.github/workflows/tests.yml

Lines changed: 3 additions & 3 deletions

@@ -29,7 +29,7 @@ jobs:
       - uses: actions/checkout@v5

       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           architecture: 'x64'
@@ -90,7 +90,7 @@ jobs:
       - uses: actions/checkout@v5

       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           architecture: 'x64'
@@ -144,7 +144,7 @@ jobs:
      - uses: actions/checkout@v5

      - name: Setup Python
-       uses: actions/setup-python@v5
+       uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          architecture: 'x64'

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -24,6 +24,7 @@
 - Menu: Add Edit - Sample
 - Settings: Add Settings - Tables - Miscellaneous Settings
 - Utils: Add Wordless's character tokenizer
+- Work Area: Add Keyword Extractor - Generation Settings - N-gram size
 - Work Area: Add Profiler - Export all tables
 - Work Area: Add Dependency Parser - Search Settings - Search term position
 - Work Area: Add Work Area - Table Settings - Show total

doc/doc.md

Lines changed: 1 addition & 1 deletion

@@ -678,7 +678,7 @@ You can generate line charts or word clouds for patterns of colligation using an
 > [!NOTE]
 > Renamed from **Keyword** to **Keyword Extractor** in *Wordless* 2.2.

-In *Keyword Extractor*, you can search for candidates of potential keywords (tokens that have far more or far less frequency in the observed corpus than in the reference corpus) in different files given a reference corpus, conduct different tests of statistical significance on each keyword and calculate the Bayes factor and effect size for each keyword using different measures. You can adjust the settings for the generated data via **Generation Settings**. You can disable the calculation of statistical significance and/or Bayes factor and/or effect size by setting **Generation Settings → Test of Statistical Significance / Measures of Bayes Factor / Measure of Effect Size** to **None**.
+In *Keyword Extractor*, you can search for potential keywords (tokens or n-grams that appear much more or much less frequently in the observed corpus than in the reference corpus) in different files given a reference corpus, conduct different tests of statistical significance on each keyword and calculate the Bayes factor and effect size for each keyword using different measures. You can adjust the settings for the generated data via **Generation Settings**. You can disable the calculation of statistical significance and/or Bayes factor and/or effect size by setting **Generation Settings → Test of Statistical Significance / Measures of Bayes Factor / Measure of Effect Size** to **None**.

 You can filter the results by clicking **Filter results** or search in *Data Table* for parts that might be of interest to you by clicking **Search in results**.
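To make the new **Generation Settings → N-gram size** option concrete, here is a minimal, self-contained sketch of what an n-gram size range means for keyword candidates: every contiguous token sequence whose length falls within the chosen range is counted. This is an illustration only, not Wordless's internal implementation; the function name `count_ngrams` and the sample tokens are invented for the example.

```python
from collections import Counter

def count_ngrams(tokens, ngram_size_min, ngram_size_max):
    # Count every contiguous token sequence whose length lies in
    # [ngram_size_min, ngram_size_max].
    freqs = Counter()

    for size in range(ngram_size_min, ngram_size_max + 1):
        for i in range(len(tokens) - size + 1):
            freqs[tuple(tokens[i:i + size])] += 1

    return freqs

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']

# A size range of 1-2 counts both unigrams and bigrams,
# e.g. ('the',): 2 and ('the', 'cat'): 1.
print(count_ngrams(tokens, 1, 2))
```

Given such counts for an observed corpus and a reference corpus, keyness statistics can then be computed per candidate just as they were for single tokens.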

requirements/requirements_dev.txt

Lines changed: 2 additions & 1 deletion

@@ -71,4 +71,5 @@ wordcloud
 # See: https://stackoverflow.com/a/57060441
 --index-url=https://download.pytorch.org/whl/cpu
 --extra-index-url=https://pypi.org/simple
-torch
+# To be compatible with Windows
+torch == 2.8.0

requirements/requirements_tests.txt

Lines changed: 2 additions & 1 deletion

@@ -72,4 +72,5 @@ wordcloud
 # See: https://stackoverflow.com/a/57060441
 --index-url=https://download.pytorch.org/whl/cpu
 --extra-index-url=https://pypi.org/simple
-torch
+# To be compatible with Windows
+torch == 2.8.0

tests/test_keyword_extractor.py

Lines changed: 36 additions & 0 deletions

@@ -137,5 +137,41 @@ def update_gui(err_msg, keywords_freq_files, keywords_stats_files):
         # Number of Files Found
         assert len([freq for freq in freq_files[1:-1] if freq]) >= 1

+# Check that the results of unigrams and bigrams are the exact concatenation of those of unigrams and those of bigrams
+def test_keyword_extractor_ngram_size():
+    settings = main_global.settings_custom['keyword_extractor']
+
+    wl_test_init.select_test_files(main_global, no_files = (0,))
+
+    for ngram_size_min, ngram_size_max in (
+        (1, 1),
+        (2, 2),
+        (1, 2)
+    ):
+        settings['generation_settings']['ngram_size_min'] = ngram_size_min
+        settings['generation_settings']['ngram_size_max'] = ngram_size_max
+
+        worker_keyword_extractor = wl_keyword_extractor.Wl_Worker_Keyword_Extractor_Table(
+            main_global,
+            dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(main_global),
+        )
+
+        worker_keyword_extractor.finished.connect(update_gui_ngram_size(ngram_size_min, ngram_size_max))
+        worker_keyword_extractor.run()
+
+    # pylint: disable=undefined-variable
+    assert keywords_freq_files_1_2 == keywords_freq_files_1_1 | keywords_freq_files_2_2
+    assert keywords_stats_files_1_2 == keywords_stats_files_1_1 | keywords_stats_files_2_2
+
+def update_gui_ngram_size(ngram_size_min, ngram_size_max):
+    def update_gui(err_msg, keywords_freq_files, keywords_stats_files):
+        assert not err_msg
+
+        globals()[f'keywords_freq_files_{ngram_size_min}_{ngram_size_max}'] = keywords_freq_files
+        globals()[f'keywords_stats_files_{ngram_size_min}_{ngram_size_max}'] = keywords_stats_files
+
+    return update_gui
+
 if __name__ == '__main__':
     test_keyword_extractor()
+    test_keyword_extractor_ngram_size()
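The new test relies on keyword results being keyed mappings, so a combined 1-2 run can be compared against the merge of the two single-size runs with Python's dict union operator (`|`, available since Python 3.9). Below is a tiny standalone illustration of the property being asserted, using made-up data rather than real worker output:

```python
# Hypothetical per-n-gram frequency results for the three runs
keywords_freq_1_1 = {('take',): 10}                       # unigrams only
keywords_freq_2_2 = {('take', 'off'): 3}                  # bigrams only
keywords_freq_1_2 = {('take',): 10, ('take', 'off'): 3}   # sizes 1-2

# dict | dict merges the two mappings (Python 3.9+); equality then checks that
# the combined run neither adds nor drops any entries.
assert keywords_freq_1_2 == keywords_freq_1_1 | keywords_freq_2_2
```

In this sense, the "exact concatenation" mentioned in the test's comment is checked as an exact union of the two result dictionaries.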

tests/test_ngram_generator.py

Lines changed: 52 additions & 0 deletions

@@ -24,6 +24,7 @@
 )
 from wordless import wl_ngram_generator
 from wordless.wl_dialogs import wl_dialogs_misc
+from wordless.wl_nlp import wl_texts

 main_global = None

@@ -35,6 +36,8 @@ def test_ngram_generator():

     settings['search_settings']['multi_search_mode'] = True
     settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
+    settings['generation_settings']['ngram_size_min'] = 1
+    settings['generation_settings']['ngram_size_max'] = 1

     measures_dispersion = list(main.settings_global['measures_dispersion'])
     measures_adjusted_freq = list(main.settings_global['measures_adjusted_freq'])
@@ -127,5 +130,54 @@ def update_gui(err_msg, ngrams_freq_files, ngrams_stats_files):
         # Number of Files Found
         assert len([freq for freq in freq_files[:-1] if freq]) >= 1

+# Check that the results of unigrams and bigrams are the exact concatenation of those of unigrams and those of bigrams
+def test_ngram_generator_ngram_size():
+    settings = main_global.settings_custom['ngram_generator']
+
+    settings['search_settings']['search_term'] = wl_test_init.SEARCH_TERMS[0]
+    settings['generation_settings']['measure_dispersion'] = 'juillands_d'
+    settings['generation_settings']['measure_adjusted_freq'] = 'juillands_u'
+
+    wl_test_init.select_test_files(main_global, no_files = (0,))
+
+    for ngram_size_min, ngram_size_max in (
+        (1, 1),
+        (2, 2),
+        (1, 2)
+    ):
+        settings['generation_settings']['ngram_size_min'] = ngram_size_min
+        settings['generation_settings']['ngram_size_max'] = ngram_size_max
+
+        worker_ngram_generator = wl_ngram_generator.Wl_Worker_Ngram_Generator_Table(
+            main_global,
+            dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(main_global),
+        )
+
+        worker_ngram_generator.finished.connect(update_gui_ngram_size(ngram_size_min, ngram_size_max))
+        worker_ngram_generator.run()
+
+    # pylint: disable=undefined-variable
+    assert ngrams_freq_files_1_2 == ngrams_freq_files_1_1 | ngrams_freq_files_2_2
+    assert ngrams_stats_files_1_2 == ngrams_stats_files_1_1 | ngrams_stats_files_2_2
+
+def update_gui_ngram_size(ngram_size_min, ngram_size_max):
+    def update_gui(err_msg, ngrams_freq_files, ngrams_stats_files):
+        assert not err_msg
+
+        globals()[f'ngrams_freq_files_{ngram_size_min}_{ngram_size_max}'] = ngrams_freq_files
+        globals()[f'ngrams_stats_files_{ngram_size_min}_{ngram_size_max}'] = ngrams_stats_files
+
+    return update_gui
+
+def test_get_ngrams_is():
+    tokens = wl_texts.to_tokens(('1', '2', '1'))
+
+    assert wl_ngram_generator.get_ngrams_is(
+        ngrams = [(tokens[0],), (tokens[1],), (tokens[2],)],
+        tokens = tokens
+    ) == [((tokens[0],), 0), ((tokens[1],), 1), ((tokens[2],), 2)]
+
 if __name__ == '__main__':
     test_ngram_generator()
+    test_ngram_generator_ngram_size()
+    test_get_ngrams_is()
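For reference, the expected output of `test_get_ngrams_is` suggests that `get_ngrams_is` pairs each n-gram with the index of its first token in the token stream. The sketch below is reconstructed from that assertion alone; it is not the actual code in `wl_ngram_generator`, and it uses plain strings instead of `wl_texts` token objects for simplicity.

```python
def get_ngrams_is_sketch(ngrams, tokens):
    # Pair each n-gram with the index of its first token, scanning the token
    # stream left to right (assumes the n-grams are given in corpus order).
    ngrams_is = []
    i_token = 0

    for ngram in ngrams:
        while tuple(tokens[i_token:i_token + len(ngram)]) != tuple(ngram):
            i_token += 1

        ngrams_is.append((ngram, i_token))
        i_token += 1

    return ngrams_is

# Mirrors the assertion in test_get_ngrams_is, with plain strings as tokens
assert get_ngrams_is_sketch(
    ngrams = [('1',), ('2',), ('1',)],
    tokens = ('1', '2', '1')
) == [(('1',), 0), (('2',), 1), (('1',), 2)]
```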

tests/tests_measures/test_measures_effect_size.py

Lines changed: 3 additions & 3 deletions

@@ -156,7 +156,7 @@ def test_log_ratio():
             numpy.array([0, 0, 0]),
             numpy.array([1, 1, 0])
         ),
-        numpy.array([float('-inf'), float('inf'), 0])
+        numpy.array([-math.inf, math.inf, 0])
     )

 def test_mi_log_f():
@@ -246,7 +246,7 @@ def test_odds_ratio():
             numpy.array([0, 0, 0]),
             numpy.array([1, 1, 0])
         ),
-        numpy.array([float('-inf'), float('inf'), 0])
+        numpy.array([-math.inf, math.inf, 0])
     )

 # Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18
@@ -270,7 +270,7 @@ def test_pct_diff():
             numpy.array([0, 0, 0]),
             numpy.array([1, 1, 0])
         ),
-        numpy.array([float('-inf'), float('inf'), 0])
+        numpy.array([-math.inf, math.inf, 0])
     )

 # Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. | p. 24
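The only change in these tests is swapping the `float('inf')` literals for the `math.inf` constants in the expected arrays. The two are the same IEEE 754 value, so the assertions are unaffected; a quick check:

```python
import math

# math.inf compares equal to float('inf'), so the expected arrays are unchanged.
assert math.inf == float('inf')
assert -math.inf == float('-inf')
```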

tests/wl_test_init.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@
 from wordless.wl_utils import wl_misc
 from wordless.wl_widgets import wl_tables

-# English
+# English / Tibetan
 SEARCH_TERMS = ['take', 'ལ་']

 # An instance of QApplication must be created before any instance of QWidget
