Commit 9c42d4f

committed
Work Area: Add Keyword Extractor - Generation Settings - N-gram size
1 parent 16c196f commit 9c42d4f

17 files changed, +383 -228 lines changed

.github/workflows/pylint.yml

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ jobs:
       - uses: actions/checkout@v5

       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           architecture: 'x64'

.github/workflows/tests.yml

Lines changed: 3 additions & 3 deletions

@@ -29,7 +29,7 @@ jobs:
       - uses: actions/checkout@v5

       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           architecture: 'x64'
@@ -90,7 +90,7 @@ jobs:
       - uses: actions/checkout@v5

       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: '3.11'
           architecture: 'x64'
@@ -144,7 +144,7 @@ jobs:
      - uses: actions/checkout@v5

      - name: Setup Python
-       uses: actions/setup-python@v5
+       uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          architecture: 'x64'

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -24,6 +24,7 @@
 - Menu: Add Edit - Sample
 - Settings: Add Settings - Tables - Miscellaneous Settings
 - Utils: Add Wordless's character tokenizer
+- Work Area: Add Keyword Extractor - Generation Settings - N-gram size
 - Work Area: Add Profiler - Export all tables
 - Work Area: Add Dependency Parser - Search Settings - Search term position
 - Work Area: Add Work Area - Table Settings - Show total

doc/doc.md

Lines changed: 1 addition & 1 deletion

@@ -678,7 +678,7 @@ You can generate line charts or word clouds for patterns of colligation using an
 > [!NOTE]
 > Renamed from **Keyword** to **Keyword Extractor** in *Wordless* 2.2.

-In *Keyword Extractor*, you can search for candidates of potential keywords (tokens that have far more or far less frequency in the observed corpus than in the reference corpus) in different files given a reference corpus, conduct different tests of statistical significance on each keyword and calculate the Bayes factor and effect size for each keyword using different measures. You can adjust the settings for the generated data via **Generation Settings**. You can disable the calculation of statistical significance and/or Bayes factor and/or effect size by setting **Generation Settings → Test of Statistical Significance / Measures of Bayes Factor / Measure of Effect Size** to **None**.
+In *Keyword Extractor*, you can search for potential keywords (tokens or n-grams that appear much more or much less frequently in the observed corpus than in the reference corpus) in different files given a reference corpus, conduct different tests of statistical significance on each keyword and calculate the Bayes factor and effect size for each keyword using different measures. You can adjust the settings for the generated data via **Generation Settings**. You can disable the calculation of statistical significance and/or Bayes factor and/or effect size by setting **Generation Settings → Test of Statistical Significance / Measures of Bayes Factor / Measure of Effect Size** to **None**.

 You can filter the results by clicking **Filter results** or search in *Data Table* for parts that might be of interest to you by clicking **Search in results**.
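To make the new **Generation Settings → N-gram size** option concrete, here is a minimal, self-contained sketch of what an n-gram size range means for keyword candidates: every contiguous token sequence whose length falls within the chosen range is counted. This is an illustration only, not Wordless's internal implementation; the function name `count_ngrams` and the sample tokens are invented for the example.

```python
from collections import Counter

def count_ngrams(tokens, ngram_size_min, ngram_size_max):
    # Count every contiguous token sequence whose length lies in
    # [ngram_size_min, ngram_size_max].
    freqs = Counter()

    for size in range(ngram_size_min, ngram_size_max + 1):
        for i in range(len(tokens) - size + 1):
            freqs[tuple(tokens[i:i + size])] += 1

    return freqs

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']

# A size range of 1-2 counts both unigrams and bigrams,
# e.g. ('the',): 2 and ('the', 'cat'): 1.
print(count_ngrams(tokens, 1, 2))
```

Given such counts for an observed corpus and a reference corpus, keyness statistics can then be computed per candidate just as they were for single tokens.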

requirements/requirements_dev.txt

Lines changed: 2 additions & 1 deletion

@@ -71,4 +71,5 @@ wordcloud
 # See: https://stackoverflow.com/a/57060441
 --index-url=https://download.pytorch.org/whl/cpu
 --extra-index-url=https://pypi.org/simple
-torch
+# To be compatible with Windows
+torch == 2.8.0

requirements/requirements_tests.txt

Lines changed: 2 additions & 1 deletion

@@ -72,4 +72,5 @@ wordcloud
 # See: https://stackoverflow.com/a/57060441
 --index-url=https://download.pytorch.org/whl/cpu
 --extra-index-url=https://pypi.org/simple
-torch
+# To be compatible with Windows
+torch == 2.8.0

tests/test_keyword_extractor.py

Lines changed: 36 additions & 0 deletions

@@ -137,5 +137,41 @@ def update_gui(err_msg, keywords_freq_files, keywords_stats_files):
         # Number of Files Found
         assert len([freq for freq in freq_files[1:-1] if freq]) >= 1

+# Check that the results of unigrams and bigrams are the exact concatenation of those of unigrams and those of bigrams
+def test_keyword_extractor_ngram_size():
+    settings = main_global.settings_custom['keyword_extractor']
+
+    wl_test_init.select_test_files(main_global, no_files = (0,))
+
+    for ngram_size_min, ngram_size_max in (
+        (1, 1),
+        (2, 2),
+        (1, 2)
+    ):
+        settings['generation_settings']['ngram_size_min'] = ngram_size_min
+        settings['generation_settings']['ngram_size_max'] = ngram_size_max
+
+        worker_keyword_extractor = wl_keyword_extractor.Wl_Worker_Keyword_Extractor_Table(
+            main_global,
+            dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(main_global),
+        )
+
+        worker_keyword_extractor.finished.connect(update_gui_ngram_size(ngram_size_min, ngram_size_max))
+        worker_keyword_extractor.run()
+
+    # pylint: disable=undefined-variable
+    assert keywords_freq_files_1_2 == keywords_freq_files_1_1 | keywords_freq_files_2_2
+    assert keywords_stats_files_1_2 == keywords_stats_files_1_1 | keywords_stats_files_2_2
+
+def update_gui_ngram_size(ngram_size_min, ngram_size_max):
+    def update_gui(err_msg, keywords_freq_files, keywords_stats_files):
+        assert not err_msg
+
+        globals()[f'keywords_freq_files_{ngram_size_min}_{ngram_size_max}'] = keywords_freq_files
+        globals()[f'keywords_stats_files_{ngram_size_min}_{ngram_size_max}'] = keywords_stats_files
+
+    return update_gui
+
 if __name__ == '__main__':
     test_keyword_extractor()
+    test_keyword_extractor_ngram_size()
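The new test relies on keyword results being keyed mappings, so a combined 1-2 run can be compared against the merge of the two single-size runs with Python's dict union operator (`|`, available since Python 3.9). Below is a tiny standalone illustration of the property being asserted, using made-up data rather than real worker output:

```python
# Hypothetical per-n-gram frequency results for the three runs
keywords_freq_1_1 = {('take',): 10}                       # unigrams only
keywords_freq_2_2 = {('take', 'off'): 3}                  # bigrams only
keywords_freq_1_2 = {('take',): 10, ('take', 'off'): 3}   # sizes 1-2

# dict | dict merges the two mappings (Python 3.9+); equality then checks that
# the combined run neither adds nor drops any entries.
assert keywords_freq_1_2 == keywords_freq_1_1 | keywords_freq_2_2
```

In this sense, the "exact concatenation" mentioned in the test's comment is checked as an exact union of the two result dictionaries.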

tests/test_ngram_generator.py

Lines changed: 52 additions & 0 deletions

@@ -24,6 +24,7 @@
 )
 from wordless import wl_ngram_generator
 from wordless.wl_dialogs import wl_dialogs_misc
+from wordless.wl_nlp import wl_texts

 main_global = None

@@ -35,6 +36,8 @@ def test_ngram_generator():

     settings['search_settings']['multi_search_mode'] = True
     settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
+    settings['generation_settings']['ngram_size_min'] = 1
+    settings['generation_settings']['ngram_size_max'] = 1

     measures_dispersion = list(main.settings_global['measures_dispersion'])
     measures_adjusted_freq = list(main.settings_global['measures_adjusted_freq'])
@@ -127,5 +130,54 @@ def update_gui(err_msg, ngrams_freq_files, ngrams_stats_files):
         # Number of Files Found
         assert len([freq for freq in freq_files[:-1] if freq]) >= 1

+# Check that the results of unigrams and bigrams are the exact concatenation of those of unigrams and those of bigrams
+def test_ngram_generator_ngram_size():
+    settings = main_global.settings_custom['ngram_generator']
+
+    settings['search_settings']['search_term'] = wl_test_init.SEARCH_TERMS[0]
+    settings['generation_settings']['measure_dispersion'] = 'juillands_d'
+    settings['generation_settings']['measure_adjusted_freq'] = 'juillands_u'
+
+    wl_test_init.select_test_files(main_global, no_files = (0,))
+
+    for ngram_size_min, ngram_size_max in (
+        (1, 1),
+        (2, 2),
+        (1, 2)
+    ):
+        settings['generation_settings']['ngram_size_min'] = ngram_size_min
+        settings['generation_settings']['ngram_size_max'] = ngram_size_max
+
+        worker_ngram_generator = wl_ngram_generator.Wl_Worker_Ngram_Generator_Table(
+            main_global,
+            dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(main_global),
+        )
+
+        worker_ngram_generator.finished.connect(update_gui_ngram_size(ngram_size_min, ngram_size_max))
+        worker_ngram_generator.run()
+
+    # pylint: disable=undefined-variable
+    assert ngrams_freq_files_1_2 == ngrams_freq_files_1_1 | ngrams_freq_files_2_2
+    assert ngrams_stats_files_1_2 == ngrams_stats_files_1_1 | ngrams_stats_files_2_2
+
+def update_gui_ngram_size(ngram_size_min, ngram_size_max):
+    def update_gui(err_msg, ngrams_freq_files, ngrams_stats_files):
+        assert not err_msg
+
+        globals()[f'ngrams_freq_files_{ngram_size_min}_{ngram_size_max}'] = ngrams_freq_files
+        globals()[f'ngrams_stats_files_{ngram_size_min}_{ngram_size_max}'] = ngrams_stats_files
+
+    return update_gui
+
+def test_get_ngrams_is():
+    tokens = wl_texts.to_tokens(('1', '2', '1'))
+
+    assert wl_ngram_generator.get_ngrams_is(
+        ngrams = [(tokens[0],), (tokens[1],), (tokens[2],)],
+        tokens = tokens
+    ) == [((tokens[0],), 0), ((tokens[1],), 1), ((tokens[2],), 2)]
+
 if __name__ == '__main__':
     test_ngram_generator()
+    test_ngram_generator_ngram_size()
+    test_get_ngrams_is()
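For reference, the expected output of `test_get_ngrams_is` suggests that `get_ngrams_is` pairs each n-gram with the index of its first token in the token stream. The sketch below is reconstructed from that assertion alone; it is not the actual code in `wl_ngram_generator`, and it uses plain strings instead of `wl_texts` token objects for simplicity.

```python
def get_ngrams_is_sketch(ngrams, tokens):
    # Pair each n-gram with the index of its first token, scanning the token
    # stream left to right (assumes the n-grams are given in corpus order).
    ngrams_is = []
    i_token = 0

    for ngram in ngrams:
        while tuple(tokens[i_token:i_token + len(ngram)]) != tuple(ngram):
            i_token += 1

        ngrams_is.append((ngram, i_token))
        i_token += 1

    return ngrams_is

# Mirrors the assertion in test_get_ngrams_is, with plain strings as tokens
assert get_ngrams_is_sketch(
    ngrams = [('1',), ('2',), ('1',)],
    tokens = ('1', '2', '1')
) == [(('1',), 0), (('2',), 1), (('1',), 2)]
```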

tests/tests_measures/test_measures_effect_size.py

Lines changed: 3 additions & 3 deletions

@@ -156,7 +156,7 @@ def test_log_ratio():
             numpy.array([0, 0, 0]),
             numpy.array([1, 1, 0])
         ),
-        numpy.array([float('-inf'), float('inf'), 0])
+        numpy.array([-math.inf, math.inf, 0])
     )

 def test_mi_log_f():
@@ -246,7 +246,7 @@ def test_odds_ratio():
             numpy.array([0, 0, 0]),
             numpy.array([1, 1, 0])
         ),
-        numpy.array([float('-inf'), float('inf'), 0])
+        numpy.array([-math.inf, math.inf, 0])
     )

 # Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18
@@ -270,7 +270,7 @@ def test_pct_diff():
             numpy.array([0, 0, 0]),
             numpy.array([1, 1, 0])
         ),
-        numpy.array([float('-inf'), float('inf'), 0])
+        numpy.array([-math.inf, math.inf, 0])
     )

 # Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. | p. 24
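The only change in these tests is swapping the `float('inf')` literals for the `math.inf` constants in the expected arrays. The two are the same IEEE 754 value, so the assertions are unaffected; a quick check:

```python
import math

# math.inf compares equal to float('inf'), so the expected arrays are unchanged.
assert math.inf == float('inf')
assert -math.inf == float('-inf')
```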

tests/wl_test_init.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@
 from wordless.wl_utils import wl_misc
 from wordless.wl_widgets import wl_tables

-# English
+# English / Tibetan
 SEARCH_TERMS = ['take', 'ལ་']

 # An instance of QApplication must be created before any instance of QWidget
