Commit 41c748f

Work Area: Parallel Concordancer - Parallel Unit No. - Empty parallel units are not counted

1 parent: ba5185e

17 files changed, +203 -168 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@
 ### ✨ Improvements
 - Utils: Update Stanza's Serbian (Latin script) sentence tokenizer, part-of-speech tagger, and dependency parser
 - Utils: Update Wordless's sentence splitter and sentence segment tokenizer
+- Work Area: Parallel Concordancer - Parallel Unit No. - Empty parallel units are not counted
 
 ### 🔧 Bugfixes
 - Work Area: Fix Profiler - Syntactic Complexity

doc/doc.md

Lines changed: 2 additions & 2 deletions

@@ -461,10 +461,10 @@ In *Parallel Concordancer*, you can search for tokens in parallel corpora and ge
 You can search in *Data Table* for parts that might be of interest to you by clicking **Search in results**.
 
 - **5.1 Parallel Unit No.**<br>
-    The position of the alignment unit (paragraph) where the the search term is found.
+    The position of the alignment unit (paragraph) where the the search term is found. Empty parallel units are not counted.
 
 - **5.2 Parallel Unit No. %**<br>
-    The percentage of the position of the alignment unit (paragraph) where the the search term is found.
+    The percentage of the position of the alignment unit (paragraph) where the the search term is found. Empty parallel units are not counted.
 
 - **5.3 Parallel Units**<br>
     The parallel unit (paragraph) where the search term is found in each file.
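As the updated descriptions above note, both statistics are now computed over non-empty parallel units only. A minimal sketch of the arithmetic, using a hypothetical helper name and 1-based numbering (an illustration of the documented definition, not Wordless's actual implementation):

```python
def parallel_unit_no_pct(parallel_unit_no, num_parallel_units):
    # Parallel Unit No. is the 1-based position of the alignment unit
    # (paragraph) among the non-empty parallel units; Parallel Unit No. %
    # expresses that position relative to their total number.
    return parallel_unit_no / num_parallel_units * 100

# E.g. the 4th of 5 non-empty parallel units:
print(parallel_unit_no_pct(4, 5))  # 80.0
```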
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+The 1st alignment unit in the source text.
+The 2nd alignment unit in the source text.
+The 3rd alignment unit in the source text.
+
+Omitted source text. (without corresponding translation).

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+The 1st alignment unit in the first target text.
+The 2nd alignment unit in the first target text.
+The 3rd alignment unit in the first target text.
+
+
+Added target text (without corresponding originals).

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+The 1st alignment unit in the second target text.
+The 2nd alignment unit in the second target text.
+The 3rd alignment unit in the second target text.
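Taken together, the three newly added test files above (a source text, a first target text, and a second target text) form a deliberately misaligned parallel corpus: paragraph 4 is empty in every file, the source text's fifth paragraph has no translation, and the first target text's sixth paragraph has no original. The sketch below is only an illustration assembled from the fixture contents and the results asserted in tests/test_concordancer_parallel.py further down, not Wordless's actual alignment code; it shows how dropping parallel units that are empty in every file before numbering leaves the two unmatched paragraphs as parallel units 4 and 5:

```python
from itertools import zip_longest

src = [
    'The 1st alignment unit in the source text.',
    'The 2nd alignment unit in the source text.',
    'The 3rd alignment unit in the source text.',
    '',
    'Omitted source text. (without corresponding translation).',
]
tgt_1 = [
    'The 1st alignment unit in the first target text.',
    'The 2nd alignment unit in the first target text.',
    'The 3rd alignment unit in the first target text.',
    '',
    '',
    'Added target text (without corresponding originals).',
]
tgt_2 = [
    'The 1st alignment unit in the second target text.',
    'The 2nd alignment unit in the second target text.',
    'The 3rd alignment unit in the second target text.',
]

# Pad the shorter files with empty paragraphs, then drop parallel units that
# are empty in every file before numbering the rest.
parallel_units = [
    unit
    for unit in zip_longest(src, tgt_1, tgt_2, fillvalue = '')
    if any(unit)
]

for parallel_unit_no, unit in enumerate(parallel_units, start = 1):
    print(parallel_unit_no, unit)
# Units 1-3 are the aligned paragraphs; the "Omitted source text" paragraph
# becomes parallel unit 4 and the "Added target text" paragraph becomes
# parallel unit 5, matching the assertions in tests/test_concordancer_parallel.py.
```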

tests/test_colligation_extractor.py

Lines changed: 16 additions & 18 deletions

@@ -48,34 +48,32 @@ def test_colligation_extractor():
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
                 wl_test_init.select_test_files(main, no_files = (0,))
             # Multiple files
             case 1:
                 wl_test_init.select_test_files(main, no_files = (1, 2))
-            # Tibetan (Classical)
-            case 2:
-                # Avoid loading modern-botok's spaCy model
-                wl_test_init.select_test_files(main, no_files = (4,))
-
-                settings_table['add_missing_ending_tshegs'] = True
-            case 3:
-                wl_test_init.select_test_files(main, no_files = (4,))
-
-                settings_table['add_missing_ending_tshegs'] = False
             # Miscellaneous
             case _:
+                wl_test_init.select_test_files(main, no_files = (i + 1,))
+
                 # Excluding files without POS tagging support and tagged files without POS tags
-                if (
-                    main.settings_custom['file_area']['files_open'][i + 1]['lang'] == 'eng_us'
-                    and main.settings_custom['file_area']['files_open'][i + 1]['name'] != '[eng_us] Starting with tags'
-                ):
-                    wl_test_init.select_test_files(main, no_files = [i + 1])
-                else:
-                    continue
+                match main.settings_custom['file_area']['files_open'][i + 1]['name']:
+                    # Tibetan (Classical)
+                    case '[bod] Tibetan tshegs':
+                        # Avoid loading modern-botok's spaCy model
+                        wl_test_init.select_test_files(main, no_files = (4,))
+
+                        settings_table['add_missing_ending_tshegs'] = True
+                    case '[xct] Tibetan tshegs':
+                        settings_table['add_missing_ending_tshegs'] = False
+                    case '[eng_us] Starting with a punctuation mark':
+                        pass
+                    case _:
+                        continue
 
         settings['generation_settings']['test_statistical_significance'] = random.choice(tests_statistical_significance)
         settings['generation_settings']['measure_bayes_factor'] = random.choice(measures_bayes_factor)

tests/test_collocation_extractor.py

Lines changed: 13 additions & 11 deletions

@@ -48,27 +48,29 @@ def test_collocation_extractor():
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
                 wl_test_init.select_test_files(main, no_files = (0,))
             # Multiple files
             case 1:
                 wl_test_init.select_test_files(main, no_files = (1, 2))
-            # Tibetan
-            case 2:
-                wl_test_init.select_test_files(main, no_files = (3,))
-
-                settings_table['add_missing_ending_tshegs'] = True
-            case 3:
-                wl_test_init.select_test_files(main, no_files = (4,))
-
-                settings_table['add_missing_ending_tshegs'] = False
-            # Miscellaneous
             case _:
                 wl_test_init.select_test_files(main, no_files = (i + 1,))
 
+                match main.settings_custom['file_area']['files_open'][i + 1]['name']:
+                    # Tibetan
+                    case '[bod] Tibetan tshegs':
+                        settings_table['add_missing_ending_tshegs'] = True
+                    case '[xct] Tibetan tshegs':
+                        settings_table['add_missing_ending_tshegs'] = False
+                    # Miscellaneous
+                    case '[eng_us] Starting with a punctuation mark' | '[eng_us] Starting with tags':
+                        pass
+                    case _:
+                        continue
+
         settings['generation_settings']['test_statistical_significance'] = random.choice(tests_statistical_significance)
         settings['generation_settings']['measure_bayes_factor'] = random.choice(measures_bayes_factor)
         settings['generation_settings']['measure_effect_size'] = random.choice(measures_effect_size)
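Both extractor tests above now dispatch on the opened file's name rather than on the loop index. The standalone snippet below (hypothetical helper name; file names copied from the diffs above) illustrates the constructs they rely on: literal string patterns, `|` for alternatives, and `_` as the catch-all, all of which require Python 3.10 or later:

```python
def tsheg_setting(file_name):
    # Same dispatch style as in the updated tests: match a string against
    # literal patterns, with '|' joining alternatives and '_' as the fallback.
    match file_name:
        case '[bod] Tibetan tshegs':
            return 'add_missing_ending_tshegs = True'
        case '[xct] Tibetan tshegs':
            return 'add_missing_ending_tshegs = False'
        case '[eng_us] Starting with a punctuation mark' | '[eng_us] Starting with tags':
            return 'test with default settings'
        case _:
            return 'skip'  # the tests skip such files via "continue"

print(tsheg_setting('[bod] Tibetan tshegs'))         # add_missing_ending_tshegs = True
print(tsheg_setting('[other] No language support'))  # skip
```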

tests/test_concordancer.py

Lines changed: 9 additions & 2 deletions

@@ -33,7 +33,7 @@ def test_concordancer():
     settings['search_settings']['multi_search_mode'] = True
     settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
@@ -47,7 +47,14 @@ def test_concordancer():
                 settings['generation_settings']['calc_sentiment_scores'] = True
             # Miscellaneous
             case _:
-                wl_test_init.select_test_files(main, no_files = (i + 1,))
+                if main.settings_custom['file_area']['files_open'][i + 1]['name'] in (
+                    '[other] No language support',
+                    '[eng_us] Starting with a punctuation mark',
+                    '[eng_us] Starting with tags'
+                ):
+                    wl_test_init.select_test_files(main, no_files = (i + 1,))
+                else:
+                    continue
 
         global main_global
         main_global = main

tests/test_concordancer_parallel.py

Lines changed: 43 additions & 16 deletions

@@ -19,21 +19,30 @@
 from tests import wl_test_init
 from wordless import wl_concordancer_parallel
 from wordless.wl_dialogs import wl_dialogs_misc
+from wordless.wl_nlp import wl_texts
+
+main_global = None
 
 def test_concordancer_parallel():
     main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
 
     settings = main.settings_custom['concordancer_parallel']
 
-    settings['search_settings']['multi_search_mode'] = True
-    settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
-
     for i in range(2):
         match i:
             case 0:
+                settings['search_settings']['multi_search_mode'] = True
+                settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
+
                 wl_test_init.select_test_files(main, no_files = (0, 1, 2))
             case 1:
-                wl_test_init.select_test_files(main, no_files = (1, 2, 5))
+                settings['search_settings']['multi_search_mode'] = False
+                settings['search_settings']['search_term'] = ''
+
+                wl_test_init.select_test_files(main, no_files = (8, 9, 10))
+
+        global main_global
+        main_global = main
 
         print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
 
@@ -45,23 +54,41 @@ def test_concordancer_parallel():
         worker_concordancer_parallel.finished.connect(update_gui)
         worker_concordancer_parallel.run()
 
-def update_gui(err_msg, concordance_lines):
+def update_gui(err_msg, parallel_units, num_paras_max):
     print(err_msg)
     assert not err_msg
-    assert concordance_lines
-
-    for concordance_line in concordance_lines:
-        assert len(concordance_line) == 2
+    assert parallel_units
 
-        parallel_unit_no, len_parallel_units = concordance_line[0]
+    files_selected = list(main_global.wl_file_area.get_selected_files())
 
-        # Parallel Unit No.
-        assert parallel_unit_no >= 1
-        assert len_parallel_units >= 1
+    # Test whether empty parallel units are removed
+    if files_selected[0]['name'] == '[eng_us] Empty search term - src':
+        assert parallel_units == [
+            (4, [
+                [
+                    ['Omitted', 'source', 'text. (', 'without', 'corresponding', 'translation).'],
+                    wl_texts.to_tokens(['Omitted', 'source', 'text.', ' (', 'without', 'corresponding', 'translation', ').'])
+                ],
+                [[], []],
+                [[], []]
+            ]),
+            (5, [
+                [[], []],
+                [
+                    ['Added', 'target', 'text (', 'without', 'corresponding', 'originals).'],
+                    wl_texts.to_tokens(['Added', 'target', 'text', ' (', 'without', 'corresponding', 'originals', ').'])
+                ],
+                [[], []]
+            ])
+        ]
+    else:
+        for parallel_unit_no, parallel_units_files in parallel_units:
+            # Parallel Unit No.
+            assert 1 <= parallel_unit_no <= num_paras_max
 
-        # Parallel Units
-        for parallel_unit in concordance_line[1]:
-            assert len(parallel_unit) == 2
+            # Parallel Units
+            for parallel_unit in parallel_units_files:
+                assert len(parallel_unit) == 2
 
 if __name__ == '__main__':
     test_concordancer_parallel()
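The shape of the data passed to update_gui() can be read off the new assertions. The plain-data sketch below is inferred from those assertions rather than taken from the worker's documented API, and the value of num_paras_max is hypothetical: parallel_units is a list of (parallel unit number, per-file pairs) tuples, where each per-file pair holds the unit's tokens in two forms (plain strings and wl_texts token objects in the actual test) and empty pairs mark files with no text in that unit.

```python
# Plain-data illustration of the asserted result shape (token lists abridged).
parallel_units = [
    # (1-based number assigned after empty parallel units are dropped,
    #  one [tokens, tokens] pair per selected file)
    (4, [
        [['Omitted', 'source', '...'], ['Omitted', 'source', '...']],
        [[], []],  # no text in this unit in the first target text
        [[], []],  # no text in this unit in the second target text
    ]),
]
num_paras_max = 6  # hypothetical upper bound on the number of paragraphs

for parallel_unit_no, parallel_units_files in parallel_units:
    # Parallel Unit No. stays within the paragraph count of the longest file.
    assert 1 <= parallel_unit_no <= num_paras_max

    for parallel_unit in parallel_units_files:
        # One [tokens, tokens] pair per file.
        assert len(parallel_unit) == 2
```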

tests/test_dependency_parser.py

Lines changed: 6 additions & 3 deletions

@@ -33,7 +33,7 @@ def test_dependency_parser():
     settings['search_settings']['multi_search_mode'] = True
     settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
@@ -44,8 +44,11 @@ def test_dependency_parser():
             # Miscellaneous
             case _:
                 # Excluding files without dependency parsing support
-                if main.settings_custom['file_area']['files_open'][i + 1]['lang'] == 'eng_us':
-                    wl_test_init.select_test_files(main, no_files = [i + 1])
+                if main.settings_custom['file_area']['files_open'][i + 1]['name'] in (
+                    '[eng_us] Starting with a punctuation mark',
+                    '[eng_us] Starting with tags'
+                ):
+                    wl_test_init.select_test_files(main, no_files = (i + 1,))
                 else:
                     continue
 
