Commit 41c748f

Work Area: Parallel Concordancer - Parallel Unit No. - Empty parallel units are not counted

1 parent: ba5185e

17 files changed, +203 -168 lines

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@
 ### ✨ Improvements
 - Utils: Update Stanza's Serbian (Latin script) sentence tokenizer, part-of-speech tagger, and dependency parser
 - Utils: Update Wordless's sentence splitter and sentence segment tokenizer
+- Work Area: Parallel Concordancer - Parallel Unit No. - Empty parallel units are not counted
 
 ### 🔧 Bugfixes
 - Work Area: Fix Profiler - Syntactic Complexity

doc/doc.md

Lines changed: 2 additions & 2 deletions

@@ -461,10 +461,10 @@ In *Parallel Concordancer*, you can search for tokens in parallel corpora and ge
 You can search in *Data Table* for parts that might be of interest to you by clicking **Search in results**.
 
 - **5.1 Parallel Unit No.**<br>
-    The position of the alignment unit (paragraph) where the the search term is found.
+    The position of the alignment unit (paragraph) where the the search term is found. Empty parallel units are not counted.
 
 - **5.2 Parallel Unit No. %**<br>
-    The percentage of the position of the alignment unit (paragraph) where the the search term is found.
+    The percentage of the position of the alignment unit (paragraph) where the the search term is found. Empty parallel units are not counted.
 
 - **5.3 Parallel Units**<br>
     The parallel unit (paragraph) where the search term is found in each file.
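As the updated descriptions above note, both statistics are now computed over non-empty parallel units only. A minimal sketch of the arithmetic, using a hypothetical helper name and 1-based numbering (an illustration of the documented definition, not Wordless's actual implementation):

```python
def parallel_unit_no_pct(parallel_unit_no, num_parallel_units):
    # Parallel Unit No. is the 1-based position of the alignment unit
    # (paragraph) among the non-empty parallel units; Parallel Unit No. %
    # expresses that position relative to their total number.
    return parallel_unit_no / num_parallel_units * 100

# E.g. the 4th of 5 non-empty parallel units:
print(parallel_unit_no_pct(4, 5))  # 80.0
```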
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+The 1st alignment unit in the source text.
+The 2nd alignment unit in the source text.
+The 3rd alignment unit in the source text.
+
+Omitted source text. (without corresponding translation).

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+The 1st alignment unit in the first target text.
+The 2nd alignment unit in the first target text.
+The 3rd alignment unit in the first target text.
+
+
+Added target text (without corresponding originals).

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+The 1st alignment unit in the second target text.
+The 2nd alignment unit in the second target text.
+The 3rd alignment unit in the second target text.
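Taken together, the three newly added test files above (a source text, a first target text, and a second target text) form a deliberately misaligned parallel corpus: paragraph 4 is empty in every file, the source text's fifth paragraph has no translation, and the first target text's sixth paragraph has no original. The sketch below is only an illustration assembled from the fixture contents and the results asserted in tests/test_concordancer_parallel.py further down, not Wordless's actual alignment code; it shows how dropping parallel units that are empty in every file before numbering leaves the two unmatched paragraphs as parallel units 4 and 5:

```python
from itertools import zip_longest

src = [
    'The 1st alignment unit in the source text.',
    'The 2nd alignment unit in the source text.',
    'The 3rd alignment unit in the source text.',
    '',
    'Omitted source text. (without corresponding translation).',
]
tgt_1 = [
    'The 1st alignment unit in the first target text.',
    'The 2nd alignment unit in the first target text.',
    'The 3rd alignment unit in the first target text.',
    '',
    '',
    'Added target text (without corresponding originals).',
]
tgt_2 = [
    'The 1st alignment unit in the second target text.',
    'The 2nd alignment unit in the second target text.',
    'The 3rd alignment unit in the second target text.',
]

# Pad the shorter files with empty paragraphs, then drop parallel units that
# are empty in every file before numbering the rest.
parallel_units = [
    unit
    for unit in zip_longest(src, tgt_1, tgt_2, fillvalue = '')
    if any(unit)
]

for parallel_unit_no, unit in enumerate(parallel_units, start = 1):
    print(parallel_unit_no, unit)
# Units 1-3 are the aligned paragraphs; the "Omitted source text" paragraph
# becomes parallel unit 4 and the "Added target text" paragraph becomes
# parallel unit 5, matching the assertions in tests/test_concordancer_parallel.py.
```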

tests/test_colligation_extractor.py

Lines changed: 16 additions & 18 deletions

@@ -48,34 +48,32 @@ def test_colligation_extractor():
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
                 wl_test_init.select_test_files(main, no_files = (0,))
             # Multiple files
             case 1:
                 wl_test_init.select_test_files(main, no_files = (1, 2))
-            # Tibetan (Classical)
-            case 2:
-                # Avoid loading modern-botok's spaCy model
-                wl_test_init.select_test_files(main, no_files = (4,))
-
-                settings_table['add_missing_ending_tshegs'] = True
-            case 3:
-                wl_test_init.select_test_files(main, no_files = (4,))
-
-                settings_table['add_missing_ending_tshegs'] = False
             # Miscellaneous
             case _:
+                wl_test_init.select_test_files(main, no_files = (i + 1,))
+
                 # Excluding files without POS tagging support and tagged files without POS tags
-                if (
-                    main.settings_custom['file_area']['files_open'][i + 1]['lang'] == 'eng_us'
-                    and main.settings_custom['file_area']['files_open'][i + 1]['name'] != '[eng_us] Starting with tags'
-                ):
-                    wl_test_init.select_test_files(main, no_files = [i + 1])
-                else:
-                    continue
+                match main.settings_custom['file_area']['files_open'][i + 1]['name']:
+                    # Tibetan (Classical)
+                    case '[bod] Tibetan tshegs':
+                        # Avoid loading modern-botok's spaCy model
+                        wl_test_init.select_test_files(main, no_files = (4,))
+
+                        settings_table['add_missing_ending_tshegs'] = True
+                    case '[xct] Tibetan tshegs':
+                        settings_table['add_missing_ending_tshegs'] = False
+                    case '[eng_us] Starting with a punctuation mark':
+                        pass
+                    case _:
+                        continue
 
         settings['generation_settings']['test_statistical_significance'] = random.choice(tests_statistical_significance)
         settings['generation_settings']['measure_bayes_factor'] = random.choice(measures_bayes_factor)

tests/test_collocation_extractor.py

Lines changed: 13 additions & 11 deletions

@@ -48,27 +48,29 @@ def test_collocation_extractor():
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
                 wl_test_init.select_test_files(main, no_files = (0,))
             # Multiple files
             case 1:
                 wl_test_init.select_test_files(main, no_files = (1, 2))
-            # Tibetan
-            case 2:
-                wl_test_init.select_test_files(main, no_files = (3,))
-
-                settings_table['add_missing_ending_tshegs'] = True
-            case 3:
-                wl_test_init.select_test_files(main, no_files = (4,))
-
-                settings_table['add_missing_ending_tshegs'] = False
-            # Miscellaneous
             case _:
                 wl_test_init.select_test_files(main, no_files = (i + 1,))
 
+                match main.settings_custom['file_area']['files_open'][i + 1]['name']:
+                    # Tibetan
+                    case '[bod] Tibetan tshegs':
+                        settings_table['add_missing_ending_tshegs'] = True
+                    case '[xct] Tibetan tshegs':
+                        settings_table['add_missing_ending_tshegs'] = False
+                    # Miscellaneous
+                    case '[eng_us] Starting with a punctuation mark' | '[eng_us] Starting with tags':
+                        pass
+                    case _:
+                        continue
+
         settings['generation_settings']['test_statistical_significance'] = random.choice(tests_statistical_significance)
         settings['generation_settings']['measure_bayes_factor'] = random.choice(measures_bayes_factor)
         settings['generation_settings']['measure_effect_size'] = random.choice(measures_effect_size)
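Both extractor tests above now dispatch on the opened file's name rather than on the loop index. The standalone snippet below (hypothetical helper name; file names copied from the diffs above) illustrates the constructs they rely on: literal string patterns, `|` for alternatives, and `_` as the catch-all, all of which require Python 3.10 or later:

```python
def tsheg_setting(file_name):
    # Same dispatch style as in the updated tests: match a string against
    # literal patterns, with '|' joining alternatives and '_' as the fallback.
    match file_name:
        case '[bod] Tibetan tshegs':
            return 'add_missing_ending_tshegs = True'
        case '[xct] Tibetan tshegs':
            return 'add_missing_ending_tshegs = False'
        case '[eng_us] Starting with a punctuation mark' | '[eng_us] Starting with tags':
            return 'test with default settings'
        case _:
            return 'skip'  # the tests skip such files via "continue"

print(tsheg_setting('[bod] Tibetan tshegs'))         # add_missing_ending_tshegs = True
print(tsheg_setting('[other] No language support'))  # skip
```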

tests/test_concordancer.py

Lines changed: 9 additions & 2 deletions

@@ -33,7 +33,7 @@ def test_concordancer():
     settings['search_settings']['multi_search_mode'] = True
     settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
@@ -47,7 +47,14 @@ def test_concordancer():
                 settings['generation_settings']['calc_sentiment_scores'] = True
             # Miscellaneous
             case _:
-                wl_test_init.select_test_files(main, no_files = (i + 1,))
+                if main.settings_custom['file_area']['files_open'][i + 1]['name'] in (
+                    '[other] No language support',
+                    '[eng_us] Starting with a punctuation mark',
+                    '[eng_us] Starting with tags'
+                ):
+                    wl_test_init.select_test_files(main, no_files = (i + 1,))
+                else:
+                    continue
 
         global main_global
         main_global = main

tests/test_concordancer_parallel.py

Lines changed: 43 additions & 16 deletions

@@ -19,21 +19,30 @@
 from tests import wl_test_init
 from wordless import wl_concordancer_parallel
 from wordless.wl_dialogs import wl_dialogs_misc
+from wordless.wl_nlp import wl_texts
+
+main_global = None
 
 def test_concordancer_parallel():
     main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
 
     settings = main.settings_custom['concordancer_parallel']
 
-    settings['search_settings']['multi_search_mode'] = True
-    settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
-
     for i in range(2):
         match i:
             case 0:
+                settings['search_settings']['multi_search_mode'] = True
+                settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
+
                 wl_test_init.select_test_files(main, no_files = (0, 1, 2))
             case 1:
-                wl_test_init.select_test_files(main, no_files = (1, 2, 5))
+                settings['search_settings']['multi_search_mode'] = False
+                settings['search_settings']['search_term'] = ''
+
+                wl_test_init.select_test_files(main, no_files = (8, 9, 10))
+
+        global main_global
+        main_global = main
 
         print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}")
 
@@ -45,23 +54,41 @@ def test_concordancer_parallel():
         worker_concordancer_parallel.finished.connect(update_gui)
         worker_concordancer_parallel.run()
 
-def update_gui(err_msg, concordance_lines):
+def update_gui(err_msg, parallel_units, num_paras_max):
     print(err_msg)
     assert not err_msg
-    assert concordance_lines
-
-    for concordance_line in concordance_lines:
-        assert len(concordance_line) == 2
+    assert parallel_units
 
-        parallel_unit_no, len_parallel_units = concordance_line[0]
+    files_selected = list(main_global.wl_file_area.get_selected_files())
 
-        # Parallel Unit No.
-        assert parallel_unit_no >= 1
-        assert len_parallel_units >= 1
+    # Test whether empty parallel units are removed
+    if files_selected[0]['name'] == '[eng_us] Empty search term - src':
+        assert parallel_units == [
+            (4, [
+                [
+                    ['Omitted', 'source', 'text. (', 'without', 'corresponding', 'translation).'],
+                    wl_texts.to_tokens(['Omitted', 'source', 'text.', ' (', 'without', 'corresponding', 'translation', ').'])
+                ],
+                [[], []],
+                [[], []]
+            ]),
+            (5, [
+                [[], []],
+                [
+                    ['Added', 'target', 'text (', 'without', 'corresponding', 'originals).'],
+                    wl_texts.to_tokens(['Added', 'target', 'text', ' (', 'without', 'corresponding', 'originals', ').'])
+                ],
+                [[], []]
+            ])
+        ]
+    else:
+        for parallel_unit_no, parallel_units_files in parallel_units:
+            # Parallel Unit No.
+            assert 1 <= parallel_unit_no <= num_paras_max
 
-        # Parallel Units
-        for parallel_unit in concordance_line[1]:
-            assert len(parallel_unit) == 2
+            # Parallel Units
+            for parallel_unit in parallel_units_files:
+                assert len(parallel_unit) == 2
 
 if __name__ == '__main__':
     test_concordancer_parallel()
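The shape of the data passed to update_gui() can be read off the new assertions. The plain-data sketch below is inferred from those assertions rather than taken from the worker's documented API, and the value of num_paras_max is hypothetical: parallel_units is a list of (parallel unit number, per-file pairs) tuples, where each per-file pair holds the unit's tokens in two forms (plain strings and wl_texts token objects in the actual test) and empty pairs mark files with no text in that unit.

```python
# Plain-data illustration of the asserted result shape (token lists abridged).
parallel_units = [
    # (1-based number assigned after empty parallel units are dropped,
    #  one [tokens, tokens] pair per selected file)
    (4, [
        [['Omitted', 'source', '...'], ['Omitted', 'source', '...']],
        [[], []],  # no text in this unit in the first target text
        [[], []],  # no text in this unit in the second target text
    ]),
]
num_paras_max = 6  # hypothetical upper bound on the number of paragraphs

for parallel_unit_no, parallel_units_files in parallel_units:
    # Parallel Unit No. stays within the paragraph count of the longest file.
    assert 1 <= parallel_unit_no <= num_paras_max

    for parallel_unit in parallel_units_files:
        # One [tokens, tokens] pair per file.
        assert len(parallel_unit) == 2
```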

tests/test_dependency_parser.py

Lines changed: 6 additions & 3 deletions

@@ -33,7 +33,7 @@ def test_dependency_parser():
     settings['search_settings']['multi_search_mode'] = True
     settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS
 
-    for i in range(2 + wl_test_file_area.LEN_FILES_TESTS_OTHERS):
+    for i in range(2 + wl_test_file_area.NUM_FILES_OTHERS):
         match i:
             # Single file
             case 0:
@@ -44,8 +44,11 @@ def test_dependency_parser():
             # Miscellaneous
             case _:
                 # Excluding files without dependency parsing support
-                if main.settings_custom['file_area']['files_open'][i + 1]['lang'] == 'eng_us':
-                    wl_test_init.select_test_files(main, no_files = [i + 1])
+                if main.settings_custom['file_area']['files_open'][i + 1]['name'] in (
+                    '[eng_us] Starting with a punctuation mark',
+                    '[eng_us] Starting with tags'
+                ):
+                    wl_test_init.select_test_files(main, no_files = (i + 1,))
                 else:
                     continue
 
