Commit 4d6aa9e

Settings:
1. Update Settings - Files - Miscellaneous Settings - Read files in chunks of characters
2. Update Settings - Sentiment Analysis - Preview
1 parent 41c748f commit 4d6aa9e

File tree: 58 files changed, +1792 −1458 lines


.circleci/config.yml

Lines changed: 6 additions & 0 deletions
@@ -52,7 +52,9 @@ jobs:
 command: |
   # Run some tests separately to reduce memory usage
   pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+  pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
   pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+  pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
   pytest tests/tests_nlp/test_dependency_parsing.py
   pytest tests/tests_nlp/test_lemmatization.py
@@ -121,7 +123,9 @@ jobs:
 command: |
   # Run some tests separately to reduce memory usage
   pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+  pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
   pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+  pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
   pytest tests/tests_nlp/test_dependency_parsing.py
   pytest tests/tests_nlp/test_lemmatization.py
@@ -194,7 +198,9 @@ jobs:
 # Use "python -m pytest" instead to fix "pytest: command not found"
 # Run some tests separately to reduce memory usage
 python3.11 -m pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+python3.11 -m pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
 python3.11 -m pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+python3.11 -m pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
 python3.11 -m pytest tests/tests_nlp/test_dependency_parsing.py
 python3.11 -m pytest tests/tests_nlp/test_lemmatization.py
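The pattern above (one pytest invocation per heavyweight test file) reduces memory usage because each invocation is a separate OS process: the spaCy and Stanza models a run loads are released when that process exits, so peak memory stays bounded. A minimal sketch of the same idea, not taken from the repository:

import subprocess

# Hypothetical driver mirroring the CI scripts above: running each test
# file in its own pytest process frees the NLP models it loaded as soon
# as that process exits, keeping peak memory bounded
test_files = (
    'tests/tests_nlp/tests_spacy/test_spacy_eng.py',
    'tests/tests_nlp/tests_spacy/test_spacy_misc.py',
    'tests/tests_nlp/tests_stanza/test_stanza_eng.py',
    'tests/tests_nlp/tests_stanza/test_stanza_misc.py'
)

for test_file in test_files:
    # check = True aborts on the first failing test file, like "set -e"
    subprocess.run(['pytest', test_file], check = True)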

.github/workflows/tests.yml

Lines changed: 6 additions & 0 deletions
@@ -51,7 +51,9 @@ jobs:
 run: |
   # Run some tests separately to reduce memory usage
   pytest --cov=./ --cov-report=xml --cov-append tests/tests_nlp/tests_spacy/test_spacy_eng.py
+  pytest --cov=./ --cov-report=xml --cov-append tests/tests_nlp/tests_spacy/test_spacy_misc.py
   pytest --cov=./ --cov-report=xml --cov-append tests/tests_nlp/tests_stanza/test_stanza_eng.py
+  pytest --cov=./ --cov-report=xml --cov-append tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
   pytest --cov=./ --cov-report=xml --cov-append tests/tests_nlp/test_dependency_parsing.py
   pytest --cov=./ --cov-report=xml --cov-append tests/tests_nlp/test_lemmatization.py
@@ -113,7 +115,9 @@ jobs:
 run: |
   # Run some tests separately to reduce memory usage
   pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+  pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
   pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+  pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
   pytest tests/tests_nlp/test_dependency_parsing.py
   pytest tests/tests_nlp/test_lemmatization.py
@@ -173,7 +177,9 @@
 
 # Run some tests separately to reduce memory usage
 pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
 pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
 pytest tests/tests_nlp/test_dependency_parsing.py
 pytest tests/tests_nlp/test_lemmatization.py
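Note that the coverage job above passes --cov-append on every invocation: pytest-cov would otherwise start a fresh coverage data file per run, and appending is what lets the separately executed test files accumulate into the single XML report that gets uploaded.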

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,8 @@
 - Work Area: Add Work Area - Sample
 
 ### ✨ Improvements
+- Settings: Update Settings - Files - Miscellaneous Settings - Read files in chunks of characters
+- Settings: Update Settings - Sentiment Analysis - Preview
 - Utils: Update Stanza's Serbian (Latin script) sentence tokenizer, part-of-speech tagger, and dependency parser
 - Utils: Update Wordless's sentence splitter and sentence segment tokenizer
 - Work Area: Parallel Concordancer - Parallel Unit No. - Empty parallel units are not counted
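The first changelog entry refers to the setting that controls how many characters are read from a file at a time. As a rough illustration of the idea only (this is not Wordless's actual implementation; the function below is a hypothetical sketch, though the chunk size of 99 reappears in this commit's tests):

def read_in_chunks(file_path, chunk_size_chars = 99):
    # Yield successive chunks of at most chunk_size_chars characters so
    # that arbitrarily large files never have to sit in memory whole
    with open(file_path, 'r', encoding = 'utf-8') as f:
        while chunk := f.read(chunk_size_chars):
            yield chunk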

appveyor.yml

Lines changed: 6 additions & 0 deletions
@@ -63,7 +63,9 @@ for:
 test_script:
   # Run some tests separately to reduce memory usage
   - pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+  - pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
   - pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+  - pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
   - pytest tests/tests_nlp/test_dependency_parsing.py
   - pytest tests/tests_nlp/test_lemmatization.py
@@ -112,7 +114,9 @@ for:
 test_script:
   # Run some tests separately to reduce memory usage
   - pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+  - pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
   - pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+  - pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
   - pytest tests/tests_nlp/test_dependency_parsing.py
   - pytest tests/tests_nlp/test_lemmatization.py
@@ -164,7 +168,9 @@ for:
 
 # Run some tests separately to reduce memory usage
 - pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+- pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
 - pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+- pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
 - pytest tests/tests_nlp/test_dependency_parsing.py
 - pytest tests/tests_nlp/test_lemmatization.py

azure-pipelines.yml

Lines changed: 6 additions & 0 deletions
@@ -52,7 +52,9 @@ jobs:
 - script: |
     :: Run some tests separately to reduce memory usage
     pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+    pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
     pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+    pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
     pytest tests/tests_nlp/test_dependency_parsing.py
     pytest tests/tests_nlp/test_lemmatization.py
@@ -113,7 +115,9 @@ jobs:
 - script: |
     # Run some tests separately to reduce memory usage
     pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+    pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
     pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+    pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
     pytest tests/tests_nlp/test_dependency_parsing.py
     pytest tests/tests_nlp/test_lemmatization.py
@@ -177,7 +181,9 @@
 
 # Run some tests separately to reduce memory usage
 pytest tests/tests_nlp/tests_spacy/test_spacy_eng.py
+pytest tests/tests_nlp/tests_spacy/test_spacy_misc.py
 pytest tests/tests_nlp/tests_stanza/test_stanza_eng.py
+pytest tests/tests_nlp/tests_stanza/test_stanza_misc.py
 
 pytest tests/tests_nlp/test_dependency_parsing.py
 pytest tests/tests_nlp/test_lemmatization.py

tests/tests_measures/test_measures_readability.py

Lines changed: 4 additions & 4 deletions
@@ -173,8 +173,8 @@ def test_colemans_readability_formula():
     assert cloze_pct_eng_0 == 'text_too_short'
     assert cloze_pct_eng_12_1 == 1.29 * (9 / 12 * 100) - 38.45
     assert cloze_pct_eng_12_2 == 1.16 * (9 / 12 * 100) + 1.48 * (3 / 12 * 100) - 37.95
-    assert cloze_pct_eng_12_3 == 1.07 * (9 / 12 * 100) + 1.18 * (3 / 12 * 100) + 0.76 * (0 / 12 * 100) - 34.02
-    assert cloze_pct_eng_12_4 == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (0 / 12 * 100) - 0.36 * (0 / 12) - 26.01
+    assert cloze_pct_eng_12_3 == 1.07 * (9 / 12 * 100) + 1.18 * (3 / 12 * 100) + 0.76 * (3 / 12 * 100) - 34.02
+    assert cloze_pct_eng_12_4 == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (3 / 12 * 100) - 0.36 * (0 / 12) - 26.01
     assert cloze_pct_tha_12 != 'no_support'
     assert cloze_pct_other_12 == 'no_support'
 
@@ -427,8 +427,8 @@ def test_lorge_readability_index():
     lorge_spa_12 = wl_measures_readability.lorge_readability_index(main, test_text_spa_12)
 
     assert lorge_eng_0 == 'text_too_short'
-    assert lorge_eng_12_corrected == 12 / 3 * 0.06 + 2 / 12 * 0.1 + 2 / 12 * 0.1 + 1.99
-    assert lorge_eng_12 == 12 / 3 * 0.07 + 2 / 12 * 13.01 + 2 / 12 * 10.73 + 1.6126
+    assert lorge_eng_12_corrected == 12 / 3 * 0.06 + 0 / 12 * 0.1 + 2 / 12 * 0.1 + 1.99
+    assert lorge_eng_12 == 12 / 3 * 0.07 + 0 / 12 * 13.01 + 2 / 12 * 10.73 + 1.6126
     assert lorge_spa_12 == 'no_support'
 
 def test_luong_nguyen_dinhs_readability_formula():
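The corrected Coleman expectations above can be cross-checked directly; this snippet is illustrative and not part of the commit, and simply evaluates the diff's own expressions, where the third variable changes from 0 / 12 to 3 / 12:

cloze_pct_eng_12_3 = 1.07 * (9 / 12 * 100) + 1.18 * (3 / 12 * 100) + 0.76 * (3 / 12 * 100) - 34.02
cloze_pct_eng_12_4 = 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (3 / 12 * 100) - 0.36 * (0 / 12) - 26.01

print(round(cloze_pct_eng_12_3, 2)) # 80.25 + 29.5 + 19.0 - 34.02 = 94.73
print(round(cloze_pct_eng_12_4, 2)) # 78.0 + 26.5 + 14.0 - 0.0 - 26.01 = 92.49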

tests/tests_nlp/test_dependency_parsing.py

Lines changed: 61 additions & 47 deletions
@@ -24,6 +24,7 @@
 )
 from wordless.wl_nlp import (
     wl_dependency_parsing,
+    wl_nlp_utils,
     wl_texts,
     wl_word_tokenization
 )
@@ -47,10 +48,14 @@ def test_dependency_parse(lang, dependency_parser):
         lang = lang
     )
 
-    wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, tokens, '')
-    wl_test_dependency_parse_fig_models(lang, dependency_parser, test_sentence, tokens)
+    wl_test_dependency_parse_models(lang, dependency_parser, tokens, '')
+    wl_test_dependency_parse_fig_models(lang, dependency_parser, tokens)
+
+def wl_test_dependency_parse_models(lang, dependency_parser, tokens, results):
+    print(f'{lang} / {dependency_parser}:')
+
+    test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}')
 
-def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, tokens, results):
     # Untokenized
     tokens_untokenized = wl_dependency_parsing.wl_dependency_parse(
         main,
@@ -63,7 +68,6 @@ def wl_test_dependency_parse_models(lang, dependency_parser, toke
         for token in tokens_untokenized
     ]
 
-    print(f'{lang} / {dependency_parser}:')
     print(f'{dependencies_untokenized}\n')
 
     # Tokenized
@@ -92,6 +96,16 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
     # Tokenization should not be modified
     assert len(tokens) == len(dependencies_tokenized)
 
+    # Newlines
+    tokens_newlines = wl_dependency_parsing.wl_dependency_parse(
+        main,
+        inputs = wl_test_lang_examples.TEXT_NEWLINES,
+        lang = lang,
+        dependency_parser = dependency_parser
+    )
+
+    assert wl_texts.to_token_texts(tokens_newlines) == wl_nlp_utils.clean_texts(wl_test_lang_examples.TEXT_NEWLINES)
+
     # Tagged
     main.settings_custom['files']['tags']['body_tag_settings'] = [['Embedded', 'Part of speech', '_*', 'N/A']]
 
@@ -108,16 +122,6 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
 
     assert dependencies_tagged == dependencies_tokenized
 
-    # Long
-    tokens_long = wl_dependency_parsing.wl_dependency_parse(
-        main,
-        inputs = wl_texts.to_tokens(wl_test_lang_examples.TOKENS_LONG, lang = lang),
-        lang = lang,
-        dependency_parser = dependency_parser
-    )
-
-    assert [str(token) for token in tokens_long] == wl_test_lang_examples.TOKENS_LONG
-
     # Parsed
     heads_orig = ['test_head']
     tokens_parsed = wl_dependency_parsing.wl_dependency_parse(
@@ -129,72 +133,82 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
 
     assert [str(token.head) for token in tokens_parsed] == heads_orig
 
-def wl_test_dependency_parse_fig_models(lang, dependency_parser, test_sentence, tokens):
+    # Long
+    if dependency_parser.startswith(('spacy_', 'stanza_')):
+        main.settings_custom['files']['misc_settings']['read_files_in_chunks_chars'] = 99
+
+        tokens_long = wl_dependency_parsing.wl_dependency_parse(
+            main,
+            inputs = '\n'.join(wl_test_lang_examples.TOKENS_LONG),
+            lang = lang,
+            dependency_parser = dependency_parser
+        )
+
+        assert wl_texts.to_token_texts(tokens_long) == wl_test_lang_examples.TOKENS_LONG
+
+        tokens_long = wl_dependency_parsing.wl_dependency_parse(
+            main,
+            inputs = wl_texts.to_tokens(wl_test_lang_examples.TOKENS_LONG, lang = lang),
+            lang = lang,
+            dependency_parser = dependency_parser
+        )
+
+        assert wl_texts.to_token_texts(tokens_long) == wl_test_lang_examples.TOKENS_LONG
+
+        main.settings_custom['files']['misc_settings']['read_files_in_chunks_chars'] = main.settings_default['files']['misc_settings']['read_files_in_chunks_chars']
+
+def wl_test_dependency_parse_fig_models(lang, dependency_parser, tokens):
     print(f'{lang} / {dependency_parser} (figure)')
 
+    test_sentence = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}')
+
     # Untokenized
     html_untokenized = wl_dependency_parsing.wl_dependency_parse_fig(
         main,
         inputs = test_sentence,
         lang = lang,
-        dependency_parser = dependency_parser,
-        show_in_separate_tabs = False
-    )
-    html_untokenized_separate_tabs = wl_dependency_parsing.wl_dependency_parse_fig(
-        main,
-        inputs = test_sentence,
-        lang = lang,
-        dependency_parser = dependency_parser,
-        show_in_separate_tabs = True
+        dependency_parser = dependency_parser
     )
 
     # Tokenized
     html_tokenized = wl_dependency_parsing.wl_dependency_parse_fig(
         main,
-        inputs = tokens,
-        lang = lang,
-        dependency_parser = dependency_parser,
-        show_in_separate_tabs = False
-    )
-    html_tokenized_separate_tabs = wl_dependency_parsing.wl_dependency_parse_fig(
-        main,
-        inputs = tokens,
+        inputs = [tokens],
         lang = lang,
-        dependency_parser = dependency_parser,
-        show_in_separate_tabs = True
+        dependency_parser = dependency_parser
     )
 
     # Check for empty HTMLs
     assert html_untokenized
-    assert html_untokenized_separate_tabs
     assert html_tokenized
-    assert html_tokenized_separate_tabs
 
 # RTL languages
 def test_dependency_parse_fig_rtl_langs():
-    html = wl_dependency_parsing.wl_dependency_parse_fig(
+    html_untokenized = wl_dependency_parsing.wl_dependency_parse_fig(
         main,
         inputs = 'test',
         lang = 'ara'
     )
-    html = wl_dependency_parsing.wl_dependency_parse_fig(
+
+    html_tokenized = wl_dependency_parsing.wl_dependency_parse_fig(
         main,
-        inputs = [wl_texts.Wl_Token('test')],
+        inputs = [[wl_texts.Wl_Token('test', lang = 'ara')]],
         lang = 'ara'
     )
 
-    assert html
+    assert html_untokenized
+    assert html_tokenized
 
-def test__get_pipelines_disabled():
-    wl_dependency_parsing._get_pipelines_disabled(show_pos_tags = True, show_lemmas = True)
-    wl_dependency_parsing._get_pipelines_disabled(show_pos_tags = True, show_lemmas = False)
-    wl_dependency_parsing._get_pipelines_disabled(show_pos_tags = False, show_lemmas = True)
-    wl_dependency_parsing._get_pipelines_disabled(show_pos_tags = False, show_lemmas = False)
+def test__get_pipelines_to_disable():
+    wl_dependency_parsing._get_pipelines_to_disable(show_pos_tags = True, show_lemmas = True)
+    wl_dependency_parsing._get_pipelines_to_disable(show_pos_tags = True, show_lemmas = False)
+    wl_dependency_parsing._get_pipelines_to_disable(show_pos_tags = False, show_lemmas = True)
+    wl_dependency_parsing._get_pipelines_to_disable(show_pos_tags = False, show_lemmas = False)
 
 def test_wl_show_dependency_graphs():
     htmls = wl_dependency_parsing.wl_dependency_parse_fig(
         main,
-        inputs = 'test',
+        inputs = wl_test_lang_examples.TEXT_NEWLINES,
        lang = 'eng_us',
         dependency_parser = 'stanza_eng'
     )
@@ -207,5 +221,5 @@ def test_wl_show_dependency_graphs():
     test_dependency_parse(lang, dependency_parser)
 
 test_dependency_parse_fig_rtl_langs()
-test__get_pipelines_disabled()
+test__get_pipelines_to_disable()
 test_wl_show_dependency_graphs()
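The "Long" block added above feeds a newline-joined token list through the parser with read_files_in_chunks_chars forced down to 99, then checks that the tokens come back unchanged. The invariant it guards can be sketched in isolation (chunk_text below is a hypothetical stand-in, not the repository's helper):

def chunk_text(text, chunk_size_chars = 99):
    # Split on line boundaries so no token is ever cut in half mid-chunk
    chunks = []
    chunk = ''

    for line in text.splitlines(keepends = True):
        if chunk and len(chunk) + len(line) > chunk_size_chars:
            chunks.append(chunk)
            chunk = ''

        chunk += line

    if chunk:
        chunks.append(chunk)

    return chunks

tokens_long = [f'token_{i}' for i in range(1000)]
# Re-tokenizing the chunks must recover the original tokens exactly
assert [token for chunk in chunk_text('\n'.join(tokens_long)) for token in chunk.split()] == tokens_long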
