2424)
2525from wordless .wl_nlp import (
2626 wl_dependency_parsing ,
27+ wl_nlp_utils ,
2728 wl_texts ,
2829 wl_word_tokenization
2930)
@@ -47,10 +48,14 @@ def test_dependency_parse(lang, dependency_parser):
4748 lang = lang
4849 )
4950
50- wl_test_dependency_parse_models (lang , dependency_parser , test_sentence , tokens , '' )
51- wl_test_dependency_parse_fig_models (lang , dependency_parser , test_sentence , tokens )
51+ wl_test_dependency_parse_models (lang , dependency_parser , tokens , '' )
52+ wl_test_dependency_parse_fig_models (lang , dependency_parser , tokens )
53+
54+ def wl_test_dependency_parse_models (lang , dependency_parser , tokens , results ):
55+ print (f'{ lang } / { dependency_parser } :' )
56+
57+ test_sentence = getattr (wl_test_lang_examples , f'SENTENCE_{ lang .upper ()} ' )
5258
53- def wl_test_dependency_parse_models (lang , dependency_parser , test_sentence , tokens , results ):
5459 # Untokenized
5560 tokens_untokenized = wl_dependency_parsing .wl_dependency_parse (
5661 main ,
@@ -63,7 +68,6 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
6368 for token in tokens_untokenized
6469 ]
6570
66- print (f'{ lang } / { dependency_parser } :' )
6771 print (f'{ dependencies_untokenized } \n ' )
6872
6973 # Tokenized
@@ -92,6 +96,16 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
9296 # Tokenization should not be modified
9397 assert len (tokens ) == len (dependencies_tokenized )
9498
99+ # Newlines
100+ tokens_newlines = wl_dependency_parsing .wl_dependency_parse (
101+ main ,
102+ inputs = wl_test_lang_examples .TEXT_NEWLINES ,
103+ lang = lang ,
104+ dependency_parser = dependency_parser
105+ )
106+
107+ assert wl_texts .to_token_texts (tokens_newlines ) == wl_nlp_utils .clean_texts (wl_test_lang_examples .TEXT_NEWLINES )
108+
95109 # Tagged
96110 main .settings_custom ['files' ]['tags' ]['body_tag_settings' ] = [['Embedded' , 'Part of speech' , '_*' , 'N/A' ]]
97111
@@ -108,16 +122,6 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
108122
109123 assert dependencies_tagged == dependencies_tokenized
110124
111- # Long
112- tokens_long = wl_dependency_parsing .wl_dependency_parse (
113- main ,
114- inputs = wl_texts .to_tokens (wl_test_lang_examples .TOKENS_LONG , lang = lang ),
115- lang = lang ,
116- dependency_parser = dependency_parser
117- )
118-
119- assert [str (token ) for token in tokens_long ] == wl_test_lang_examples .TOKENS_LONG
120-
121125 # Parsed
122126 heads_orig = ['test_head' ]
123127 tokens_parsed = wl_dependency_parsing .wl_dependency_parse (
@@ -129,72 +133,82 @@ def wl_test_dependency_parse_models(lang, dependency_parser, test_sentence, toke
129133
130134 assert [str (token .head ) for token in tokens_parsed ] == heads_orig
131135
132- def wl_test_dependency_parse_fig_models (lang , dependency_parser , test_sentence , tokens ):
136+ # Long
137+ if dependency_parser .startswith (('spacy_' , 'stanza_' )):
138+ main .settings_custom ['files' ]['misc_settings' ]['read_files_in_chunks_chars' ] = 99
139+
140+ tokens_long = wl_dependency_parsing .wl_dependency_parse (
141+ main ,
142+ inputs = '\n ' .join (wl_test_lang_examples .TOKENS_LONG ),
143+ lang = lang ,
144+ dependency_parser = dependency_parser
145+ )
146+
147+ assert wl_texts .to_token_texts (tokens_long ) == wl_test_lang_examples .TOKENS_LONG
148+
149+ tokens_long = wl_dependency_parsing .wl_dependency_parse (
150+ main ,
151+ inputs = wl_texts .to_tokens (wl_test_lang_examples .TOKENS_LONG , lang = lang ),
152+ lang = lang ,
153+ dependency_parser = dependency_parser
154+ )
155+
156+ assert wl_texts .to_token_texts (tokens_long ) == wl_test_lang_examples .TOKENS_LONG
157+
158+ main .settings_custom ['files' ]['misc_settings' ]['read_files_in_chunks_chars' ] = main .settings_default ['files' ]['misc_settings' ]['read_files_in_chunks_chars' ]
159+
160+ def wl_test_dependency_parse_fig_models (lang , dependency_parser , tokens ):
133161 print (f'{ lang } / { dependency_parser } (figure)' )
134162
163+ test_sentence = getattr (wl_test_lang_examples , f'SENTENCE_{ lang .upper ()} ' )
164+
135165 # Untokenized
136166 html_untokenized = wl_dependency_parsing .wl_dependency_parse_fig (
137167 main ,
138168 inputs = test_sentence ,
139169 lang = lang ,
140- dependency_parser = dependency_parser ,
141- show_in_separate_tabs = False
142- )
143- html_untokenized_separate_tabs = wl_dependency_parsing .wl_dependency_parse_fig (
144- main ,
145- inputs = test_sentence ,
146- lang = lang ,
147- dependency_parser = dependency_parser ,
148- show_in_separate_tabs = True
170+ dependency_parser = dependency_parser
149171 )
150172
151173 # Tokenized
152174 html_tokenized = wl_dependency_parsing .wl_dependency_parse_fig (
153175 main ,
154- inputs = tokens ,
155- lang = lang ,
156- dependency_parser = dependency_parser ,
157- show_in_separate_tabs = False
158- )
159- html_tokenized_separate_tabs = wl_dependency_parsing .wl_dependency_parse_fig (
160- main ,
161- inputs = tokens ,
176+ inputs = [tokens ],
162177 lang = lang ,
163- dependency_parser = dependency_parser ,
164- show_in_separate_tabs = True
178+ dependency_parser = dependency_parser
165179 )
166180
167181 # Check for empty HTMLs
168182 assert html_untokenized
169- assert html_untokenized_separate_tabs
170183 assert html_tokenized
171- assert html_tokenized_separate_tabs
172184
173185# RTL languages
174186def test_dependency_parse_fig_rtl_langs ():
175- html = wl_dependency_parsing .wl_dependency_parse_fig (
187+ html_untokenized = wl_dependency_parsing .wl_dependency_parse_fig (
176188 main ,
177189 inputs = 'test' ,
178190 lang = 'ara'
179191 )
180- html = wl_dependency_parsing .wl_dependency_parse_fig (
192+
193+ html_tokenized = wl_dependency_parsing .wl_dependency_parse_fig (
181194 main ,
182- inputs = [wl_texts .Wl_Token ('test' ) ],
195+ inputs = [[ wl_texts .Wl_Token ('test' , lang = 'ara' )] ],
183196 lang = 'ara'
184197 )
185198
186- assert html
199+ assert html_untokenized
200+ assert html_tokenized
187201
188- def test__get_pipelines_disabled ():
189- wl_dependency_parsing ._get_pipelines_disabled (show_pos_tags = True , show_lemmas = True )
190- wl_dependency_parsing ._get_pipelines_disabled (show_pos_tags = True , show_lemmas = False )
191- wl_dependency_parsing ._get_pipelines_disabled (show_pos_tags = False , show_lemmas = True )
192- wl_dependency_parsing ._get_pipelines_disabled (show_pos_tags = False , show_lemmas = False )
202+ def test__get_pipelines_to_disable ():
203+ wl_dependency_parsing ._get_pipelines_to_disable (show_pos_tags = True , show_lemmas = True )
204+ wl_dependency_parsing ._get_pipelines_to_disable (show_pos_tags = True , show_lemmas = False )
205+ wl_dependency_parsing ._get_pipelines_to_disable (show_pos_tags = False , show_lemmas = True )
206+ wl_dependency_parsing ._get_pipelines_to_disable (show_pos_tags = False , show_lemmas = False )
193207
194208def test_wl_show_dependency_graphs ():
195209 htmls = wl_dependency_parsing .wl_dependency_parse_fig (
196210 main ,
197- inputs = 'test' ,
211+ inputs = wl_test_lang_examples . TEXT_NEWLINES ,
198212 lang = 'eng_us' ,
199213 dependency_parser = 'stanza_eng'
200214 )
@@ -207,5 +221,5 @@ def test_wl_show_dependency_graphs():
207221 test_dependency_parse (lang , dependency_parser )
208222
209223 test_dependency_parse_fig_rtl_langs ()
210- test__get_pipelines_disabled ()
224+ test__get_pipelines_to_disable ()
211225 test_wl_show_dependency_graphs ()
0 commit comments