diff --git a/python/src/tokenizer.rs b/python/src/tokenizer.rs
index 525c674a..2d6ebb51 100644
--- a/python/src/tokenizer.rs
+++ b/python/src/tokenizer.rs
@@ -136,6 +136,10 @@ impl PyTokenizer {
     /// :param out: tokenization results will be written into this MorphemeList, a new one will be created instead.
     ///    See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
     ///
+    /// A Tokenizer instance cannot be used concurrently from multiple threads.
+    /// For parallel tokenization, share a Dictionary and create one Tokenizer
+    /// per worker/thread. Concurrent calls raise sudachipy.errors.SudachiError.
+    ///
     /// :type text: str
     /// :type mode: SplitMode | str | None
     /// :type out: MorphemeList
@@ -145,7 +149,7 @@ impl PyTokenizer {
     )]
     #[allow(unused_variables)]
     fn tokenize<'py>(
-        &'py mut self,
+        self_: &Bound<'py, Self>,
         py: Python<'py>,
         text: &'py str,
         mode: Option<&Bound<'py, PyAny>>,
@@ -157,8 +161,19 @@ impl PyTokenizer {
             None => None,
             Some(m) => Some(extract_mode(m)?),
         };
-        let default_mode = mode.map(|m| self.tokenizer.set_mode(m));
-        let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| {
+
+        let mut this = match self_.try_borrow_mut() {
+            Ok(this) => this,
+            Err(_) => {
+                return errors::wrap(Err(
+                    "Tokenizer is already in use. A Tokenizer instance cannot be used concurrently; create a separate Tokenizer per thread or guard calls externally",
+                ))
+            }
+        };
+
+        let projection = this.projection.clone();
+        let default_mode = mode.map(|m| this.tokenizer.set_mode(m));
+        let mut tokenizer = scopeguard::guard(&mut this.tokenizer, |t| {
             default_mode.map(|m| t.set_mode(m));
         });
 
@@ -175,8 +190,7 @@ impl PyTokenizer {
             None => {
                 let dict = tokenizer.dict_clone();
                 let morphemes = MorphemeList::empty(dict);
-                let wrapper =
-                    PyMorphemeListWrapper::from_components(morphemes, self.projection.clone());
+                let wrapper = PyMorphemeListWrapper::from_components(morphemes, projection);
                 Bound::new(py, wrapper)?
             }
             Some(list) => list,
diff --git a/python/tests/test_tokenizer.py b/python/tests/test_tokenizer.py
index 623cb78c..93314550 100644
--- a/python/tests/test_tokenizer.py
+++ b/python/tests/test_tokenizer.py
@@ -13,9 +13,12 @@
 # limitations under the License.
 
 import os
+import queue
+import threading
 import unittest
 
 from sudachipy import Dictionary, SplitMode
+from sudachipy.errors import SudachiError
 
 
 class TestTokenizer(unittest.TestCase):
@@ -166,6 +169,98 @@ def test_tokenizer_out_param(self):
         self.assertEqual(id(ms1), id(ms2))
         self.assertEqual(m.surface(), 'すだち')
 
+    def _run_workers(self, worker, count=8, timeout=60):
+        # Daemon threads plus a bounded join: a stuck tokenizer call fails the
+        # test instead of hanging the whole test run.
+        threads = [
+            threading.Thread(target=worker, daemon=True) for _ in range(count)
+        ]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join(timeout=timeout)
+        self.assertFalse(
+            any(thread.is_alive() for thread in threads),
+            'worker threads did not finish in time',
+        )
+
+    @staticmethod
+    def _drain(results):
+        # Empty a queue.Queue through its public API and return the items.
+        items = []
+        while True:
+            try:
+                items.append(results.get_nowait())
+            except queue.Empty:
+                return items
+
+    def test_concurrent_tokenize_on_same_tokenizer_fails(self):
+        workers = 8
+        text = '東京都庁に行きました。' * 1000
+        ready = threading.Barrier(workers)
+        errors = queue.Queue()
+        unexpected = queue.Queue()
+
+        def worker():
+            try:
+                ready.wait(timeout=10)
+                for _ in range(20):
+                    try:
+                        self.tokenizer_obj.tokenize(text)
+                    except SudachiError as err:
+                        # Keep contending after a rejected call so the overlap
+                        # does not depend on a single early collision.
+                        errors.put(str(err))
+            except Exception as err:
+                unexpected.put(err)
+
+        self._run_workers(worker, count=workers)
+
+        self.assertEqual([], self._drain(unexpected))
+
+        messages = self._drain(errors)
+        self.assertTrue(
+            any('Tokenizer is already in use' in message for message in messages),
+            messages,
+        )
+        self.assertFalse(
+            any('Already borrowed' in message for message in messages),
+            messages,
+        )
+
+    def test_separate_tokenizers_work_in_threads(self):
+        errors = queue.Queue()
+
+        def worker():
+            try:
+                tok = self.dict_.create()
+                for _ in range(50):
+                    morphemes = tok.tokenize('東京都庁に行きました。')
+                    self.assertGreater(morphemes.size(), 0)
+            except Exception as err:
+                errors.put(err)
+
+        self._run_workers(worker)
+
+        self.assertEqual([], self._drain(errors))
+
+    def test_tokenizer_is_released_after_internal_error(self):
+        with self.assertRaises(SudachiError) as cm:
+            self.tokenizer_obj.tokenize('あ' * 20000)
+
+        self.assertIn('Input is too long', str(cm.exception))
+
+        morphemes = self.tokenizer_obj.tokenize('東京')
+        self.assertGreater(morphemes.size(), 0)
+
+    def test_temporary_mode_is_restored_after_internal_error(self):
+        tok = self.dict_.create(SplitMode.C)
+
+        with self.assertRaises(SudachiError):
+            tok.tokenize('あ' * 20000, SplitMode.A)
+
+        self.assertEqual(SplitMode.C, tok.mode)
+
 
 if __name__ == '__main__':
     unittest.main()