Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions python/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ impl PyTokenizer {
/// :param out: if provided, tokenization results will be written into this MorphemeList; otherwise a new one will be created.
/// See https://worksapplications.github.io/sudachi.rs/python/topics/out_param.html for details.
///
/// A Tokenizer instance cannot be used concurrently from multiple threads.
/// For parallel tokenization, share a Dictionary and create one Tokenizer
/// per worker/thread. Concurrent calls raise sudachipy.errors.SudachiError.
///
/// :type text: str
/// :type mode: SplitMode | str | None
/// :type out: MorphemeList
Expand All @@ -145,7 +149,7 @@ impl PyTokenizer {
)]
#[allow(unused_variables)]
fn tokenize<'py>(
&'py mut self,
self_: &Bound<'py, Self>,
py: Python<'py>,
text: &'py str,
mode: Option<&Bound<'py, PyAny>>,
Expand All @@ -157,8 +161,19 @@ impl PyTokenizer {
None => None,
Some(m) => Some(extract_mode(m)?),
};
let default_mode = mode.map(|m| self.tokenizer.set_mode(m));
let mut tokenizer = scopeguard::guard(&mut self.tokenizer, |t| {

let mut this = match self_.try_borrow_mut() {
Ok(this) => this,
Err(_) => {
return errors::wrap(Err(
"Tokenizer is already in use. A Tokenizer instance cannot be used concurrently; create a separate Tokenizer per thread or guard calls externally",
))
}
};

let projection = this.projection.clone();
let default_mode = mode.map(|m| this.tokenizer.set_mode(m));
let mut tokenizer = scopeguard::guard(&mut this.tokenizer, |t| {
default_mode.map(|m| t.set_mode(m));
});

Expand All @@ -175,8 +190,7 @@ impl PyTokenizer {
None => {
let dict = tokenizer.dict_clone();
let morphemes = MorphemeList::empty(dict);
let wrapper =
PyMorphemeListWrapper::from_components(morphemes, self.projection.clone());
let wrapper = PyMorphemeListWrapper::from_components(morphemes, projection);
Bound::new(py, wrapper)?
}
Some(list) => list,
Expand Down
74 changes: 74 additions & 0 deletions python/tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
# limitations under the License.

import os
import queue
import threading
import unittest

from sudachipy import Dictionary, SplitMode
from sudachipy.errors import SudachiError


class TestTokenizer(unittest.TestCase):
Expand Down Expand Up @@ -166,6 +169,77 @@ def test_tokenizer_out_param(self):
self.assertEqual(id(ms1), id(ms2))
self.assertEqual(m.surface(), 'すだち')

def test_concurrent_tokenize_on_same_tokenizer_fails(self):
    """Hammer one shared tokenizer from eight threads and check the reported error.

    Every thread waits on a common barrier and then calls ``tokenize`` on
    ``self.tokenizer_obj`` up to twenty times. The test passes when:

    * no exception other than ``SudachiError`` escaped a worker,
    * at least one ``SudachiError`` message contains
      ``'Tokenizer is already in use'``,
    * no ``SudachiError`` message contains ``'Already borrowed'``.
    """
    worker_count = 8
    payload = '東京都庁に行きました。' * 1000
    start_gate = threading.Barrier(worker_count)
    sudachi_failures = queue.Queue()
    other_failures = queue.Queue()

    def worker():
        try:
            # Line all workers up so their tokenize calls overlap.
            start_gate.wait(timeout=10)
            remaining = 20
            while remaining > 0:
                self.tokenizer_obj.tokenize(payload)
                remaining -= 1
        except SudachiError as exc:
            # Only the message text is inspected afterwards.
            sudachi_failures.put(str(exc))
        except Exception as exc:
            other_failures.put(exc)

    pool = []
    for _ in range(worker_count):
        pool.append(threading.Thread(target=worker))
    for runner in pool:
        runner.start()
    for runner in pool:
        runner.join()

    self.assertTrue(other_failures.empty(), list(other_failures.queue))

    seen = list(sudachi_failures.queue)
    in_use_reported = False
    borrow_error_reported = False
    for text in seen:
        if 'Tokenizer is already in use' in text:
            in_use_reported = True
        if 'Already borrowed' in text:
            borrow_error_reported = True

    self.assertTrue(in_use_reported, seen)
    self.assertFalse(borrow_error_reported, seen)

def test_separate_tokenizers_work_in_threads(self):
    """Run eight threads that each own a tokenizer created from ``self.dict_``.

    Each worker creates its own tokenizer, tokenizes the same sentence
    fifty times and checks that the result is non-empty. Any exception
    raised inside a worker (including a failed assertion) is collected,
    and the test passes only if nothing was collected.
    """
    failures = queue.Queue()

    def worker():
        try:
            own_tokenizer = self.dict_.create()
            completed = 0
            while completed < 50:
                result = own_tokenizer.tokenize('東京都庁に行きました。')
                self.assertGreater(result.size(), 0)
                completed += 1
        except Exception as exc:
            # Exceptions do not propagate out of threads; hand them to the main thread.
            failures.put(exc)

    pool = []
    for _ in range(8):
        pool.append(threading.Thread(target=worker))
    for runner in pool:
        runner.start()
    for runner in pool:
        runner.join()

    self.assertTrue(failures.empty(), list(failures.queue))

def test_tokenizer_is_released_after_internal_error(self):
    """A failed ``tokenize`` call must leave the tokenizer usable.

    The first call is expected to raise ``SudachiError`` with a message
    containing ``'Input is too long'``; a second call on the same
    tokenizer must then return a non-empty result.
    """
    oversized = 'あ' * 20000

    with self.assertRaises(SudachiError) as caught:
        self.tokenizer_obj.tokenize(oversized)

    message = str(caught.exception)
    self.assertIn('Input is too long', message)

    # Same tokenizer object, used again after the failure.
    follow_up = self.tokenizer_obj.tokenize('東京')
    self.assertGreater(follow_up.size(), 0)

def test_temporary_mode_is_restored_after_internal_error(self):
    """A per-call mode override must not persist when the call fails.

    The tokenizer is created with ``SplitMode.C``. A ``tokenize`` call
    that passes ``SplitMode.A`` is expected to raise ``SudachiError``;
    afterwards ``tok.mode`` must still equal ``SplitMode.C``.
    """
    configured_mode = SplitMode.C
    tokenizer = self.dict_.create(configured_mode)
    oversized = 'あ' * 20000

    with self.assertRaises(SudachiError):
        tokenizer.tokenize(oversized, SplitMode.A)

    self.assertEqual(configured_mode, tokenizer.mode)


if __name__ == '__main__':
unittest.main()
Loading