Skip to content

Commit 8df4002

Browse files
committed
Refactor translator to use a more accurate language model
1 parent eba8460 commit 8df4002

File tree

6 files changed

+35
-19
lines changed

6 files changed

+35
-19
lines changed
Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,15 @@
11
1
22
00:00:00,000 --> 00:00:06,360
3-
When we got out of the long tunnel of the border, it was an eyebled.
3+
It was a snowflake when we came out of the long tunnel.
44

55
2
66
00:00:07,800 --> 00:00:10,740
7-
The bottom of the night has been changed.
7+
The bottom of the night has been hallucinating.
88

99
3
1010
00:00:12,320 --> 00:00:15,360
11-
The train stopped at the signal station.
11+
The train stopped in front of the signal station.
1212

1313
4
1414
00:00:16,980 --> 00:00:23,540
15-
On the other side, a virgin approached and opened a window in front of the box.
15+
A virgin came from the other side of the seat and opened the window in front of the movie theater.
Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,19 @@
11
WEBVTT
22
33
00:00:00.000 --> 00:00:04.880
4-
It's okay, son.
4+
I'm all right, son.
55

66
00:00:04.880 --> 00:00:09.240
7-
There's no fighting.
7+
I'm not going to fight.
88

99
00:00:09.240 --> 00:00:16.440
10-
No, I don't think you should be a brother.
10+
No, I should call him brother.
1111

1212
00:00:16.440 --> 00:00:18.060
1313
What?
1414

1515
00:00:18.060 --> 00:00:21.280
16-
It's over.
16+
It's over now.
1717

1818
00:00:21.280 --> 00:00:25.140
19-
You can throw away your gun and live.
19+
You can throw your guns away and live.
-255 KB
Binary file not shown.
-75.3 KB
Binary file not shown.

src/koffee/translator.py

Lines changed: 24 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
import logging
44

5-
from transformers import MarianMTModel, MarianTokenizer
5+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
66

77
logging.getLogger("transformers").setLevel(logging.ERROR)
88
log = logging.getLogger(__name__)
@@ -28,13 +28,29 @@ def translate_transcript(transcript: dict, target_language: str) -> list:
2828

2929
def translate_text(text: str, source_language: str, target_language: str) -> str:
3030
"""Translates source language to target language."""
31-
model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
32-
model = MarianMTModel.from_pretrained(model_name)
33-
34-
tokenizer = MarianTokenizer.from_pretrained(model_name)
35-
tokenized_text = tokenizer([text], return_tensors="pt")
36-
37-
translation = model.generate(**tokenized_text)
31+
model_name = "facebook/nllb-200-distilled-600M"
32+
33+
languages = {
34+
"ko": "kor_Hang",
35+
"ja": "jpn_Jpan",
36+
"en": "eng_Latn",
37+
"es": "spa_Latn",
38+
"fr": "fra_Latn",
39+
"de": "deu_Latn",
40+
"zh": "zho_Hans",
41+
}
42+
43+
source_language_code = languages.get(source_language, source_language)
44+
target_language_code = languages.get(target_language, target_language)
45+
46+
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=source_language_code)
47+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
48+
49+
tokenized_text = tokenizer(text, return_tensors="pt")
50+
translation = model.generate(
51+
**tokenized_text,
52+
forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_language_code),
53+
)
3854

3955
translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)
4056
return translated_text

tests/test_translator.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,15 +7,15 @@ def test_translate_transcript() -> None:
77
"""Tests that the transcript was translated properly."""
88
sample_transcript = {
99
"segments": [
10-
{"text": "안녕하세요."},
10+
{"text": "시대를 초월한 마음."},
1111
{"text": "음식."},
1212
],
1313
"language": "ko",
1414
}
1515

1616
actual = translate_transcript(sample_transcript, target_language="en")
1717
expected = [
18-
{"text": "Hello."},
18+
{"text": "A mind beyond its time."},
1919
{"text": "Food."},
2020
]
2121

0 commit comments

Comments
 (0)