

Ukrainian tokenizer support
@76960d85c7d793c356a1e6e083d759e06b9ef28d
--- whisper_online.py
+++ whisper_online.py
@@ -4,7 +4,7 @@
 import librosa
 from functools import lru_cache
 import time
-from mosestokenizer import MosesTokenizer
+


 @lru_cache
@@ -207,14 +207,12 @@

     SAMPLING_RATE = 16000

-    def __init__(self, language, asr):
-        """language: lang. code that MosesTokenizer uses for sentence segmentation
-        asr: WhisperASR object
-        chunk: number of seconds for intended size of audio interval that is inserted and looped
+    def __init__(self, asr, tokenizer):
+        """asr: WhisperASR object
+        tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
         """
-        self.language = language
         self.asr = asr
-        self.tokenizer = MosesTokenizer(self.language)
+        self.tokenizer = tokenizer

         self.init()

@@ -369,7 +367,7 @@
         self.last_chunked_at = time

     def words_to_sentences(self, words):
-        """Uses mosestokenizer for sentence segmentation of words.
+        """Uses self.tokenizer for sentence segmentation of words.
         Returns: [(beg,end,"sentence 1"),...]
         """

@@ -419,6 +417,15 @@
         return (b,e,t)


+def create_tokenizer(lan):
+    if lan == "uk":
+        import tokenize_uk
+        class UkrainianTokenizer:
+            def split(self, text):
+                return tokenize_uk.tokenize_sents(text)
+        return UkrainianTokenizer()
+    from mosestokenizer import MosesTokenizer
+    return MosesTokenizer(lan)

 ## main:

@@ -482,8 +489,9 @@
     print("setting VAD filter",file=sys.stderr)
     asr.use_vad()

+
 min_chunk = args.min_chunk_size
-online = OnlineASRProcessor(tgt_language,asr)
+online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))


 # load the audio into the LRU cache before we start the timer
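
With this change OnlineASRProcessor no longer imports MosesTokenizer itself: the constructor accepts any object that exposes a split(text) method returning a list of sentence strings, and create_tokenizer(lan) builds one per language code. A minimal sketch of that contract, assuming only the new constructor signature above (RegexSentenceTokenizer is a hypothetical stand-in, not part of this commit):

import re

class RegexSentenceTokenizer:
    # hypothetical drop-in tokenizer: the only requirement is a
    # split(text) method that returns a list of sentence strings,
    # like the objects produced by create_tokenizer()
    def split(self, text):
        # naive split on sentence-final punctuation followed by whitespace
        return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

# assumed wiring with the updated constructor:
# online = OnlineASRProcessor(asr, RegexSentenceTokenizer())
# or, for a language handled by the factory:
# online = OnlineASRProcessor(asr, create_tokenizer("uk"))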
--- whisper_online_server.py
+++ whisper_online_server.py
@@ -48,6 +48,9 @@

 if args.task == "translate":
     asr.set_translate_task()
+    tgt_language = "en"
+else:
+    tgt_language = language

 e = time.time()
 print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
@@ -58,7 +61,7 @@


 min_chunk = args.min_chunk_size
-online = OnlineASRProcessor(language,asr)
+online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))



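
On the server side the tokenizer language now follows the task rather than the input: Whisper's translate task always emits English text, so sentence segmentation must use the English tokenizer, while plain transcription keeps the source language. A condensed sketch of the resulting initialization, using only names that appear in the diff above:

if args.task == "translate":
    asr.set_translate_task()
    tgt_language = "en"         # translated output is always English
else:
    tgt_language = language     # transcription stays in the source language

min_chunk = args.min_chunk_size
online = OnlineASRProcessor(asr, create_tokenizer(tgt_language))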