silask 2024-12-31
fix #7
@11c47567cf48af1e26d7ab1a6b2fb95996992c38
whisper_online.py
--- whisper_online.py
+++ whisper_online.py
@@ -459,16 +459,20 @@
     SAMPLING_RATE = 16000
 
     def __init__(
-        self, asr, tokenizer=None, buffer_trimming=("segment", 15), logfile=sys.stderr
+        self,
+        asr,
+        tokenize_method=None,
+        buffer_trimming=("segment", 15),
+        logfile=sys.stderr,
     ):
         """asr: WhisperASR object
-        tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
+        tokenize_method: sentence tokenizer function for the target language. Must be a callable that behaves like the *split* method of MosesTokenizer. It can be None if the "segment" buffer trimming option is used; in that case the tokenizer is not used at all.
         ("segment", 15)
         buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
         logfile: where to store the log.
         """
         self.asr = asr
-        self.tokenizer = tokenizer
+        self.tokenize = tokenize_method
         self.logfile = logfile
 
         self.init()
@@ -612,13 +616,13 @@
         self.buffer_time_offset = time
 
     def words_to_sentences(self, words):
-        """Uses self.tokenizer for sentence segmentation of words.
+        """Uses self.tokenize for sentence segmentation of words.
         Returns: [(beg,end,"sentence 1"),...]
         """
 
         cwords = [w for w in words]
         t = " ".join(o[2] for o in cwords)
-        s = self.tokenizer.split(t)
+        s = self.tokenize(t)
         out = []
         while s:
             beg = None
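
For reference, a minimal caller sketch of the renamed parameter. The wiring below is an assumption, not part of this commit: MosesSentenceSplitter comes from the mosestokenizer package and its instances are callable (taking a list of lines) rather than exposing a .split() method, and the FasterWhisperASR / OnlineASRProcessor names follow the usage shown in this project's README.

from mosestokenizer import MosesSentenceSplitter
from whisper_online import FasterWhisperASR, OnlineASRProcessor

asr = FasterWhisperASR("en", "large-v2")      # backend object, as in the README
splitter = MosesSentenceSplitter("en")

# After this commit, any callable mapping text -> list of sentences works.
# MosesSentenceSplitter instances are callable but expect a list of lines,
# so a small adapter lambda bridges them to the plain-string call made in
# words_to_sentences().
online = OnlineASRProcessor(
    asr,
    tokenize_method=lambda t: splitter([t]),  # hypothetical adapter, not in the commit
    buffer_trimming=("sentence", 15),
)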