

Move the backend import into the child class's load_model method and expose a logfile argument
@cc7e524fc4c5be39875538e4b944c2a846b72316
--- whisper_online.py
+++ whisper_online.py
... | ... | @@ -30,11 +30,7 @@ |
30 | 30 |
self.transcribe_kargs = {} |
31 | 31 |
self.original_language = lan |
32 | 32 |
|
33 |
- self.import_backend() |
|
34 | 33 |
self.model = self.load_model(modelsize, cache_dir, model_dir) |
35 |
- |
|
36 |
- def import_backend(self): |
|
37 |
- raise NotImplemented("must be implemented in the child class") |
|
38 | 34 |
|
39 | 35 |
def load_model(self, modelsize, cache_dir): |
40 | 36 |
raise NotImplemented("must be implemented in the child class") |
... | ... | @@ -52,15 +48,13 @@ |
52 | 48 |
""" |
53 | 49 |
|
54 | 50 |
sep = " " |
55 |
- |
|
56 |
- def import_backend(self): |
|
57 |
- global whisper, whisper_timestamped |
|
58 |
- import whisper |
|
59 |
- import whisper_timestamped |
|
60 | 51 |
|
61 | 52 |
def load_model(self, modelsize=None, cache_dir=None, model_dir=None): |
53 |
+ global whisper_timestamped # has to be global as it is used at each `transcribe` call |
|
54 |
+ import whisper |
|
55 |
+ import whisper_timestamped |
|
62 | 56 |
if model_dir is not None: |
63 |
- print("ignoring model_dir, not implemented",file=self.output) |
|
57 |
+ print("ignoring model_dir, not implemented",file=self.logfile) |
|
64 | 58 |
return whisper.load_model(modelsize, download_root=cache_dir) |
65 | 59 |
|
66 | 60 |
def transcribe(self, audio, init_prompt=""): |
... | ... | @@ -89,13 +83,10 @@ |
89 | 83 |
|
90 | 84 |
sep = "" |
91 | 85 |
|
92 |
- def import_backend(self): |
|
93 |
- global faster_whisper |
|
94 |
- import faster_whisper |
|
95 |
- |
|
96 | 86 |
def load_model(self, modelsize=None, cache_dir=None, model_dir=None): |
87 |
+ from faster_whisper import WhisperModel |
|
97 | 88 |
if model_dir is not None: |
98 |
- print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.output) |
|
89 |
+ print(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.",file=self.logfile) |
|
99 | 90 |
model_size_or_path = model_dir |
100 | 91 |
elif modelsize is not None: |
101 | 92 |
model_size_or_path = modelsize |
... | ... | @@ -143,7 +134,7 @@ |
143 | 134 |
|
144 | 135 |
class HypothesisBuffer: |
145 | 136 |
|
146 |
- def __init__(self, output=sys.stderr): |
|
137 |
+ def __init__(self, logfile=sys.stderr): |
|
147 | 138 |
"""output: where to store the log. Leave it unchanged to print to terminal.""" |
148 | 139 |
self.commited_in_buffer = [] |
149 | 140 |
self.buffer = [] |
... | ... | @@ -152,7 +143,7 @@ |
152 | 143 |
self.last_commited_time = 0 |
153 | 144 |
self.last_commited_word = None |
154 | 145 |
|
155 |
- self.output = output |
|
146 |
+ self.logfile = logfile |
|
156 | 147 |
|
157 | 148 |
def insert(self, new, offset): |
158 | 149 |
# compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content |
... | ... | @@ -172,9 +163,9 @@ |
172 | 163 |
c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1]) |
173 | 164 |
tail = " ".join(self.new[j-1][2] for j in range(1,i+1)) |
174 | 165 |
if c == tail: |
175 |
- print("removing last",i,"words:",file=self.output) |
|
166 |
+ print("removing last",i,"words:",file=self.logfile) |
|
176 | 167 |
for j in range(i): |
177 |
- print("\t",self.new.pop(0),file=self.output) |
|
168 |
+ print("\t",self.new.pop(0),file=self.logfile) |
|
178 | 169 |
break |
179 | 170 |
|
180 | 171 |
def flush(self): |
... | ... | @@ -211,14 +202,14 @@ |
211 | 202 |
|
212 | 203 |
SAMPLING_RATE = 16000 |
213 | 204 |
|
214 |
- def __init__(self, asr, tokenizer, output=sys.stderr): |
|
205 |
+ def __init__(self, asr, tokenizer, logfile=sys.stderr): |
|
215 | 206 |
"""asr: WhisperASR object |
216 | 207 |
tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer. |
217 | 208 |
output: where to store the log. Leave it unchanged to print to terminal. |
218 | 209 |
""" |
219 | 210 |
self.asr = asr |
220 | 211 |
self.tokenizer = tokenizer |
221 |
- self.output = output |
|
212 |
+ self.logfile = logfile |
|
222 | 213 |
|
223 | 214 |
self.init() |
224 | 215 |
|
... | ... | @@ -227,7 +218,7 @@ |
227 | 218 |
self.audio_buffer = np.array([],dtype=np.float32) |
228 | 219 |
self.buffer_time_offset = 0 |
229 | 220 |
|
230 |
- self.transcript_buffer = HypothesisBuffer(output=self.output) |
|
221 |
+ self.transcript_buffer = HypothesisBuffer(logfile=self.logfile) |
|
231 | 222 |
self.commited = [] |
232 | 223 |
self.last_chunked_at = 0 |
233 | 224 |
|
... | ... | @@ -262,9 +253,9 @@ |
262 | 253 |
""" |
263 | 254 |
|
264 | 255 |
prompt, non_prompt = self.prompt() |
265 |
- print("PROMPT:", prompt, file=self.output) |
|
266 |
- print("CONTEXT:", non_prompt, file=self.output) |
|
267 |
- print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.output) |
|
256 |
+ print("PROMPT:", prompt, file=self.logfile) |
|
257 |
+ print("CONTEXT:", non_prompt, file=self.logfile) |
|
258 |
+ print(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}",file=self.logfile) |
|
268 | 259 |
res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt) |
269 | 260 |
|
270 | 261 |
# transform to [(beg,end,"word1"), ...] |
... | ... | @@ -273,8 +264,8 @@ |
273 | 264 |
self.transcript_buffer.insert(tsw, self.buffer_time_offset) |
274 | 265 |
o = self.transcript_buffer.flush() |
275 | 266 |
self.commited.extend(o) |
276 |
- print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.output,flush=True) |
|
277 |
- print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.output,flush=True) |
|
267 |
+ print(">>>>COMPLETE NOW:",self.to_flush(o),file=self.logfile,flush=True) |
|
268 |
+ print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True) |
|
278 | 269 |
|
279 | 270 |
# there is a newly confirmed text |
280 | 271 |
if o: |
... | ... | @@ -293,14 +284,14 @@ |
293 | 284 |
# elif self.transcript_buffer.complete(): |
294 | 285 |
# self.silence_iters = 0 |
295 | 286 |
# elif not self.transcript_buffer.complete(): |
296 |
-# # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.output,flush=True) |
|
287 |
+# # print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True) |
|
297 | 288 |
# self.silence_iters += 1 |
298 | 289 |
# if self.silence_iters >= 3: |
299 | 290 |
# n = self.last_chunked_at |
300 | 291 |
## self.chunk_completed_sentence() |
301 | 292 |
## if n == self.last_chunked_at: |
302 | 293 |
# self.chunk_at(self.last_chunked_at+self.chunk) |
303 |
-# print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.output) |
|
294 |
+# print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile) |
|
304 | 295 |
## self.silence_iters = 0 |
305 | 296 |
|
306 | 297 |
|
... | ... | @@ -316,18 +307,18 @@ |
316 | 307 |
#while k>0 and self.commited[k][1] > l: |
317 | 308 |
# k -= 1 |
318 | 309 |
#t = self.commited[k][1] |
319 |
- print(f"chunking because of len",file=self.output) |
|
310 |
+ print(f"chunking because of len",file=self.logfile) |
|
320 | 311 |
#self.chunk_at(t) |
321 | 312 |
|
322 |
- print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.output) |
|
313 |
+ print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile) |
|
323 | 314 |
return self.to_flush(o) |
324 | 315 |
|
325 | 316 |
def chunk_completed_sentence(self): |
326 | 317 |
if self.commited == []: return |
327 |
- print(self.commited,file=self.output) |
|
318 |
+ print(self.commited,file=self.logfile) |
|
328 | 319 |
sents = self.words_to_sentences(self.commited) |
329 | 320 |
for s in sents: |
330 |
- print("\t\tSENT:",s,file=self.output) |
|
321 |
+ print("\t\tSENT:",s,file=self.logfile) |
|
331 | 322 |
if len(sents) < 2: |
332 | 323 |
return |
333 | 324 |
while len(sents) > 2: |
... | ... | @@ -335,7 +326,7 @@ |
335 | 326 |
# we will continue with audio processing at this timestamp |
336 | 327 |
chunk_at = sents[-2][1] |
337 | 328 |
|
338 |
- print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.output) |
|
329 |
+ print(f"--- sentence chunked at {chunk_at:2.2f}",file=self.logfile) |
|
339 | 330 |
self.chunk_at(chunk_at) |
340 | 331 |
|
341 | 332 |
def chunk_completed_segment(self, res): |
... | ... | @@ -352,12 +343,12 @@ |
352 | 343 |
ends.pop(-1) |
353 | 344 |
e = ends[-2]+self.buffer_time_offset |
354 | 345 |
if e <= t: |
355 |
- print(f"--- segment chunked at {e:2.2f}",file=self.output) |
|
346 |
+ print(f"--- segment chunked at {e:2.2f}",file=self.logfile) |
|
356 | 347 |
self.chunk_at(e) |
357 | 348 |
else: |
358 |
- print(f"--- last segment not within commited area",file=self.output) |
|
349 |
+ print(f"--- last segment not within commited area",file=self.logfile) |
|
359 | 350 |
else: |
360 |
- print(f"--- not enough segments to chunk",file=self.output) |
|
351 |
+ print(f"--- not enough segments to chunk",file=self.logfile) |
|
361 | 352 |
|
362 | 353 |
|
363 | 354 |
|
... | ... | @@ -403,7 +394,7 @@ |
403 | 394 |
""" |
404 | 395 |
o = self.transcript_buffer.complete() |
405 | 396 |
f = self.to_flush(o) |
406 |
- print("last, noncommited:",f,file=self.output) |
|
397 |
+ print("last, noncommited:",f,file=self.logfile) |
|
407 | 398 |
return f |
408 | 399 |
|
409 | 400 |
|
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?