

extract segments-to-response logic
commit 3673dfa4e158fa02fa4fea5be0f6e17d12cae09f
--- faster_whisper_server/main.py
+++ faster_whisper_server/main.py
@@ -4,7 +4,7 @@
 import time
 from contextlib import asynccontextmanager
 from io import BytesIO
-from typing import Annotated, Generator, Literal, OrderedDict
+from typing import Annotated, Generator, Iterable, Literal, OrderedDict
 
 import huggingface_hub
 from fastapi import (
@@ -21,6 +21,7 @@
 from fastapi.responses import StreamingResponse
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
+from faster_whisper.transcribe import Segment, TranscriptionInfo
 from faster_whisper.vad import VadOptions, get_speech_timestamps
 from huggingface_hub.hf_api import ModelInfo
 from pydantic import AfterValidator
@@ -132,8 +133,46 @@
 )
 
 
+def segments_to_response(
+    segments: Iterable[Segment],
+    transcription_info: TranscriptionInfo,
+    response_format: ResponseFormat,
+) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse:
+    segments = list(segments)
+    if response_format == ResponseFormat.TEXT:
+        return utils.segments_text(segments)
+    elif response_format == ResponseFormat.JSON:
+        return TranscriptionJsonResponse.from_segments(segments)
+    elif response_format == ResponseFormat.VERBOSE_JSON:
+        return TranscriptionVerboseJsonResponse.from_segments(
+            segments, transcription_info
+        )
+
+
 def format_as_sse(data: str) -> str:
     return f"data: {data}\n\n"
+
+
+def segments_to_streaming_response(
+    segments: Iterable[Segment],
+    transcription_info: TranscriptionInfo,
+    response_format: ResponseFormat,
+) -> StreamingResponse:
+    def segment_responses() -> Generator[str, None, None]:
+        for segment in segments:
+            if response_format == ResponseFormat.TEXT:
+                data = segment.text
+            elif response_format == ResponseFormat.JSON:
+                data = TranscriptionJsonResponse.from_segments(
+                    [segment]
+                ).model_dump_json()
+            elif response_format == ResponseFormat.VERBOSE_JSON:
+                data = TranscriptionVerboseJsonResponse.from_segment(
+                    segment, transcription_info
+                ).model_dump_json()
+            yield format_as_sse(data)
+
+    return StreamingResponse(segment_responses(), media_type="text/event-stream")
 
 
 def handle_default_openai_model(model_name: str) -> str:
@@ -168,7 +207,6 @@
     | TranscriptionVerboseJsonResponse
     | StreamingResponse
 ):
-    start = time.perf_counter()
     whisper = load_model(model)
     segments, transcription_info = whisper.transcribe(
         file.file,
@@ -178,36 +216,12 @@
         vad_filter=True,
     )
 
-    if not stream:
-        segments = list(segments)
-        logger.info(
-            f"Translated {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {time.perf_counter() - start:.2f} seconds"
+    if stream:
+        return segments_to_streaming_response(
+            segments, transcription_info, response_format
         )
-        if response_format == ResponseFormat.TEXT:
-            return utils.segments_text(segments)
-        elif response_format == ResponseFormat.JSON:
-            return TranscriptionJsonResponse.from_segments(segments)
-        elif response_format == ResponseFormat.VERBOSE_JSON:
-            return TranscriptionVerboseJsonResponse.from_segments(
-                segments, transcription_info
-            )
     else:
-
-        def segment_responses() -> Generator[str, None, None]:
-            for segment in segments:
-                if response_format == ResponseFormat.TEXT:
-                    data = segment.text
-                elif response_format == ResponseFormat.JSON:
-                    data = TranscriptionJsonResponse.from_segments(
-                        [segment]
-                    ).model_dump_json()
-                elif response_format == ResponseFormat.VERBOSE_JSON:
-                    data = TranscriptionVerboseJsonResponse.from_segment(
-                        segment, transcription_info
-                    ).model_dump_json()
-                yield format_as_sse(data)
-
-        return StreamingResponse(segment_responses(), media_type="text/event-stream")
+        return segments_to_response(segments, transcription_info, response_format)
 
 
 # https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -234,7 +248,6 @@
     | TranscriptionVerboseJsonResponse
     | StreamingResponse
 ):
-    start = time.perf_counter()
     whisper = load_model(model)
     segments, transcription_info = whisper.transcribe(
         file.file,
@@ -246,39 +259,12 @@
         vad_filter=True,
     )
 
-    if not stream:
-        segments = list(segments)
-        logger.info(
-            f"Transcribed {transcription_info.duration}({transcription_info.duration_after_vad}) seconds of audio in {time.perf_counter() - start:.2f} seconds"
+    if stream:
+        return segments_to_streaming_response(
+            segments, transcription_info, response_format
        )
-        if response_format == ResponseFormat.TEXT:
-            return utils.segments_text(segments)
-        elif response_format == ResponseFormat.JSON:
-            return TranscriptionJsonResponse.from_segments(segments)
-        elif response_format == ResponseFormat.VERBOSE_JSON:
-            return TranscriptionVerboseJsonResponse.from_segments(
-                segments, transcription_info
-            )
     else:
-
-        def segment_responses() -> Generator[str, None, None]:
-            for segment in segments:
-                logger.info(
-                    f"Transcribed {segment.end - segment.start} seconds of audio in {time.perf_counter() - start:.2f} seconds"
-                )
-                if response_format == ResponseFormat.TEXT:
-                    data = segment.text
-                elif response_format == ResponseFormat.JSON:
-                    data = TranscriptionJsonResponse.from_segments(
-                        [segment]
-                    ).model_dump_json()
-                elif response_format == ResponseFormat.VERBOSE_JSON:
-                    data = TranscriptionVerboseJsonResponse.from_segment(
-                        segment, transcription_info
-                    ).model_dump_json()
-                yield format_as_sse(data)
-
-        return StreamingResponse(segment_responses(), media_type="text/event-stream")
+        return segments_to_response(segments, transcription_info, response_format)
 
 
 async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
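
For reference, a minimal client sketch for the streaming path that segments_to_streaming_response now serves. This is an illustration, not part of the change: the server address, the audio.wav file name, and the httpx dependency are assumptions; the /v1/audio/transcriptions route comes from the OpenAI API-reference comment in the handler above, and whether `stream` is read from form data or the query string depends on the handler signature, which this diff does not show.

import httpx

# Hypothetical values: server address, file name, and the form-field
# transport for "stream" are assumptions, not shown in the diff above.
with open("audio.wav", "rb") as audio_file, httpx.Client(
    base_url="http://localhost:8000", timeout=None
) as client:
    with client.stream(
        "POST",
        "/v1/audio/transcriptions",
        files={"file": ("audio.wav", audio_file, "audio/wav")},
        data={"response_format": "json", "stream": "true"},
    ) as response:
        # One "data: ..." line per transcribed segment, produced by
        # segments_to_streaming_response via format_as_sse.
        for line in response.iter_lines():
            if line.startswith("data: "):
                print(line.removeprefix("data: "))

With response_format=json, each event body is a TranscriptionJsonResponse serialized for a single segment, so a client can render partial transcripts as they arrive rather than waiting for the whole file to be processed.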