Fedir Zadniprovskyi 2024-06-03
feat: allow using any ctranslate2-compatible model #14
@bb600af152fd4f84fa76d69aa5fded709153a03e
Dockerfile.cpu
--- Dockerfile.cpu
+++ Dockerfile.cpu
@@ -15,7 +15,7 @@
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=medium.en
+ENV WHISPER_MODEL=Systran/faster-whisper-medium.en
 ENV WHISPER_INFERENCE_DEVICE=cpu
 ENV WHISPER_COMPUTE_TYPE=int8
 ENV UVICORN_HOST=0.0.0.0
Dockerfile.cuda
--- Dockerfile.cuda
+++ Dockerfile.cuda
@@ -15,7 +15,7 @@
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=distil-large-v3
+ENV WHISPER_MODEL=Systran/faster-distil-whisper-large-v3
 ENV WHISPER_INFERENCE_DEVICE=cuda
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
README.md
--- README.md
+++ README.md
@@ -38,9 +38,9 @@
 export OPENAI_BASE_URL=http://localhost:8000/v1/
 ```
 ```bash
-openai api audio.transcriptions.create -m distil-large-v3 -f audio.wav --response-format text
+openai api audio.transcriptions.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format text
 
-openai api audio.translations.create -m distil-large-v3 -f audio.wav --response-format verbose_json
+openai api audio.translations.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format verbose_json
 ```
 ### OpenAI API Python SDK
 ```python
@@ -50,7 +50,7 @@
 
 audio_file = open("audio.wav", "rb")
 transcript = client.audio.transcriptions.create(
-    model="distil-large-v3", file=audio_file
+    model="Systran/faster-distil-whisper-large-v3", file=audio_file
 )
 print(transcript.text)
 ```
@@ -61,9 +61,9 @@
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.mp3"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true"
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=Systran/faster-distil-whisper-large-v3"
 # It's recommended that you always specify the language, as that will reduce the transcription time
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3" -F "language=en"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"
 
 curl http://localhost:8000/v1/audio/translations -F "file=@audio.wav"
 ```
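With this change, the `model` parameter in these examples is no longer limited to the names from the removed enum: any CTranslate2-compatible Hugging Face repo ID can be passed, and the model is downloaded and cached on the first request that uses it. A minimal sketch using the OpenAI Python SDK (the repo ID is only illustrative, and the placeholder API key assumes the server does not validate it):

```python
from openai import OpenAI

# Point the client at faster-whisper-server; the API key is a placeholder,
# assuming the server does not check it.
client = OpenAI(base_url="http://localhost:8000/v1/", api_key="placeholder")

with open("audio.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        # Any CTranslate2-compatible Hugging Face repo ID; downloaded and
        # cached on first use, so the first request may take a while.
        model="Systran/faster-whisper-small.en",
        file=audio_file,
    )
print(transcript.text)
```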
examples/live-audio/script.sh
--- examples/live-audio/script.sh
+++ examples/live-audio/script.sh
@@ -7,7 +7,7 @@
 # ffmpeg -y -hide_banner -loglevel quiet -i audio.mp3 -ac 1 -ar 16000 -f s16le -acodec pcm_s16le audio.pcm
 # rm -f audio.mp3
 
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for a faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
examples/youtube/script.sh
--- examples/youtube/script.sh
+++ examples/youtube/script.sh
@@ -3,7 +3,7 @@
 set -e
 
 # NOTE: do not use any distil-* model other than the large ones as they don't work on long audio files for some reason.
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for a faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
@@ -13,7 +13,7 @@
 # Download the audio from a YouTube video. In this example, I'm downloading "The Evolution of the Operating System" from the Asionometry YouTube channel. I highly recommend checking this channel out; the guy produces very high-quality content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
 
-# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `distil-large-v3` takes ~30 seconds on Nvidia L4. `tiny.en` takes ~1 minute on Ryzen 7 7700X. The .txt file in the example was transcribed using `distil-large-v3`.
+# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `Systran/faster-distil-whisper-large-v3` takes ~30 seconds on Nvidia L4. `Systran/faster-whisper-tiny.en` takes ~1 minute on Ryzen 7 7700X. The .txt file in the example was transcribed using `Systran/faster-distil-whisper-large-v3`.
 curl -s http://localhost:8000/v1/audio/transcriptions -F "file=@the-evolution-of-the-operating-system.mp3" -F "stream=true" -F "language=en" -F "response_format=text" | tee the-evolution-of-the-operating-system.txt
 
 # Here I'm using `aichat` which is a CLI LLM client. You could use any other client that supports attaching/uploading files. https://github.com/sigoden/aichat
faster_whisper_server/config.py
--- faster_whisper_server/config.py
+++ faster_whisper_server/config.py
@@ -46,26 +46,6 @@
     # I see a lot of equivalence between this new LLM OS and operating systems of today.
 
 
-# https://huggingface.co/Systran
-class Model(enum.StrEnum):
-    TINY_EN = "tiny.en"
-    TINY = "tiny"
-    BASE_EN = "base.en"
-    BASE = "base"
-    SMALL_EN = "small.en"
-    SMALL = "small"
-    MEDIUM_EN = "medium.en"
-    MEDIUM = "medium"
-    LARGE = "large"
-    LARGE_V1 = "large-v1"
-    LARGE_V2 = "large-v2"
-    LARGE_V3 = "large-v3"
-    DISTIL_SMALL_EN = "distil-small.en"
-    DISTIL_MEDIUM_EN = "distil-medium.en"
-    DISTIL_LARGE_V2 = "distil-large-v2"
-    DISTIL_LARGE_V3 = "distil-large-v3"
-
-
 class Device(enum.StrEnum):
     CPU = "cpu"
     CUDA = "cuda"
@@ -189,7 +169,12 @@
 
 
 class WhisperConfig(BaseModel):
-    model: Model = Field(default=Model.MEDIUM_EN)
+    model: str = Field(default="Systran/faster-whisper-medium.en")
+    """
+    Hugging Face model to use for transcription. Note: the model must support being run using CTranslate2.
+    Models created by the authors of `faster-whisper` can be found at https://huggingface.co/Systran
+    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
+    """
     inference_device: Device = Field(default=Device.AUTO)
     compute_type: Quantization = Field(default=Quantization.DEFAULT)
 
@@ -209,21 +194,21 @@
     default_response_format: ResponseFormat = ResponseFormat.JSON
     whisper: WhisperConfig = WhisperConfig()
     max_models: int = 1
+    max_no_data_seconds: float = 1.0
     """
     Max duration to wait for the next audio chunk before the transcription is finalized and the connection is closed.
     """
-    max_no_data_seconds: float = 1.0
     min_duration: float = 1.0
     word_timestamp_error_margin: float = 0.2
+    max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before the transcription is finalized and the connection is closed.
     """
-    max_inactivity_seconds: float = 5.0
+    inactivity_window_seconds: float = 10.0
     """
     Controls how many of the latest seconds of audio are passed through VAD.
     Should be greater than `max_inactivity_seconds`.
     """
-    inactivity_window_seconds: float = 10.0
 
 
 config = Config()
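Since `WhisperConfig.model` is now a plain string, the configuration can point at any CTranslate2-compatible repository. A minimal sketch, assuming `WhisperConfig`, `Device`, and `Quantization` are importable from `faster_whisper_server.config` as this diff suggests (the repo ID is illustrative; validity is only checked when the model is actually loaded):

```python
from faster_whisper_server.config import Device, Quantization, WhisperConfig

# The Model enum is gone, so any string is accepted here; an invalid repo ID
# only fails later, when WhisperModel tries to load it.
whisper_config = WhisperConfig(
    model="Systran/faster-whisper-tiny.en",  # illustrative repo ID
    inference_device=Device.CPU,
    compute_type=Quantization.DEFAULT,
)
print(whisper_config.model)
```

In practice the same value is usually supplied through the `WHISPER_MODEL` environment variable, as the Dockerfiles and example scripts above do.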
faster_whisper_server/main.py
--- faster_whisper_server/main.py
+++ faster_whisper_server/main.py
@@ -26,7 +26,6 @@
 from faster_whisper_server.config import (
     SAMPLES_PER_SECOND,
     Language,
-    Model,
     ResponseFormat,
     config,
 )
@@ -37,10 +36,10 @@
 )
 from faster_whisper_server.transcriber import audio_transcriber
 
-models: OrderedDict[Model, WhisperModel] = OrderedDict()
+models: OrderedDict[str, WhisperModel] = OrderedDict()
 
 
-def load_model(model_name: Model) -> WhisperModel:
+def load_model(model_name: str) -> WhisperModel:
     if model_name in models:
         logger.debug(f"{model_name} model already loaded")
         return models[model_name]
@@ -50,8 +49,9 @@
             f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
         )
         del models[oldest_model_name]
-    logger.debug(f"Loading {model_name}")
+    logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()
+    # NOTE: will raise an exception if the model name isn't valid
     whisper = WhisperModel(
         model_name,
         device=config.whisper.inference_device,
@@ -84,7 +84,7 @@
 @app.post("/v1/audio/translations")
 def translate_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
@@ -135,7 +135,7 @@
 @app.post("/v1/audio/transcriptions")
 def transcribe_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     language: Annotated[Language | None, Form()] = config.default_language,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
@@ -235,7 +235,7 @@
 @app.websocket("/v1/audio/transcriptions")
 async def transcribe_stream(
     ws: WebSocket,
-    model: Annotated[Model, Query()] = config.whisper.model,
+    model: Annotated[str, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
     response_format: Annotated[
         ResponseFormat, Query()
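`load_model` now passes the name straight to `faster_whisper.WhisperModel`, which raises if the repository is not a valid CTranslate2 conversion. A quick way to sanity-check a candidate repo ID before pointing the server at it is to load it the same way; a sketch (repo ID, device, compute type, and audio file are illustrative):

```python
from faster_whisper import WhisperModel

# Mirrors the call in load_model(): a repo that is not a CTranslate2
# conversion raises here instead of at request time.
model = WhisperModel(
    "Systran/faster-whisper-tiny.en",  # candidate repo ID
    device="cpu",
    compute_type="int8",
)
segments, _info = model.transcribe("audio.wav", language="en")
print("".join(segment.text for segment in segments))
```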