

feat: allow using any ctranslate2 compatible model #14
@bb600af152fd4f84fa76d69aa5fded709153a03e
--- Dockerfile.cpu
+++ Dockerfile.cpu
@@ -15,7 +15,7 @@
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=medium.en
+ENV WHISPER_MODEL=Systran/faster-whisper-medium.en
 ENV WHISPER_INFERENCE_DEVICE=cpu
 ENV WHISPER_COMPUTE_TYPE=int8
 ENV UVICORN_HOST=0.0.0.0
--- Dockerfile.cuda
+++ Dockerfile.cuda
@@ -15,7 +15,7 @@
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=distil-large-v3
+ENV WHISPER_MODEL=Systran/faster-distil-whisper-large-v3
 ENV WHISPER_INFERENCE_DEVICE=cuda
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
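
Since `WHISPER_MODEL` is now a free-form Hugging Face repo ID rather than a fixed enum value, any CTranslate2-converted Whisper repo can be dropped into these Dockerfiles. Below is a rough, optional sketch (not part of the diff) for sanity-checking a repo before pointing `WHISPER_MODEL` at it; the file names it looks for are an assumption based on how the `Systran/faster-whisper-*` repos are laid out.

```python
# Hypothetical helper, not part of this PR: check whether a Hugging Face repo
# looks like a CTranslate2 conversion before using it as WHISPER_MODEL.
from huggingface_hub import list_repo_files


def looks_like_ctranslate2_repo(repo_id: str) -> bool:
    files = set(list_repo_files(repo_id))
    # Assumption: CTranslate2 conversions ship the converted weights as
    # model.bin next to a config.json, as the Systran repos do.
    return "model.bin" in files and "config.json" in files


print(looks_like_ctranslate2_repo("Systran/faster-whisper-medium.en"))  # expected: True
```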
--- README.md
+++ README.md
@@ -38,9 +38,9 @@
 export OPENAI_BASE_URL=http://localhost:8000/v1/
 ```
 ```bash
-openai api audio.transcriptions.create -m distil-large-v3 -f audio.wav --response-format text
+openai api audio.transcriptions.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format text
 
-openai api audio.translations.create -m distil-large-v3 -f audio.wav --response-format verbose_json
+openai api audio.translations.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format verbose_json
 ```
 ### OpenAI API Python SDK
 ```python
@@ -50,7 +50,7 @@
 
 audio_file = open("audio.wav", "rb")
 transcript = client.audio.transcriptions.create(
-    model="distil-large-v3", file=audio_file
+    model="Systran/faster-distil-whisper-large-v3", file=audio_file
 )
 print(transcript.text)
 ```
@@ -61,9 +61,9 @@
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.mp3"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true"
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=Systran/faster-distil-whisper-large-v3"
 # It's recommended that you always specify the language as that will reduce the transcription time
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3" -F "language=en"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"
 
 curl http://localhost:8000/v1/audio/translations -F "file=@audio.wav"
 ```
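
The README's Python SDK snippet only covers transcription; a matching translation call with the same free-form model string would look roughly like this (a sketch, not part of the diff; the placeholder API key assumes the server does not validate it).

```python
# Sketch mirroring the README's transcription example, but for translations.
from openai import OpenAI

# base_url points at faster-whisper-server; the API key is a placeholder,
# assuming the server does not validate it.
client = OpenAI(base_url="http://localhost:8000/v1/", api_key="placeholder")

audio_file = open("audio.wav", "rb")
translation = client.audio.translations.create(
    model="Systran/faster-distil-whisper-large-v3", file=audio_file
)
print(translation.text)
```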
--- examples/live-audio/script.sh
+++ examples/live-audio/script.sh
@@ -7,7 +7,7 @@
 # ffmpeg -y -hide_banner -loglevel quiet -i audio.mp3 -ac 1 -ar 16000 -f s16le -acodec pcm_s16le audio.pcm
 # rm -f audio.mp3
 
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for a faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for a faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it expect to wait up-to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
--- examples/youtube/script.sh
+++ examples/youtube/script.sh
@@ -3,7 +3,7 @@
 set -e
 
 # NOTE: do not use any distil-* model other than the large ones as they don't work on long audio files for some reason.
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for a faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for a faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it expect to wait up-to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
@@ -13,7 +13,7 @@
 # Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by Asionometry YouTube channel. I highly checking this channel out, the guy produces very high content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
 
-# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `distil-large-v3` takes ~30 seconds on Nvidia L4. `tiny.en` takes ~1 minute on Ryzen 7 7700X. The .txt file in the example was transcribed using `distil-large-v3`.
+# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `Systran/faster-distil-whisper-large-v3` takes ~30 seconds on Nvidia L4. `Systran/faster-whisper-tiny.en` takes ~1 minute on Ryzen 7 7700X. The .txt file in the example was transcribed using `Systran/faster-distil-whisper-large-v3`.
 curl -s http://localhost:8000/v1/audio/transcriptions -F "file=@the-evolution-of-the-operating-system.mp3" -F "stream=true" -F "language=en" -F "response_format=text" | tee the-evolution-of-the-operating-system.txt
 
 # Here I'm using `aichat` which is a CLI LLM client. You could use any other client that supports attaching/uploading files. https://github.com/sigoden/aichat
--- faster_whisper_server/config.py
+++ faster_whisper_server/config.py
@@ -46,26 +46,6 @@
 # I see a lot of equivalence between this new LLM OS and operating systems of today.
 
 
-# https://huggingface.co/Systran
-class Model(enum.StrEnum):
-    TINY_EN = "tiny.en"
-    TINY = "tiny"
-    BASE_EN = "base.en"
-    BASE = "base"
-    SMALL_EN = "small.en"
-    SMALL = "small"
-    MEDIUM_EN = "medium.en"
-    MEDIUM = "medium"
-    LARGE = "large"
-    LARGE_V1 = "large-v1"
-    LARGE_V2 = "large-v2"
-    LARGE_V3 = "large-v3"
-    DISTIL_SMALL_EN = "distil-small.en"
-    DISTIL_MEDIUM_EN = "distil-medium.en"
-    DISTIL_LARGE_V2 = "distil-large-v2"
-    DISTIL_LARGE_V3 = "distil-large-v3"
-
-
 class Device(enum.StrEnum):
     CPU = "cpu"
     CUDA = "cuda"
@@ -189,7 +169,12 @@
 
 
 class WhisperConfig(BaseModel):
-    model: Model = Field(default=Model.MEDIUM_EN)
+    model: str = Field(default="Systran/faster-whisper-medium.en")
+    """
+    Huggingface model to use for transcription. Note, the model must support being ran using CTranslate2.
+    Models created by authors of `faster-whisper` can be found at https://huggingface.co/Systran
+    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
+    """
     inference_device: Device = Field(default=Device.AUTO)
     compute_type: Quantization = Field(default=Quantization.DEFAULT)
 
@@ -209,21 +194,21 @@
     default_response_format: ResponseFormat = ResponseFormat.JSON
     whisper: WhisperConfig = WhisperConfig()
     max_models: int = 1
+    max_no_data_seconds: float = 1.0
     """
     Max duration to for the next audio chunk before transcription is finilized and connection is closed.
     """
-    max_no_data_seconds: float = 1.0
     min_duration: float = 1.0
     word_timestamp_error_margin: float = 0.2
+    max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before transcription is finilized and connection is closed.
     """
-    max_inactivity_seconds: float = 5.0
+    inactivity_window_seconds: float = 10.0
     """
     Controls how many latest seconds of audio are being passed through VAD.
     Should be greater than `max_inactivity_seconds`
     """
-    inactivity_window_seconds: float = 10.0
 
 
 config = Config()
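
The `model` field can be a plain string because `faster_whisper.WhisperModel` already accepts a Hugging Face repo ID (or a local path to a CTranslate2-converted model), not just a size alias. A standalone sketch of what the server ends up doing with the configured value, using the Dockerfile.cpu defaults for device and compute type:

```python
# Sketch of how the configured model string is consumed; device and
# compute_type mirror the Dockerfile.cpu defaults (cpu, int8).
from faster_whisper import WhisperModel

model = WhisperModel(
    "Systran/faster-whisper-medium.en",  # any CTranslate2-converted repo ID works here
    device="cpu",
    compute_type="int8",
)
segments, _info = model.transcribe("audio.wav", language="en")
print("".join(segment.text for segment in segments))
```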
--- faster_whisper_server/main.py
+++ faster_whisper_server/main.py
@@ -26,7 +26,6 @@
 from faster_whisper_server.config import (
     SAMPLES_PER_SECOND,
     Language,
-    Model,
     ResponseFormat,
     config,
 )
@@ -37,10 +36,10 @@
 )
 from faster_whisper_server.transcriber import audio_transcriber
 
-models: OrderedDict[Model, WhisperModel] = OrderedDict()
+models: OrderedDict[str, WhisperModel] = OrderedDict()
 
 
-def load_model(model_name: Model) -> WhisperModel:
+def load_model(model_name: str) -> WhisperModel:
     if model_name in models:
         logger.debug(f"{model_name} model already loaded")
         return models[model_name]
@@ -50,8 +49,9 @@
             f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
         )
         del models[oldest_model_name]
-    logger.debug(f"Loading {model_name}")
+    logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()
+    # NOTE: will raise an exception if the model name isn't valid
     whisper = WhisperModel(
         model_name,
         device=config.whisper.inference_device,
@@ -84,7 +84,7 @@
 @app.post("/v1/audio/translations")
 def translate_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
@@ -135,7 +135,7 @@
 @app.post("/v1/audio/transcriptions")
 def transcribe_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     language: Annotated[Language | None, Form()] = config.default_language,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
@@ -235,7 +235,7 @@
 @app.websocket("/v1/audio/transcriptions")
 async def transcribe_stream(
     ws: WebSocket,
-    model: Annotated[Model, Query()] = config.whisper.model,
+    model: Annotated[str, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
     response_format: Annotated[
         ResponseFormat, Query()
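
With model names now arbitrary strings, `load_model` keeps at most `config.max_models` entries keyed by name and evicts the oldest when a new one is requested. A simplified, self-contained sketch of that eviction behaviour (a plain string stands in for the loaded `WhisperModel`):

```python
# Simplified sketch of the model cache in main.py; a string stands in for
# WhisperModel so the eviction logic can be exercised on its own.
from collections import OrderedDict

MAX_MODELS = 1  # mirrors the default config.max_models
models: OrderedDict[str, str] = OrderedDict()


def load_model(model_name: str) -> str:
    if model_name in models:
        return models[model_name]
    if len(models) >= MAX_MODELS:
        oldest_model_name, _ = models.popitem(last=False)  # drop the oldest entry
        print(f"Max models ({MAX_MODELS}) reached. Unloading: {oldest_model_name}")
    models[model_name] = f"<WhisperModel {model_name}>"
    return models[model_name]


load_model("Systran/faster-whisper-medium.en")
load_model("Systran/faster-distil-whisper-large-v3")  # evicts medium.en
print(list(models))  # ['Systran/faster-distil-whisper-large-v3']
```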