Fedir Zadniprovskyi 2024-06-03
feat: allow using any ctranslate2-compatible model #14
@bb600af152fd4f84fa76d69aa5fded709153a03e
Dockerfile.cpu
--- Dockerfile.cpu
+++ Dockerfile.cpu
@@ -15,7 +15,7 @@
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=medium.en
+ENV WHISPER_MODEL=Systran/faster-whisper-medium.en
 ENV WHISPER_INFERENCE_DEVICE=cpu
 ENV WHISPER_COMPUTE_TYPE=int8
 ENV UVICORN_HOST=0.0.0.0
Dockerfile.cuda
--- Dockerfile.cuda
+++ Dockerfile.cuda
@@ -15,7 +15,7 @@
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=distil-large-v3
+ENV WHISPER_MODEL=Systran/faster-distil-whisper-large-v3
 ENV WHISPER_INFERENCE_DEVICE=cuda
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
README.md
--- README.md
+++ README.md
@@ -38,9 +38,9 @@
 export OPENAI_BASE_URL=http://localhost:8000/v1/
 ```
 ```bash
-openai api audio.transcriptions.create -m distil-large-v3 -f audio.wav --response-format text
+openai api audio.transcriptions.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format text
 
-openai api audio.translations.create -m distil-large-v3 -f audio.wav --response-format verbose_json
+openai api audio.translations.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format verbose_json
 ```
 ### OpenAI API Python SDK
 ```python
@@ -50,7 +50,7 @@
 
 audio_file = open("audio.wav", "rb")
 transcript = client.audio.transcriptions.create(
-    model="distil-large-v3", file=audio_file
+    model="Systran/faster-distil-whisper-large-v3", file=audio_file
 )
 print(transcript.text)
 ```
@@ -61,9 +61,9 @@
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.mp3"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true"
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=Systran/faster-distil-whisper-large-v3"
 # It's recommended that you always specify the language, as that will reduce the transcription time
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3" -F "language=en"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"
 
 curl http://localhost:8000/v1/audio/translations -F "file=@audio.wav"
 ```
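With this change, the `model` parameter in these examples is no longer limited to the names from the removed enum: any CTranslate2-compatible Hugging Face repo ID can be passed, and the model is downloaded and cached on the first request that uses it. A minimal sketch using the OpenAI Python SDK (the repo ID is only illustrative, and the placeholder API key assumes the server does not validate it):

```python
from openai import OpenAI

# Point the client at faster-whisper-server; the API key is a placeholder,
# assuming the server does not check it.
client = OpenAI(base_url="http://localhost:8000/v1/", api_key="placeholder")

with open("audio.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        # Any CTranslate2-compatible Hugging Face repo ID; downloaded and
        # cached on first use, so the first request may take a while.
        model="Systran/faster-whisper-small.en",
        file=audio_file,
    )
print(transcript.text)
```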
examples/live-audio/script.sh
--- examples/live-audio/script.sh
+++ examples/live-audio/script.sh
@@ -7,7 +7,7 @@
 # ffmpeg -y -hide_banner -loglevel quiet -i audio.mp3 -ac 1 -ar 16000 -f s16le -acodec pcm_s16le audio.pcm
 # rm -f audio.mp3
 
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for a faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
examples/youtube/script.sh
--- examples/youtube/script.sh
+++ examples/youtube/script.sh
@@ -3,7 +3,7 @@
 set -e
 
 # NOTE: do not use any distil-* model other than the large ones as they don't work on long audio files for some reason.
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for a faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
@@ -13,7 +13,7 @@
 # Download the audio from a YouTube video. In this example, I'm downloading "The Evolution of the Operating System" from the Asionometry YouTube channel. I highly recommend checking this channel out; the guy produces very high-quality content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
 
-# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `distil-large-v3` takes ~30 seconds on Nvidia L4. `tiny.en` takes ~1 minute on Ryzen 7 7700X. The .txt file in the example was transcribed using `distil-large-v3`.
+# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `Systran/faster-distil-whisper-large-v3` takes ~30 seconds on Nvidia L4. `Systran/faster-whisper-tiny.en` takes ~1 minute on Ryzen 7 7700X. The .txt file in the example was transcribed using `Systran/faster-distil-whisper-large-v3`.
 curl -s http://localhost:8000/v1/audio/transcriptions -F "file=@the-evolution-of-the-operating-system.mp3" -F "stream=true" -F "language=en" -F "response_format=text" | tee the-evolution-of-the-operating-system.txt
 
 # Here I'm using `aichat` which is a CLI LLM client. You could use any other client that supports attaching/uploading files. https://github.com/sigoden/aichat
faster_whisper_server/config.py
--- faster_whisper_server/config.py
+++ faster_whisper_server/config.py
@@ -46,26 +46,6 @@
     # I see a lot of equivalence between this new LLM OS and operating systems of today.
 
 
-# https://huggingface.co/Systran
-class Model(enum.StrEnum):
-    TINY_EN = "tiny.en"
-    TINY = "tiny"
-    BASE_EN = "base.en"
-    BASE = "base"
-    SMALL_EN = "small.en"
-    SMALL = "small"
-    MEDIUM_EN = "medium.en"
-    MEDIUM = "medium"
-    LARGE = "large"
-    LARGE_V1 = "large-v1"
-    LARGE_V2 = "large-v2"
-    LARGE_V3 = "large-v3"
-    DISTIL_SMALL_EN = "distil-small.en"
-    DISTIL_MEDIUM_EN = "distil-medium.en"
-    DISTIL_LARGE_V2 = "distil-large-v2"
-    DISTIL_LARGE_V3 = "distil-large-v3"
-
-
 class Device(enum.StrEnum):
     CPU = "cpu"
     CUDA = "cuda"
@@ -189,7 +169,12 @@
 
 
 class WhisperConfig(BaseModel):
-    model: Model = Field(default=Model.MEDIUM_EN)
+    model: str = Field(default="Systran/faster-whisper-medium.en")
+    """
+    Hugging Face model to use for transcription. Note: the model must support being run using CTranslate2.
+    Models created by the authors of `faster-whisper` can be found at https://huggingface.co/Systran
+    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
+    """
     inference_device: Device = Field(default=Device.AUTO)
     compute_type: Quantization = Field(default=Quantization.DEFAULT)
 
@@ -209,21 +194,21 @@
     default_response_format: ResponseFormat = ResponseFormat.JSON
     whisper: WhisperConfig = WhisperConfig()
     max_models: int = 1
+    max_no_data_seconds: float = 1.0
     """
     Max duration to wait for the next audio chunk before the transcription is finalized and the connection is closed.
     """
-    max_no_data_seconds: float = 1.0
     min_duration: float = 1.0
     word_timestamp_error_margin: float = 0.2
+    max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before the transcription is finalized and the connection is closed.
     """
-    max_inactivity_seconds: float = 5.0
+    inactivity_window_seconds: float = 10.0
     """
     Controls how many of the latest seconds of audio are passed through VAD.
     Should be greater than `max_inactivity_seconds`.
     """
-    inactivity_window_seconds: float = 10.0
 
 
 config = Config()
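Since `WhisperConfig.model` is now a plain string, the configuration can point at any CTranslate2-compatible repository. A minimal sketch, assuming `WhisperConfig`, `Device`, and `Quantization` are importable from `faster_whisper_server.config` as this diff suggests (the repo ID is illustrative; validity is only checked when the model is actually loaded):

```python
from faster_whisper_server.config import Device, Quantization, WhisperConfig

# The Model enum is gone, so any string is accepted here; an invalid repo ID
# only fails later, when WhisperModel tries to load it.
whisper_config = WhisperConfig(
    model="Systran/faster-whisper-tiny.en",  # illustrative repo ID
    inference_device=Device.CPU,
    compute_type=Quantization.DEFAULT,
)
print(whisper_config.model)
```

In practice the same value is usually supplied through the `WHISPER_MODEL` environment variable, as the Dockerfiles and example scripts above do.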
faster_whisper_server/main.py
--- faster_whisper_server/main.py
+++ faster_whisper_server/main.py
@@ -26,7 +26,6 @@
 from faster_whisper_server.config import (
     SAMPLES_PER_SECOND,
     Language,
-    Model,
     ResponseFormat,
     config,
 )
@@ -37,10 +36,10 @@
 )
 from faster_whisper_server.transcriber import audio_transcriber
 
-models: OrderedDict[Model, WhisperModel] = OrderedDict()
+models: OrderedDict[str, WhisperModel] = OrderedDict()
 
 
-def load_model(model_name: Model) -> WhisperModel:
+def load_model(model_name: str) -> WhisperModel:
     if model_name in models:
         logger.debug(f"{model_name} model already loaded")
         return models[model_name]
@@ -50,8 +49,9 @@
             f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
         )
         del models[oldest_model_name]
-    logger.debug(f"Loading {model_name}")
+    logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()
+    # NOTE: will raise an exception if the model name isn't valid
     whisper = WhisperModel(
         model_name,
         device=config.whisper.inference_device,
@@ -84,7 +84,7 @@
 @app.post("/v1/audio/translations")
 def translate_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
@@ -135,7 +135,7 @@
 @app.post("/v1/audio/transcriptions")
 def transcribe_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     language: Annotated[Language | None, Form()] = config.default_language,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
@@ -235,7 +235,7 @@
 @app.websocket("/v1/audio/transcriptions")
 async def transcribe_stream(
     ws: WebSocket,
-    model: Annotated[Model, Query()] = config.whisper.model,
+    model: Annotated[str, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
     response_format: Annotated[
         ResponseFormat, Query()
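`load_model` now passes the name straight to `faster_whisper.WhisperModel`, which raises if the repository is not a valid CTranslate2 conversion. A quick way to sanity-check a candidate repo ID before pointing the server at it is to load it the same way; a sketch (repo ID, device, compute type, and audio file are illustrative):

```python
from faster_whisper import WhisperModel

# Mirrors the call in load_model(): a repo that is not a CTranslate2
# conversion raises here instead of at request time.
model = WhisperModel(
    "Systran/faster-whisper-tiny.en",  # candidate repo ID
    device="cpu",
    compute_type="int8",
)
segments, _info = model.transcribe("audio.wav", language="en")
print("".join(segment.text for segment in segments))
```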