Commit @88f046769818fa23660057c000a659005307ae14 - yjyoon/whisper_server

Fedir Zadniprovskyi 2024-07-03

chore: fix ruff errors

@88f046769818fa23660057c000a659005307ae14

4e2de91

88f0467

faster_whisper_server/asr.py

--- faster_whisper_server/asr.py

+++ faster_whisper_server/asr.py


 import asyncio
+from collections.abc import Iterable
 import time
-from typing import Iterable
 
 from faster_whisper import transcribe
 

         audio: Audio,
         prompt: str | None = None,
     ) -> tuple[Transcription, transcribe.TranscriptionInfo]:
-        """Wrapper around _transcribe so it can be used in async context"""
+        """Wrapper around _transcribe so it can be used in async context."""
         # is this the optimal way to execute a blocking call in an async context?
         # TODO: verify performance when running inference on a CPU
         return await asyncio.get_running_loop().run_in_executor(

4e2de91

88f0467

faster_whisper_server/audio.py

--- faster_whisper_server/audio.py

+++ faster_whisper_server/audio.py


 from __future__ import annotations
 
 import asyncio
-from typing import AsyncGenerator, BinaryIO
+from typing import TYPE_CHECKING, BinaryIO
 
 import numpy as np
 import soundfile as sf
-from numpy.typing import NDArray
 
 from faster_whisper_server.config import SAMPLES_PER_SECOND
 from faster_whisper_server.logger import logger
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from numpy.typing import NDArray
 
 
 def audio_samples_from_file(file: BinaryIO) -> NDArray[np.float32]:

         endian="LITTLE",
     )
     audio = audio_and_sample_rate[0]
-    return audio  # type: ignore
+    return audio  # pyright: ignore[reportReturnType]
 
 
 class Audio:

         self.modify_event.set()
         logger.info("AudioStream closed")
 
-    async def chunks(
-        self, min_duration: float
-    ) -> AsyncGenerator[NDArray[np.float32], None]:
+    async def chunks(self, min_duration: float) -> AsyncGenerator[NDArray[np.float32], None]:
         i = 0.0  # end time of last chunk
         while True:
             await self.modify_event.wait()

4e2de91

88f0467

faster_whisper_server/config.py

--- faster_whisper_server/config.py

+++ faster_whisper_server/config.py


     TEXT = "text"
     JSON = "json"
     VERBOSE_JSON = "verbose_json"
-    # NOTE: While inspecting outputs of these formats with `curl`, I noticed there's one or two "\n" inserted at the end of the response.
+    # NOTE: While inspecting outputs of these formats with `curl`, I noticed there's one or two "\n" inserted at the end of the response. # noqa: E501
 
     # VTT = "vtt" # TODO
     # 1

 
 
 class Config(BaseSettings):
-    """
-    Configuration for the application. Values can be set via environment variables.
+    """Configuration for the application. Values can be set via environment variables.
+
     Pydantic will automatically handle mapping uppercased environment variables to the corresponding fields.
     To populate nested, the environment should be prefixed with the nested field name and an underscore. For example,
     the environment variable `LOG_LEVEL` will be mapped to `log_level`, `WHISPER_MODEL` to `whisper.model`, etc.

     max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before transcription is finalized and connection is closed.
-    """
+    """  # noqa: E501
     inactivity_window_seconds: float = 10.0
     """
     Controls how many latest seconds of audio are being passed through VAD.

4e2de91

88f0467

faster_whisper_server/core.py

--- faster_whisper_server/core.py

+++ faster_whisper_server/core.py


 # TODO: rename module
 from __future__ import annotations
 
-import re
 from dataclasses import dataclass
+import re
 
 from faster_whisper_server.config import config
 

     def is_eos(self) -> bool:
         if self.text.endswith("..."):
             return False
-        for punctuation_symbol in ".?!":
-            if self.text.endswith(punctuation_symbol):
-                return True
-        return False
+        return any(self.text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
 
     def offset(self, seconds: float) -> None:
         self.start += seconds

     @classmethod
     def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
         i = 0
-        while (
-            i < len(a)
-            and i < len(b)
-            and canonicalize_word(a[i].text) == canonicalize_word(b[i].text)
-        ):
+        while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
             i += 1
         return a[:i]
 

         return self.end - self.start
 
     def after(self, seconds: float) -> Transcription:
-        return Transcription(
-            words=[word for word in self.words if word.start > seconds]
-        )
+        return Transcription(words=[word for word in self.words if word.start > seconds])
 
     def extend(self, words: list[Word]) -> None:
         self._ensure_no_word_overlap(words)

 
     def _ensure_no_word_overlap(self, words: list[Word]) -> None:
         if len(self.words) > 0 and len(words) > 0:
-            if (
-                words[0].start + config.word_timestamp_error_margin
-                <= self.words[-1].end
-            ):
+            if words[0].start + config.word_timestamp_error_margin <= self.words[-1].end:
                 raise ValueError(
-                    f"Words overlap: {self.words[-1]} and {words[0]}. Error margin: {config.word_timestamp_error_margin}"
+                    f"Words overlap: {self.words[-1]} and {words[0]}. Error margin: {config.word_timestamp_error_margin}"  # noqa: E501
                 )
         for i in range(1, len(words)):
             if words[i].start + config.word_timestamp_error_margin <= words[i - 1].end:
-                raise ValueError(
-                    f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}"
-                )
+                raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
 
 
-def test_segment_is_eos():
+def test_segment_is_eos() -> None:
     assert not Segment("Hello").is_eos
     assert not Segment("Hello...").is_eos
     assert Segment("Hello.").is_eos

     return sentences
 
 
-def tests_to_full_sentences():
+def tests_to_full_sentences() -> None:
     assert to_full_sentences([]) == []
     assert to_full_sentences([Word(text="Hello")]) == []
     assert to_full_sentences([Word(text="Hello..."), Word(" world")]) == []
-    assert to_full_sentences([Word(text="Hello..."), Word(" world.")]) == [
+    assert to_full_sentences([Word(text="Hello..."), Word(" world.")]) == [Segment(text="Hello... world.")]
+    assert to_full_sentences([Word(text="Hello..."), Word(" world."), Word(" How")]) == [
         Segment(text="Hello... world.")
     ]
-    assert to_full_sentences(
-        [Word(text="Hello..."), Word(" world."), Word(" How")]
-    ) == [Segment(text="Hello... world.")]
 
 
 def to_text(words: list[Word]) -> str:

     return text.lower().strip().strip(".,?!")
 
 
-def test_canonicalize_word():
+def test_canonicalize_word() -> None:
     assert canonicalize_word("ABC") == "abc"
     assert canonicalize_word("...ABC?") == "abc"
     assert canonicalize_word("... AbC  ...") == "abc"

 
 def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
     i = 0
-    while (
-        i < len(a)
-        and i < len(b)
-        and canonicalize_word(a[i].text) == canonicalize_word(b[i].text)
-    ):
+    while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
         i += 1
     return a[:i]
 
 
-def test_common_prefix():
+def test_common_prefix() -> None:
     def word(text: str) -> Word:
         return Word(text=text, start=0.0, end=0.0, probability=0.0)
 

     assert common_prefix(a, b) == []
 
 
-def test_common_prefix_and_canonicalization():
+def test_common_prefix_and_canonicalization() -> None:
     def word(text: str) -> Word:
         return Word(text=text, start=0.0, end=0.0, probability=0.0)
 

4e2de91

88f0467

faster_whisper_server/gradio_app.py

--- faster_whisper_server/gradio_app.py

+++ faster_whisper_server/gradio_app.py


+from collections.abc import Generator
 import os
-from typing import Generator
 
 import gradio as gr
 import httpx

 
 def create_gradio_demo(config: Config) -> gr.Blocks:
     host = os.getenv("UVICORN_HOST", "0.0.0.0")
-    port = os.getenv("UVICORN_PORT", 8000)
+    port = int(os.getenv("UVICORN_PORT", "8000"))
     # NOTE: worth looking into generated clients
     http_client = httpx.Client(base_url=f"http://{host}:{port}", timeout=None)
 
-    def handler(
-        file_path: str, model: str, task: Task, temperature: float, stream: bool
-    ) -> Generator[str, None, None]:
+    def handler(file_path: str, model: str, task: Task, temperature: float, stream: bool) -> Generator[str, None, None]:
         if stream:
             previous_transcription = ""
-            for transcription in transcribe_audio_streaming(
-                file_path, task, temperature, model
-            ):
+            for transcription in transcribe_audio_streaming(file_path, task, temperature, model):
                 previous_transcription += transcription
                 yield previous_transcription
         else:
             yield transcribe_audio(file_path, task, temperature, model)
 
-    def transcribe_audio(
-        file_path: str, task: Task, temperature: float, model: str
-    ) -> str:
+    def transcribe_audio(file_path: str, task: Task, temperature: float, model: str) -> str:
         if task == Task.TRANSCRIBE:
             endpoint = TRANSCRIPTION_ENDPOINT
         elif task == Task.TRANSLATE:

                     "stream": True,
                 },
             }
-            endpoint = (
-                TRANSCRIPTION_ENDPOINT
-                if task == Task.TRANSCRIBE
-                else TRANSLATION_ENDPOINT
-            )
+            endpoint = TRANSCRIPTION_ENDPOINT if task == Task.TRANSCRIBE else TRANSLATION_ENDPOINT
             with connect_sse(http_client, "POST", endpoint, **kwargs) as event_source:
                 for event in event_source.iter_sse():
                     yield event.data

         res_data = res.json()
         models: list[str] = [model["id"] for model in res_data]
         assert config.whisper.model in models
-        recommended_models = set(
-            model for model in models if model.startswith("Systran")
-        )
+        recommended_models = {model for model in models if model.startswith("Systran")}
         other_models = [model for model in models if model not in recommended_models]
         models = list(recommended_models) + other_models
-        model_dropdown = gr.Dropdown(
+        return gr.Dropdown(
             # no idea why it's complaining
-            choices=models,  # type: ignore
+            choices=models,  # pyright: ignore[reportArgumentType]
             label="Model",
             value=config.whisper.model,
         )
-        return model_dropdown
 
     model_dropdown = gr.Dropdown(
         choices=[config.whisper.model],

         label="Task",
         value=Task.TRANSCRIBE,
     )
-    temperature_slider = gr.Slider(
-        minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0
-    )
+    temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0)
     stream_checkbox = gr.Checkbox(label="Stream", value=True)
     with gr.Interface(
         title="Whisper Playground",
-        description="""Consider supporting the project by starring the <a href="https://github.com/fedirz/faster-whisper-server">repository on GitHub</a>.""",
+        description="""Consider supporting the project by starring the <a href="https://github.com/fedirz/faster-whisper-server">repository on GitHub</a>.""",  # noqa: E501
         inputs=[
             gr.Audio(type="filepath"),
             model_dropdown,

4e2de91

88f0467

faster_whisper_server/logger.py

--- faster_whisper_server/logger.py

+++ faster_whisper_server/logger.py


 root_logger.setLevel(logging.CRITICAL)
 logger = logging.getLogger(__name__)
 logger.setLevel(config.log_level.upper())
-logging.basicConfig(
-    format="%(asctime)s:%(levelname)s:%(name)s:%(funcName)s:%(message)s"
-)
+logging.basicConfig(format="%(asctime)s:%(levelname)s:%(name)s:%(funcName)s:%(message)s")

4e2de91

88f0467

faster_whisper_server/main.py

--- faster_whisper_server/main.py

+++ faster_whisper_server/main.py


 from __future__ import annotations
 
 import asyncio
-import time
+from collections import OrderedDict
 from io import BytesIO
-from typing import Annotated, Generator, Iterable, Literal, OrderedDict
+import time
+from typing import TYPE_CHECKING, Annotated, Literal
 
-import gradio as gr
-import huggingface_hub
 from fastapi import (
     FastAPI,
     Form,

 from fastapi.responses import StreamingResponse
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
-from faster_whisper.transcribe import Segment, TranscriptionInfo
 from faster_whisper.vad import VadOptions, get_speech_timestamps
-from huggingface_hub.hf_api import ModelInfo
+import gradio as gr
+import huggingface_hub
 from pydantic import AfterValidator
 
 from faster_whisper_server import utils

 )
 from faster_whisper_server.transcriber import audio_transcriber
 
+if TYPE_CHECKING:
+    from collections.abc import Generator, Iterable
+
+    from faster_whisper.transcribe import Segment, TranscriptionInfo
+    from huggingface_hub.hf_api import ModelInfo
+
 loaded_models: OrderedDict[str, WhisperModel] = OrderedDict()
 
 

         return loaded_models[model_name]
     if len(loaded_models) >= config.max_models:
         oldest_model_name = next(iter(loaded_models))
-        logger.info(
-            f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
-        )
+        logger.info(f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}")
         del loaded_models[oldest_model_name]
     logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()

         compute_type=config.whisper.compute_type,
     )
     logger.info(
-        f"Loaded {model_name} loaded in {time.perf_counter() - start:.2f} seconds. {config.whisper.inference_device}({config.whisper.compute_type}) will be used for inference."
+        f"Loaded {model_name} loaded in {time.perf_counter() - start:.2f} seconds. {config.whisper.inference_device}({config.whisper.compute_type}) will be used for inference."  # noqa: E501
     )
     loaded_models[model_name] = whisper
     return whisper

 def get_model(
     model_name: Annotated[str, Path(example="Systran/faster-distil-whisper-large-v3")],
 ) -> ModelObject:
-    models = list(
-        huggingface_hub.list_models(model_name=model_name, library="ctranslate2")
-    )
+    models = list(huggingface_hub.list_models(model_name=model_name, library="ctranslate2"))
     if len(models) == 0:
         raise HTTPException(status_code=404, detail="Model doesn't exists")
     exact_match: ModelInfo | None = None

     response_format: ResponseFormat,
 ) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse:
     segments = list(segments)
-    if response_format == ResponseFormat.TEXT:
+    if response_format == ResponseFormat.TEXT:  # noqa: RET503
         return utils.segments_text(segments)
     elif response_format == ResponseFormat.JSON:
         return TranscriptionJsonResponse.from_segments(segments)
     elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseJsonResponse.from_segments(
-            segments, transcription_info
-        )
+        return TranscriptionVerboseJsonResponse.from_segments(segments, transcription_info)
 
 
 def format_as_sse(data: str) -> str:

             if response_format == ResponseFormat.TEXT:
                 data = segment.text
             elif response_format == ResponseFormat.JSON:
-                data = TranscriptionJsonResponse.from_segments(
-                    [segment]
-                ).model_dump_json()
+                data = TranscriptionJsonResponse.from_segments([segment]).model_dump_json()
             elif response_format == ResponseFormat.VERBOSE_JSON:
-                data = TranscriptionVerboseJsonResponse.from_segment(
-                    segment, transcription_info
-                ).model_dump_json()
+                data = TranscriptionVerboseJsonResponse.from_segment(segment, transcription_info).model_dump_json()
             yield format_as_sse(data)
 
     return StreamingResponse(segment_responses(), media_type="text/event-stream")
 
 
 def handle_default_openai_model(model_name: str) -> str:
-    """This exists because some callers may not be able override the default("whisper-1") model name.
+    """Exists because some callers may not be able to override the default("whisper-1") model name.
+
     For example, https://github.com/open-webui/open-webui/issues/2248#issuecomment-2162997623.
     """
     if model_name == "whisper-1":
-        logger.info(
-            f"{model_name} is not a valid model name. Using {config.whisper.model} instead."
-        )
+        logger.info(f"{model_name} is not a valid model name. Using {config.whisper.model} instead.")
         return config.whisper.model
     return model_name
 

     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
     stream: Annotated[bool, Form()] = False,
-) -> (
-    str
-    | TranscriptionJsonResponse
-    | TranscriptionVerboseJsonResponse
-    | StreamingResponse
-):
+) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse | StreamingResponse:
     whisper = load_model(model)
     segments, transcription_info = whisper.transcribe(
         file.file,

     )
 
     if stream:
-        return segments_to_streaming_response(
-            segments, transcription_info, response_format
-        )
+        return segments_to_streaming_response(segments, transcription_info, response_format)
     else:
         return segments_to_response(segments, transcription_info, response_format)
 

     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
     timestamp_granularities: Annotated[
-        list[Literal["segment"] | Literal["word"]],
+        list[Literal["segment", "word"]],
         Form(alias="timestamp_granularities[]"),
     ] = ["segment"],
     stream: Annotated[bool, Form()] = False,
-) -> (
-    str
-    | TranscriptionJsonResponse
-    | TranscriptionVerboseJsonResponse
-    | StreamingResponse
-):
+) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse | StreamingResponse:
     whisper = load_model(model)
     segments, transcription_info = whisper.transcribe(
         file.file,

     )
 
     if stream:
-        return segments_to_streaming_response(
-            segments, transcription_info, response_format
-        )
+        return segments_to_streaming_response(segments, transcription_info, response_format)
     else:
         return segments_to_response(segments, transcription_info, response_format)
 

 async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
     try:
         while True:
-            bytes_ = await asyncio.wait_for(
-                ws.receive_bytes(), timeout=config.max_no_data_seconds
-            )
+            bytes_ = await asyncio.wait_for(ws.receive_bytes(), timeout=config.max_no_data_seconds)
             logger.debug(f"Received {len(bytes_)} bytes of audio data")
             audio_samples = audio_samples_from_file(BytesIO(bytes_))
             audio_stream.extend(audio_samples)
             if audio_stream.duration - config.inactivity_window_seconds >= 0:
-                audio = audio_stream.after(
-                    audio_stream.duration - config.inactivity_window_seconds
-                )
+                audio = audio_stream.after(audio_stream.duration - config.inactivity_window_seconds)
                 vad_opts = VadOptions(min_silence_duration_ms=500, speech_pad_ms=0)
                 # NOTE: This is a synchronous operation that runs every time new data is received.
-                # This shouldn't be an issue unless data is being received in tiny chunks or the user's machine is a potato.
+                # This shouldn't be an issue unless data is being received in tiny chunks or the user's machine is a potato.  # noqa: E501
                 timestamps = get_speech_timestamps(audio.data, vad_opts)
                 if len(timestamps) == 0:
-                    logger.info(
-                        f"No speech detected in the last {config.inactivity_window_seconds} seconds."
-                    )
+                    logger.info(f"No speech detected in the last {config.inactivity_window_seconds} seconds.")
                     break
                 elif (
                     # last speech end time
-                    config.inactivity_window_seconds
-                    - timestamps[-1]["end"] / SAMPLES_PER_SECOND
+                    config.inactivity_window_seconds - timestamps[-1]["end"] / SAMPLES_PER_SECOND
                     >= config.max_inactivity_seconds
                 ):
-                    logger.info(
-                        f"Not enough speech in the last {config.inactivity_window_seconds} seconds."
-                    )
+                    logger.info(f"Not enough speech in the last {config.inactivity_window_seconds} seconds.")
                     break
-    except asyncio.TimeoutError:
-        logger.info(
-            f"No data received in {config.max_no_data_seconds} seconds. Closing the connection."
-        )
+    except TimeoutError:
+        logger.info(f"No data received in {config.max_no_data_seconds} seconds. Closing the connection.")
     except WebSocketDisconnect as e:
         logger.info(f"Client disconnected: {e}")
     audio_stream.close()

     ws: WebSocket,
     model: Annotated[ModelName, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
-    response_format: Annotated[
-        ResponseFormat, Query()
-    ] = config.default_response_format,
+    response_format: Annotated[ResponseFormat, Query()] = config.default_response_format,
     temperature: Annotated[float, Query()] = 0.0,
 ) -> None:
     await ws.accept()

             if response_format == ResponseFormat.TEXT:
                 await ws.send_text(transcription.text)
             elif response_format == ResponseFormat.JSON:
-                await ws.send_json(
-                    TranscriptionJsonResponse.from_transcription(
-                        transcription
-                    ).model_dump()
-                )
+                await ws.send_json(TranscriptionJsonResponse.from_transcription(transcription).model_dump())
             elif response_format == ResponseFormat.VERBOSE_JSON:
-                await ws.send_json(
-                    TranscriptionVerboseJsonResponse.from_transcription(
-                        transcription
-                    ).model_dump()
-                )
+                await ws.send_json(TranscriptionVerboseJsonResponse.from_transcription(transcription).model_dump())
 
-    if not ws.client_state == WebSocketState.DISCONNECTED:
+    if ws.client_state != WebSocketState.DISCONNECTED:
         logger.info("Closing the connection.")
         await ws.close()
 

4e2de91

88f0467

faster_whisper_server/server_models.py

--- faster_whisper_server/server_models.py

+++ faster_whisper_server/server_models.py


 from __future__ import annotations
 
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
 
-from faster_whisper.transcribe import Segment, TranscriptionInfo, Word
 from pydantic import BaseModel, ConfigDict, Field
 
 from faster_whisper_server import utils
-from faster_whisper_server.core import Transcription
+
+if TYPE_CHECKING:
+    from faster_whisper.transcribe import Segment, TranscriptionInfo, Word
+
+    from faster_whisper_server.core import Transcription
 
 
 # https://platform.openai.com/docs/api-reference/audio/json-object

         return cls(text=utils.segments_text(segments))
 
     @classmethod
-    def from_transcription(
-        cls, transcription: Transcription
-    ) -> TranscriptionJsonResponse:
+    def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
         return cls(text=transcription.text)
 
 

     segments: list[SegmentObject]
 
     @classmethod
-    def from_segment(
-        cls, segment: Segment, transcription_info: TranscriptionInfo
-    ) -> TranscriptionVerboseJsonResponse:
+    def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
         return cls(
             language=transcription_info.language,
             duration=segment.end - segment.start,
             text=segment.text,
-            words=(
-                [WordObject.from_word(word) for word in segment.words]
-                if isinstance(segment.words, list)
-                else []
-            ),
+            words=([WordObject.from_word(word) for word in segment.words] if isinstance(segment.words, list) else []),
             segments=[SegmentObject.from_segment(segment)],
         )
 

             duration=transcription_info.duration,
             text=utils.segments_text(segments),
             segments=[SegmentObject.from_segment(segment) for segment in segments],
-            words=[
-                WordObject.from_word(word)
-                for word in utils.words_from_segments(segments)
-            ],
+            words=[WordObject.from_word(word) for word in utils.words_from_segments(segments)],
         )
 
     @classmethod
-    def from_transcription(
-        cls, transcription: Transcription
-    ) -> TranscriptionVerboseJsonResponse:
+    def from_transcription(cls, transcription: Transcription) -> TranscriptionVerboseJsonResponse:
         return cls(
             language="english",  # FIX: hardcoded
             duration=transcription.duration,

4e2de91

88f0467

faster_whisper_server/transcriber.py

--- faster_whisper_server/transcriber.py

+++ faster_whisper_server/transcriber.py


 from __future__ import annotations
 
-from typing import AsyncGenerator
+from typing import TYPE_CHECKING
 
-from faster_whisper_server.asr import FasterWhisperASR
 from faster_whisper_server.audio import Audio, AudioStream
 from faster_whisper_server.config import config
 from faster_whisper_server.core import (

 )
 from faster_whisper_server.logger import logger
 
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from faster_whisper_server.asr import FasterWhisperASR
+
 
 class LocalAgreement:
     def __init__(self) -> None:

4e2de91

88f0467

pyproject.toml

--- pyproject.toml

+++ pyproject.toml


 [tool.ruff.lint]
 select = ["ALL"]
 ignore = [
-    "D10",  # disabled required docstrings
+    "FIX",
+    "TD", # disable todo warnings
     "ERA",  # allow commented out code
-    "TD", # disable TODO warnings
-    "FIX002", # disable TODO warnings
+    "PTH",
 
-    "COM812", # trailing comma
-    "T201", # print
-    "S101", # allow assert
-    "PTH123", # Path.open
-    "S603", # subprocess untrusted input
-
+    "ANN003", # missing kwargs
     "ANN101", # missing self type
+    "ANN102", # missing cls
+    "B006",
+    "B008",
+    "COM812", # trailing comma
+    "D10",  # disabled required docstrings
+    "D401",
+    "EM102",
+    "FBT001",
+    "FBT002",
+    "PLR0913",
+    "PLR2004", # magic
+    "RET504",
+    "RET505",
+    "RET508",
+    "S101", # allow assert
+    "S104",
+    "S603", # subprocess untrusted input
+    "SIM102",
+    "T201", # print
+    "TRY003",
+    "W505",
+    "ISC001" # recommended to disable for formatting
 ]
 
 [tool.ruff.lint.isort]

4e2de91

88f0467

tests/api_model_test.py

--- tests/api_model_test.py

+++ tests/api_model_test.py


 
 MODEL_THAT_EXISTS = "Systran/faster-whisper-tiny.en"
 MODEL_THAT_DOES_NOT_EXIST = "i-do-not-exist"
-MIN_EXPECTED_NUMBER_OF_MODELS = (
-    200  # At the time of the test creation there are 228 models
-)
+MIN_EXPECTED_NUMBER_OF_MODELS = 200  # At the time of the test creation there are 228 models
 
 
 # HACK: because ModelObject(**data) doesn't work

     )
 
 
-def test_list_models(client: TestClient):
+def test_list_models(client: TestClient) -> None:
     response = client.get("/v1/models")
     data = response.json()
     models = [model_dict_to_object(model_dict) for model_dict in data]
     assert len(models) > MIN_EXPECTED_NUMBER_OF_MODELS
 
 
-def test_model_exists(client: TestClient):
+def test_model_exists(client: TestClient) -> None:
     response = client.get(f"/v1/models/{MODEL_THAT_EXISTS}")
     data = response.json()
     model = model_dict_to_object(data)
     assert model.id == MODEL_THAT_EXISTS
 
 
-def test_model_does_not_exist(client: TestClient):
+def test_model_does_not_exist(client: TestClient) -> None:
     response = client.get(f"/v1/models/{MODEL_THAT_DOES_NOT_EXIST}")
     assert response.status_code == 404

4e2de91

88f0467

tests/app_test.py

--- tests/app_test.py

+++ tests/app_test.py


+from collections.abc import Generator
 import json
 import os
 import time
-from typing import Generator
 
-import pytest
 from fastapi.testclient import TestClient
+import pytest
 from starlette.testclient import WebSocketTestSession
 
 from faster_whisper_server.config import BYTES_PER_SECOND

         yield ws
 
 
-def get_audio_file_paths():
-    file_paths = []
+def get_audio_file_paths() -> list[str]:
+    file_paths: list[str] = []
     directory = "tests/data"
     for filename in sorted(os.listdir(directory)[:AUDIO_FILES_LIMIT]):
-        file_paths.append(os.path.join(directory, filename))
+        file_paths.append(os.path.join(directory, filename))  # noqa: PERF401
     return file_paths
 
 
 file_paths = get_audio_file_paths()
 
 
-def stream_audio_data(
-    ws: WebSocketTestSession, data: bytes, *, chunk_size: int = 4000, speed: float = 1.0
-):
+def stream_audio_data(ws: WebSocketTestSession, data: bytes, *, chunk_size: int = 4000, speed: float = 1.0) -> None:
     for i in range(0, len(data), chunk_size):
         ws.send_bytes(data[i : i + chunk_size])
         delay = len(data[i : i + chunk_size]) / BYTES_PER_SECOND / speed
         time.sleep(delay)
 
 
-def transcribe_audio_data(
-    client: TestClient, data: bytes
-) -> TranscriptionVerboseJsonResponse:
+def transcribe_audio_data(client: TestClient, data: bytes) -> TranscriptionVerboseJsonResponse:
     response = client.post(
         TRANSCRIBE_ENDPOINT,
         files={"file": ("audio.raw", data, "audio/raw")},
     )
     data = json.loads(response.json())  # TODO: figure this out
-    return TranscriptionVerboseJsonResponse(**data)  # type: ignore
+    return TranscriptionVerboseJsonResponse(**data)  # pyright: ignore[reportCallIssue]
 
 
 # @pytest.mark.parametrize("file_path", file_paths)

 #     with open(file_path, "rb") as file:
 #         data = file.read()
 #
-#     streaming_transcription: TranscriptionVerboseJsonResponse = None  # type: ignore
+#     streaming_transcription: TranscriptionVerboseJsonResponse = None  # type: ignore  # noqa: PGH003
 #     thread = threading.Thread(
 #         target=stream_audio_data, args=(ws, data), kwargs={"speed": 4.0}
 #     )

4e2de91

88f0467

tests/conftest.py

--- tests/conftest.py

+++ tests/conftest.py


+from collections.abc import Generator
 import logging
-import os
-from typing import Generator
 
-import pytest
 from fastapi.testclient import TestClient
+import pytest
 
-# HACK
-os.environ["WHISPER_MODEL"] = "Systran/faster-whisper-tiny.en"
-from faster_whisper_server.main import app  # noqa: E402
+from faster_whisper_server.main import app
 
 disable_loggers = ["multipart.multipart", "faster_whisper"]
 
 
-def pytest_configure():
+def pytest_configure() -> None:
     for logger_name in disable_loggers:
         logger = logging.getLogger(logger_name)
         logger.disabled = True

4e2de91

88f0467

tests/sse_test.py

--- tests/sse_test.py

+++ tests/sse_test.py


 import json
 import os
 
-import pytest
 from fastapi.testclient import TestClient
 from httpx_sse import connect_sse
+import pytest
 
 from faster_whisper_server.server_models import (
     TranscriptionJsonResponse,

 ]
 
 
-parameters = [
-    (file_path, endpoint) for endpoint in ENDPOINTS for file_path in FILE_PATHS
-]
+parameters = [(file_path, endpoint) for endpoint in ENDPOINTS for file_path in FILE_PATHS]
 
 
-@pytest.mark.parametrize("file_path,endpoint", parameters)
-def test_streaming_transcription_text(
-    client: TestClient, file_path: str, endpoint: str
-):
+@pytest.mark.parametrize(("file_path", "endpoint"), parameters)
+def test_streaming_transcription_text(client: TestClient, file_path: str, endpoint: str) -> None:
     extension = os.path.splitext(file_path)[1]
     with open(file_path, "rb") as f:
         data = f.read()

     with connect_sse(client, "POST", endpoint, **kwargs) as event_source:
         for event in event_source.iter_sse():
             print(event)
-            assert (
-                len(event.data) > 1
-            )  # HACK: 1 because of the space character that's always prepended
+            assert len(event.data) > 1  # HACK: 1 because of the space character that's always prepended
 
 
-@pytest.mark.parametrize("file_path,endpoint", parameters)
-def test_streaming_transcription_json(
-    client: TestClient, file_path: str, endpoint: str
-):
+@pytest.mark.parametrize(("file_path", "endpoint"), parameters)
+def test_streaming_transcription_json(client: TestClient, file_path: str, endpoint: str) -> None:
     extension = os.path.splitext(file_path)[1]
     with open(file_path, "rb") as f:
         data = f.read()

             TranscriptionJsonResponse(**json.loads(event.data))
 
 
-@pytest.mark.parametrize("file_path,endpoint", parameters)
-def test_streaming_transcription_verbose_json(
-    client: TestClient, file_path: str, endpoint: str
-):
+@pytest.mark.parametrize(("file_path", "endpoint"), parameters)
+def test_streaming_transcription_verbose_json(client: TestClient, file_path: str, endpoint: str) -> None:
     extension = os.path.splitext(file_path)[1]
     with open(file_path, "rb") as f:
         data = f.read()

Add a comment

Open 0
Closed 0

List

...	...	@@ -15,7 +15,7 @@
15	15	TEXT = "text"
16	16	JSON = "json"
17	17	VERBOSE_JSON = "verbose_json"
18		- # NOTE: While inspecting outputs of these formats with `curl`, I noticed there's one or two "\n" inserted at the end of the response.
	18	+ # NOTE: While inspecting outputs of these formats with `curl`, I noticed there's one or two "\n" inserted at the end of the response. # noqa: E501
19	19
20	20	# VTT = "vtt" # TODO
21	21	# 1
...	...	@@ -185,8 +185,8 @@
185	185
186	186
187	187	class Config(BaseSettings):
188		- """
189		- Configuration for the application. Values can be set via environment variables.
	188	+ """Configuration for the application. Values can be set via environment variables.
	189	+
190	190	Pydantic will automatically handle mapping uppercased environment variables to the corresponding fields.
191	191	To populate nested, the environment should be prefixed with the nested field name and an underscore. For example,
192	192	the environment variable `LOG_LEVEL` will be mapped to `log_level`, `WHISPER_MODEL` to `whisper.model`, etc.
...	...	@@ -208,7 +208,7 @@
208	208	max_inactivity_seconds: float = 5.0
209	209	"""
210	210	Max allowed audio duration without any speech being detected before transcription is finilized and connection is closed.
211		- """
	211	+ """ # noqa: E501
212	212	inactivity_window_seconds: float = 10.0
213	213	"""
214	214	Controls how many latest seconds of audio are being passed through VAD.

...	...	@@ -1,6 +1,6 @@
1	1	import asyncio
	2	+from collections.abc import Iterable
2	3	import time
3		-from typing import Iterable
4	4
5	5	from faster_whisper import transcribe
6	6
...	...	@@ -45,7 +45,7 @@
45	45	audio: Audio,
46	46	prompt: str \| None = None,
47	47	) -> tuple[Transcription, transcribe.TranscriptionInfo]:
48		- """Wrapper around _transcribe so it can be used in async context"""
	48	+ """Wrapper around _transcribe so it can be used in async context."""
49	49	# is this the optimal way to execute a blocking call in an async context?
50	50	# TODO: verify performance when running inference on a CPU
51	51	return await asyncio.get_running_loop().run_in_executor(

...	...	@@ -1,14 +1,18 @@
1	1	from __future__ import annotations
2	2
3	3	import asyncio
4		-from typing import AsyncGenerator, BinaryIO
	4	+from typing import TYPE_CHECKING, BinaryIO
5	5
6	6	import numpy as np
7	7	import soundfile as sf
8		-from numpy.typing import NDArray
9	8
10	9	from faster_whisper_server.config import SAMPLES_PER_SECOND
11	10	from faster_whisper_server.logger import logger
	11	+
	12	+if TYPE_CHECKING:
	13	+ from collections.abc import AsyncGenerator
	14	+
	15	+ from numpy.typing import NDArray
12	16
13	17
14	18	def audio_samples_from_file(file: BinaryIO) -> NDArray[np.float32]:
...	...	@@ -22,7 +26,7 @@
22	26	endian="LITTLE",
23	27	)
24	28	audio = audio_and_sample_rate[0]
25		- return audio # type: ignore
	29	+ return audio # pyright: ignore[reportReturnType]
26	30
27	31
28	32	class Audio:
...	...	@@ -78,9 +82,7 @@
78	82	self.modify_event.set()
79	83	logger.info("AudioStream closed")
80	84
81		- async def chunks(
82		- self, min_duration: float
83		- ) -> AsyncGenerator[NDArray[np.float32], None]:
	85	+ async def chunks(self, min_duration: float) -> AsyncGenerator[NDArray[np.float32], None]:
84	86	i = 0.0 # end time of last chunk
85	87	while True:
86	88	await self.modify_event.wait()

...	...	@@ -1,8 +1,8 @@
1	1	# TODO: rename module
2	2	from __future__ import annotations
3	3
4		-import re
5	4	from dataclasses import dataclass
	5	+import re
6	6
7	7	from faster_whisper_server.config import config
8	8
...	...	@@ -18,10 +18,7 @@
18	18	def is_eos(self) -> bool:
19	19	if self.text.endswith("..."):
20	20	return False
21		- for punctuation_symbol in ".?!":
22		- if self.text.endswith(punctuation_symbol):
23		- return True
24		- return False
	21	+ return any(self.text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
25	22
26	23	def offset(self, seconds: float) -> None:
27	24	self.start += seconds
...	...	@@ -36,11 +33,7 @@
36	33	@classmethod
37	34	def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
38	35	i = 0
39		- while (
40		- i < len(a)
41		- and i < len(b)
42		- and canonicalize_word(a[i].text) == canonicalize_word(b[i].text)
43		- ):
	36	+ while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
44	37	i += 1
45	38	return a[:i]
46	39
...	...	@@ -67,9 +60,7 @@
67	60	return self.end - self.start
68	61
69	62	def after(self, seconds: float) -> Transcription:
70		- return Transcription(
71		- words=[word for word in self.words if word.start > seconds]
72		- )
	63	+ return Transcription(words=[word for word in self.words if word.start > seconds])
73	64
74	65	def extend(self, words: list[Word]) -> None:
75	66	self._ensure_no_word_overlap(words)
...	...	@@ -77,21 +68,16 @@
77	68
78	69	def _ensure_no_word_overlap(self, words: list[Word]) -> None:
79	70	if len(self.words) > 0 and len(words) > 0:
80		- if (
81		- words[0].start + config.word_timestamp_error_margin
82		- <= self.words[-1].end
83		- ):
	71	+ if words[0].start + config.word_timestamp_error_margin <= self.words[-1].end:
84	72	raise ValueError(
85		- f"Words overlap: {self.words[-1]} and {words[0]}. Error margin: {config.word_timestamp_error_margin}"
	73	+ f"Words overlap: {self.words[-1]} and {words[0]}. Error margin: {config.word_timestamp_error_margin}" # noqa: E501
86	74	)
87	75	for i in range(1, len(words)):
88	76	if words[i].start + config.word_timestamp_error_margin <= words[i - 1].end:
89		- raise ValueError(
90		- f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}"
91		- )
	77	+ raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
92	78
93	79
94		-def test_segment_is_eos():
	80	+def test_segment_is_eos() -> None:
95	81	assert not Segment("Hello").is_eos
96	82	assert not Segment("Hello...").is_eos
97	83	assert Segment("Hello.").is_eos
...	...	@@ -117,16 +103,14 @@
117	103	return sentences
118	104
119	105
120		-def tests_to_full_sentences():
	106	+def tests_to_full_sentences() -> None:
121	107	assert to_full_sentences([]) == []
122	108	assert to_full_sentences([Word(text="Hello")]) == []
123	109	assert to_full_sentences([Word(text="Hello..."), Word(" world")]) == []
124		- assert to_full_sentences([Word(text="Hello..."), Word(" world.")]) == [
	110	+ assert to_full_sentences([Word(text="Hello..."), Word(" world.")]) == [Segment(text="Hello... world.")]
	111	+ assert to_full_sentences([Word(text="Hello..."), Word(" world."), Word(" How")]) == [
125	112	Segment(text="Hello... world.")
126	113	]
127		- assert to_full_sentences(
128		- [Word(text="Hello..."), Word(" world."), Word(" How")]
129		- ) == [Segment(text="Hello... world.")]
130	114
131	115
132	116	def to_text(words: list[Word]) -> str:
...	...	@@ -144,7 +128,7 @@
144	128	return text.lower().strip().strip(".,?!")
145	129
146	130
147		-def test_canonicalize_word():
	131	+def test_canonicalize_word() -> None:
148	132	assert canonicalize_word("ABC") == "abc"
149	133	assert canonicalize_word("...ABC?") == "abc"
150	134	assert canonicalize_word("... AbC ...") == "abc"
...	...	@@ -152,16 +136,12 @@
152	136
153	137	def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
154	138	i = 0
155		- while (
156		- i < len(a)
157		- and i < len(b)
158		- and canonicalize_word(a[i].text) == canonicalize_word(b[i].text)
159		- ):
	139	+ while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
160	140	i += 1
161	141	return a[:i]
162	142
163	143
164		-def test_common_prefix():
	144	+def test_common_prefix() -> None:
165	145	def word(text: str) -> Word:
166	146	return Word(text=text, start=0.0, end=0.0, probability=0.0)
167	147
...	...	@@ -194,7 +174,7 @@
194	174	assert common_prefix(a, b) == []
195	175
196	176
197		-def test_common_prefix_and_canonicalization():
	177	+def test_common_prefix_and_canonicalization() -> None:
198	178	def word(text: str) -> Word:
199	179	return Word(text=text, start=0.0, end=0.0, probability=0.0)
200	180

...	...	@@ -1,5 +1,5 @@
	1	+from collections.abc import Generator
1	2	import os
2		-from typing import Generator
3	3
4	4	import gradio as gr
5	5	import httpx
...	...	@@ -13,26 +13,20 @@
13	13
14	14	def create_gradio_demo(config: Config) -> gr.Blocks:
15	15	host = os.getenv("UVICORN_HOST", "0.0.0.0")
16		- port = os.getenv("UVICORN_PORT", 8000)
	16	+ port = int(os.getenv("UVICORN_PORT", "8000"))
17	17	# NOTE: worth looking into generated clients
18	18	http_client = httpx.Client(base_url=f"http://{host}:{port}", timeout=None)
19	19
20		- def handler(
21		- file_path: str, model: str, task: Task, temperature: float, stream: bool
22		- ) -> Generator[str, None, None]:
	20	+ def handler(file_path: str, model: str, task: Task, temperature: float, stream: bool) -> Generator[str, None, None]:
23	21	if stream:
24	22	previous_transcription = ""
25		- for transcription in transcribe_audio_streaming(
26		- file_path, task, temperature, model
27		- ):
	23	+ for transcription in transcribe_audio_streaming(file_path, task, temperature, model):
28	24	previous_transcription += transcription
29	25	yield previous_transcription
30	26	else:
31	27	yield transcribe_audio(file_path, task, temperature, model)
32	28
33		- def transcribe_audio(
34		- file_path: str, task: Task, temperature: float, model: str
35		- ) -> str:
	29	+ def transcribe_audio(file_path: str, task: Task, temperature: float, model: str) -> str:
36	30	if task == Task.TRANSCRIBE:
37	31	endpoint = TRANSCRIPTION_ENDPOINT
38	32	elif task == Task.TRANSLATE:
...	...	@@ -65,11 +59,7 @@
65	59	"stream": True,
66	60	},
67	61	}
68		- endpoint = (
69		- TRANSCRIPTION_ENDPOINT
70		- if task == Task.TRANSCRIBE
71		- else TRANSLATION_ENDPOINT
72		- )
	62	+ endpoint = TRANSCRIPTION_ENDPOINT if task == Task.TRANSCRIBE else TRANSLATION_ENDPOINT
73	63	with connect_sse(http_client, "POST", endpoint, **kwargs) as event_source:
74	64	for event in event_source.iter_sse():
75	65	yield event.data
...	...	@@ -79,18 +69,15 @@
79	69	res_data = res.json()
80	70	models: list[str] = [model["id"] for model in res_data]
81	71	assert config.whisper.model in models
82		- recommended_models = set(
83		- model for model in models if model.startswith("Systran")
84		- )
	72	+ recommended_models = {model for model in models if model.startswith("Systran")}
85	73	other_models = [model for model in models if model not in recommended_models]
86	74	models = list(recommended_models) + other_models
87		- model_dropdown = gr.Dropdown(
	75	+ return gr.Dropdown(
88	76	# no idea why it's complaining
89		- choices=models, # type: ignore
	77	+ choices=models, # pyright: ignore[reportArgumentType]
90	78	label="Model",
91	79	value=config.whisper.model,
92	80	)
93		- return model_dropdown
94	81
95	82	model_dropdown = gr.Dropdown(
96	83	choices=[config.whisper.model],
...	...	@@ -102,13 +89,11 @@
102	89	label="Task",
103	90	value=Task.TRANSCRIBE,
104	91	)
105		- temperature_slider = gr.Slider(
106		- minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0
107		- )
	92	+ temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0)
108	93	stream_checkbox = gr.Checkbox(label="Stream", value=True)
109	94	with gr.Interface(
110	95	title="Whisper Playground",
111		- description="""Consider supporting the project by starring the <a href="https://github.com/fedirz/faster-whisper-server">repository on GitHub</a>.""",
	96	+ description="""Consider supporting the project by starring the <a href="https://github.com/fedirz/faster-whisper-server">repository on GitHub</a>.""", # noqa: E501
112	97	inputs=[
113	98	gr.Audio(type="filepath"),
114	99	model_dropdown,

...	...	@@ -8,6 +8,4 @@
8	8	root_logger.setLevel(logging.CRITICAL)
9	9	logger = logging.getLogger(__name__)
10	10	logger.setLevel(config.log_level.upper())
11		-logging.basicConfig(
12		- format="%(asctime)s:%(levelname)s:%(name)s:%(funcName)s:%(message)s"
13		-)
	11	+logging.basicConfig(format="%(asctime)s:%(levelname)s:%(name)s:%(funcName)s:%(message)s")

...	...	@@ -1,12 +1,11 @@
1	1	from __future__ import annotations
2	2
3	3	import asyncio
4		-import time
	4	+from collections import OrderedDict
5	5	from io import BytesIO
6		-from typing import Annotated, Generator, Iterable, Literal, OrderedDict
	6	+import time
	7	+from typing import TYPE_CHECKING, Annotated, Literal
7	8
8		-import gradio as gr
9		-import huggingface_hub
10	9	from fastapi import (
11	10	FastAPI,
12	11	Form,
...	...	@@ -21,9 +20,9 @@
21	20	from fastapi.responses import StreamingResponse
22	21	from fastapi.websockets import WebSocketState
23	22	from faster_whisper import WhisperModel
24		-from faster_whisper.transcribe import Segment, TranscriptionInfo
25	23	from faster_whisper.vad import VadOptions, get_speech_timestamps
26		-from huggingface_hub.hf_api import ModelInfo
	24	+import gradio as gr
	25	+import huggingface_hub
27	26	from pydantic import AfterValidator
28	27
29	28	from faster_whisper_server import utils
...	...	@@ -45,6 +44,12 @@
45	44	)
46	45	from faster_whisper_server.transcriber import audio_transcriber
47	46
	47	+if TYPE_CHECKING:
	48	+ from collections.abc import Generator, Iterable
	49	+
	50	+ from faster_whisper.transcribe import Segment, TranscriptionInfo
	51	+ from huggingface_hub.hf_api import ModelInfo
	52	+
48	53	loaded_models: OrderedDict[str, WhisperModel] = OrderedDict()
49	54
50	55
...	...	@@ -54,9 +59,7 @@
54	59	return loaded_models[model_name]
55	60	if len(loaded_models) >= config.max_models:
56	61	oldest_model_name = next(iter(loaded_models))
57		- logger.info(
58		- f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
59		- )
	62	+ logger.info(f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}")
60	63	del loaded_models[oldest_model_name]
61	64	logger.debug(f"Loading {model_name}...")
62	65	start = time.perf_counter()
...	...	@@ -67,7 +70,7 @@
67	70	compute_type=config.whisper.compute_type,
68	71	)
69	72	logger.info(
70		- f"Loaded {model_name} loaded in {time.perf_counter() - start:.2f} seconds. {config.whisper.inference_device}({config.whisper.compute_type}) will be used for inference."
	73	+ f"Loaded {model_name} loaded in {time.perf_counter() - start:.2f} seconds. {config.whisper.inference_device}({config.whisper.compute_type}) will be used for inference." # noqa: E501
71	74	)
72	75	loaded_models[model_name] = whisper
73	76	return whisper
...	...	@@ -102,9 +105,7 @@
102	105	def get_model(
103	106	model_name: Annotated[str, Path(example="Systran/faster-distil-whisper-large-v3")],
104	107	) -> ModelObject:
105		- models = list(
106		- huggingface_hub.list_models(model_name=model_name, library="ctranslate2")
107		- )
	108	+ models = list(huggingface_hub.list_models(model_name=model_name, library="ctranslate2"))
108	109	if len(models) == 0:
109	110	raise HTTPException(status_code=404, detail="Model doesn't exists")
110	111	exact_match: ModelInfo \| None = None
...	...	@@ -132,14 +133,12 @@
132	133	response_format: ResponseFormat,
133	134	) -> str \| TranscriptionJsonResponse \| TranscriptionVerboseJsonResponse:
134	135	segments = list(segments)
135		- if response_format == ResponseFormat.TEXT:
	136	+ if response_format == ResponseFormat.TEXT: # noqa: RET503
136	137	return utils.segments_text(segments)
137	138	elif response_format == ResponseFormat.JSON:
138	139	return TranscriptionJsonResponse.from_segments(segments)
139	140	elif response_format == ResponseFormat.VERBOSE_JSON:
140		- return TranscriptionVerboseJsonResponse.from_segments(
141		- segments, transcription_info
142		- )
	141	+ return TranscriptionVerboseJsonResponse.from_segments(segments, transcription_info)
143	142
144	143
145	144	def format_as_sse(data: str) -> str:
...	...	@@ -156,26 +155,21 @@
156	155	if response_format == ResponseFormat.TEXT:
157	156	data = segment.text
158	157	elif response_format == ResponseFormat.JSON:
159		- data = TranscriptionJsonResponse.from_segments(
160		- [segment]
161		- ).model_dump_json()
	158	+ data = TranscriptionJsonResponse.from_segments([segment]).model_dump_json()
162	159	elif response_format == ResponseFormat.VERBOSE_JSON:
163		- data = TranscriptionVerboseJsonResponse.from_segment(
164		- segment, transcription_info
165		- ).model_dump_json()
	160	+ data = TranscriptionVerboseJsonResponse.from_segment(segment, transcription_info).model_dump_json()
166	161	yield format_as_sse(data)
167	162
168	163	return StreamingResponse(segment_responses(), media_type="text/event-stream")
169	164
170	165
171	166	def handle_default_openai_model(model_name: str) -> str:
172		- """This exists because some callers may not be able override the default("whisper-1") model name.
	167	+ """Exists because some callers may not be able override the default("whisper-1") model name.
	168	+
173	169	For example, https://github.com/open-webui/open-webui/issues/2248#issuecomment-2162997623.
174	170	"""
175	171	if model_name == "whisper-1":
176		- logger.info(
177		- f"{model_name} is not a valid model name. Using {config.whisper.model} instead."
178		- )
	172	+ logger.info(f"{model_name} is not a valid model name. Using {config.whisper.model} instead.")
179	173	return config.whisper.model
180	174	return model_name
181	175
...	...	@@ -194,12 +188,7 @@
194	188	response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
195	189	temperature: Annotated[float, Form()] = 0.0,
196	190	stream: Annotated[bool, Form()] = False,
197		-) -> (
198		- str
199		- \| TranscriptionJsonResponse
200		- \| TranscriptionVerboseJsonResponse
201		- \| StreamingResponse
202		-):
	191	+) -> str \| TranscriptionJsonResponse \| TranscriptionVerboseJsonResponse \| StreamingResponse:
203	192	whisper = load_model(model)
204	193	segments, transcription_info = whisper.transcribe(
205	194	file.file,
...	...	@@ -210,9 +199,7 @@
210	199	)
211	200
212	201	if stream:
213		- return segments_to_streaming_response(
214		- segments, transcription_info, response_format
215		- )
	202	+ return segments_to_streaming_response(segments, transcription_info, response_format)
216	203	else:
217	204	return segments_to_response(segments, transcription_info, response_format)
218	205
...	...	@@ -231,16 +218,11 @@
231	218	response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
232	219	temperature: Annotated[float, Form()] = 0.0,
233	220	timestamp_granularities: Annotated[
234		- list[Literal["segment"] \| Literal["word"]],
	221	+ list[Literal["segment", "word"]],
235	222	Form(alias="timestamp_granularities[]"),
236	223	] = ["segment"],
237	224	stream: Annotated[bool, Form()] = False,
238		-) -> (
239		- str
240		- \| TranscriptionJsonResponse
241		- \| TranscriptionVerboseJsonResponse
242		- \| StreamingResponse
243		-):
	225	+) -> str \| TranscriptionJsonResponse \| TranscriptionVerboseJsonResponse \| StreamingResponse:
244	226	whisper = load_model(model)
245	227	segments, transcription_info = whisper.transcribe(
246	228	file.file,
...	...	@@ -253,9 +235,7 @@
253	235	)
254	236
255	237	if stream:
256		- return segments_to_streaming_response(
257		- segments, transcription_info, response_format
258		- )
	238	+ return segments_to_streaming_response(segments, transcription_info, response_format)
259	239	else:
260	240	return segments_to_response(segments, transcription_info, response_format)
261	241
...	...	@@ -263,39 +243,28 @@
263	243	async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
264	244	try:
265	245	while True:
266		- bytes_ = await asyncio.wait_for(
267		- ws.receive_bytes(), timeout=config.max_no_data_seconds
268		- )
	246	+ bytes_ = await asyncio.wait_for(ws.receive_bytes(), timeout=config.max_no_data_seconds)
269	247	logger.debug(f"Received {len(bytes_)} bytes of audio data")
270	248	audio_samples = audio_samples_from_file(BytesIO(bytes_))
271	249	audio_stream.extend(audio_samples)
272	250	if audio_stream.duration - config.inactivity_window_seconds >= 0:
273		- audio = audio_stream.after(
274		- audio_stream.duration - config.inactivity_window_seconds
275		- )
	251	+ audio = audio_stream.after(audio_stream.duration - config.inactivity_window_seconds)
276	252	vad_opts = VadOptions(min_silence_duration_ms=500, speech_pad_ms=0)
277	253	# NOTE: This is a synchronous operation that runs every time new data is received.
278		- # This shouldn't be an issue unless data is being received in tiny chunks or the user's machine is a potato.
	254	+ # This shouldn't be an issue unless data is being received in tiny chunks or the user's machine is a potato. # noqa: E501
279	255	timestamps = get_speech_timestamps(audio.data, vad_opts)
280	256	if len(timestamps) == 0:
281		- logger.info(
282		- f"No speech detected in the last {config.inactivity_window_seconds} seconds."
283		- )
	257	+ logger.info(f"No speech detected in the last {config.inactivity_window_seconds} seconds.")
284	258	break
285	259	elif (
286	260	# last speech end time
287		- config.inactivity_window_seconds
288		- - timestamps[-1]["end"] / SAMPLES_PER_SECOND
	261	+ config.inactivity_window_seconds - timestamps[-1]["end"] / SAMPLES_PER_SECOND
289	262	>= config.max_inactivity_seconds
290	263	):
291		- logger.info(
292		- f"Not enough speech in the last {config.inactivity_window_seconds} seconds."
293		- )
	264	+ logger.info(f"Not enough speech in the last {config.inactivity_window_seconds} seconds.")
294	265	break
295		- except asyncio.TimeoutError:
296		- logger.info(
297		- f"No data received in {config.max_no_data_seconds} seconds. Closing the connection."
298		- )
	266	+ except TimeoutError:
	267	+ logger.info(f"No data received in {config.max_no_data_seconds} seconds. Closing the connection.")
299	268	except WebSocketDisconnect as e:
300	269	logger.info(f"Client disconnected: {e}")
301	270	audio_stream.close()
...	...	@@ -306,9 +275,7 @@
306	275	ws: WebSocket,
307	276	model: Annotated[ModelName, Query()] = config.whisper.model,
308	277	language: Annotated[Language \| None, Query()] = config.default_language,
309		- response_format: Annotated[
310		- ResponseFormat, Query()
311		- ] = config.default_response_format,
	278	+ response_format: Annotated[ResponseFormat, Query()] = config.default_response_format,
312	279	temperature: Annotated[float, Query()] = 0.0,
313	280	) -> None:
314	281	await ws.accept()
...	...	@@ -331,19 +298,11 @@
331	298	if response_format == ResponseFormat.TEXT:
332	299	await ws.send_text(transcription.text)
333	300	elif response_format == ResponseFormat.JSON:
334		- await ws.send_json(
335		- TranscriptionJsonResponse.from_transcription(
336		- transcription
337		- ).model_dump()
338		- )
	301	+ await ws.send_json(TranscriptionJsonResponse.from_transcription(transcription).model_dump())
339	302	elif response_format == ResponseFormat.VERBOSE_JSON:
340		- await ws.send_json(
341		- TranscriptionVerboseJsonResponse.from_transcription(
342		- transcription
343		- ).model_dump()
344		- )
	303	+ await ws.send_json(TranscriptionVerboseJsonResponse.from_transcription(transcription).model_dump())
345	304
346		- if not ws.client_state == WebSocketState.DISCONNECTED:
	305	+ if ws.client_state != WebSocketState.DISCONNECTED:
347	306	logger.info("Closing the connection.")
348	307	await ws.close()
349	308

...	...	@@ -1,12 +1,15 @@
1	1	from __future__ import annotations
2	2
3		-from typing import Literal
	3	+from typing import TYPE_CHECKING, Literal
4	4
5		-from faster_whisper.transcribe import Segment, TranscriptionInfo, Word
6	5	from pydantic import BaseModel, ConfigDict, Field
7	6
8	7	from faster_whisper_server import utils
9		-from faster_whisper_server.core import Transcription
	8	+
	9	+if TYPE_CHECKING:
	10	+ from faster_whisper.transcribe import Segment, TranscriptionInfo, Word
	11	+
	12	+ from faster_whisper_server.core import Transcription
10	13
11	14
12	15	# https://platform.openai.com/docs/api-reference/audio/json-object
...	...	@@ -18,9 +21,7 @@
18	21	return cls(text=utils.segments_text(segments))
19	22
20	23	@classmethod
21		- def from_transcription(
22		- cls, transcription: Transcription
23		- ) -> TranscriptionJsonResponse:
	24	+ def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
24	25	return cls(text=transcription.text)
25	26
26	27
...	...	@@ -78,18 +79,12 @@
78	79	segments: list[SegmentObject]
79	80
80	81	@classmethod
81		- def from_segment(
82		- cls, segment: Segment, transcription_info: TranscriptionInfo
83		- ) -> TranscriptionVerboseJsonResponse:
	82	+ def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
84	83	return cls(
85	84	language=transcription_info.language,
86	85	duration=segment.end - segment.start,
87	86	text=segment.text,
88		- words=(
89		- [WordObject.from_word(word) for word in segment.words]
90		- if isinstance(segment.words, list)
91		- else []
92		- ),
	87	+ words=([WordObject.from_word(word) for word in segment.words] if isinstance(segment.words, list) else []),
93	88	segments=[SegmentObject.from_segment(segment)],
94	89	)
95	90
...	...	@@ -102,16 +97,11 @@
102	97	duration=transcription_info.duration,
103	98	text=utils.segments_text(segments),
104	99	segments=[SegmentObject.from_segment(segment) for segment in segments],
105		- words=[
106		- WordObject.from_word(word)
107		- for word in utils.words_from_segments(segments)
108		- ],
	100	+ words=[WordObject.from_word(word) for word in utils.words_from_segments(segments)],
109	101	)
110	102
111	103	@classmethod
112		- def from_transcription(
113		- cls, transcription: Transcription
114		- ) -> TranscriptionVerboseJsonResponse:
	104	+ def from_transcription(cls, transcription: Transcription) -> TranscriptionVerboseJsonResponse:
115	105	return cls(
116	106	language="english", # FIX: hardcoded
117	107	duration=transcription.duration,

...	...	@@ -1,8 +1,7 @@
1	1	from __future__ import annotations
2	2
3		-from typing import AsyncGenerator
	3	+from typing import TYPE_CHECKING
4	4
5		-from faster_whisper_server.asr import FasterWhisperASR
6	5	from faster_whisper_server.audio import Audio, AudioStream
7	6	from faster_whisper_server.config import config
8	7	from faster_whisper_server.core import (
...	...	@@ -13,6 +12,11 @@
13	12	)
14	13	from faster_whisper_server.logger import logger
15	14
	15	+if TYPE_CHECKING:
	16	+ from collections.abc import AsyncGenerator
	17	+
	18	+ from faster_whisper_server.asr import FasterWhisperASR
	19	+
16	20
17	21	class LocalAgreement:
18	22	def __init__(self) -> None:

...	...	@@ -28,18 +28,35 @@
28	28	[tool.ruff.lint]
29	29	select = ["ALL"]
30	30	ignore = [
31		- "D10", # disabled required docstrings
	31	+ "FIX",
	32	+ "TD", # disable todo warnings
32	33	"ERA", # allow commented out code
33		- "TD", # disable TODO warnings
34		- "FIX002", # disable TODO warnings
	34	+ "PTH",
35	35
36		- "COM812", # trailing comma
37		- "T201", # print
38		- "S101", # allow assert
39		- "PTH123", # Path.open
40		- "S603", # subprocess untrusted input
41		-
	36	+ "ANN003", # missing kwargs
42	37	"ANN101", # missing self type
	38	+ "ANN102", # missing cls
	39	+ "B006",
	40	+ "B008",
	41	+ "COM812", # trailing comma
	42	+ "D10", # disabled required docstrings
	43	+ "D401",
	44	+ "EM102",
	45	+ "FBT001",
	46	+ "FBT002",
	47	+ "PLR0913",
	48	+ "PLR2004", # magic
	49	+ "RET504",
	50	+ "RET505",
	51	+ "RET508",
	52	+ "S101", # allow assert
	53	+ "S104",
	54	+ "S603", # subprocess untrusted input
	55	+ "SIM102",
	56	+ "T201", # print
	57	+ "TRY003",
	58	+ "W505",
	59	+ "ISC001" # recommended to disable for formatting
43	60	]
44	61
45	62	[tool.ruff.lint.isort]

...	...	@@ -4,9 +4,7 @@
4	4
5	5	MODEL_THAT_EXISTS = "Systran/faster-whisper-tiny.en"
6	6	MODEL_THAT_DOES_NOT_EXIST = "i-do-not-exist"
7		-MIN_EXPECTED_NUMBER_OF_MODELS = (
8		- 200 # At the time of the test creation there are 228 models
9		-)
	7	+MIN_EXPECTED_NUMBER_OF_MODELS = 200 # At the time of the test creation there are 228 models
10	8
11	9
12	10	# HACK: because ModelObject(**data) doesn't work
...	...	@@ -19,20 +17,20 @@
19	17	)
20	18
21	19
22		-def test_list_models(client: TestClient):
	20	+def test_list_models(client: TestClient) -> None:
23	21	response = client.get("/v1/models")
24	22	data = response.json()
25	23	models = [model_dict_to_object(model_dict) for model_dict in data]
26	24	assert len(models) > MIN_EXPECTED_NUMBER_OF_MODELS
27	25
28	26
29		-def test_model_exists(client: TestClient):
	27	+def test_model_exists(client: TestClient) -> None:
30	28	response = client.get(f"/v1/models/{MODEL_THAT_EXISTS}")
31	29	data = response.json()
32	30	model = model_dict_to_object(data)
33	31	assert model.id == MODEL_THAT_EXISTS
34	32
35	33
36		-def test_model_does_not_exist(client: TestClient):
	34	+def test_model_does_not_exist(client: TestClient) -> None:
37	35	response = client.get(f"/v1/models/{MODEL_THAT_DOES_NOT_EXIST}")
38	36	assert response.status_code == 404

...	...	@@ -1,10 +1,10 @@
	1	+from collections.abc import Generator
1	2	import json
2	3	import os
3	4	import time
4		-from typing import Generator
5	5
6		-import pytest
7	6	from fastapi.testclient import TestClient
	7	+import pytest
8	8	from starlette.testclient import WebSocketTestSession
9	9
10	10	from faster_whisper_server.config import BYTES_PER_SECOND
...	...	@@ -22,35 +22,31 @@
22	22	yield ws
23	23
24	24
25		-def get_audio_file_paths():
26		- file_paths = []
	25	+def get_audio_file_paths() -> list[str]:
	26	+ file_paths: list[str] = []
27	27	directory = "tests/data"
28	28	for filename in sorted(os.listdir(directory)[:AUDIO_FILES_LIMIT]):
29		- file_paths.append(os.path.join(directory, filename))
	29	+ file_paths.append(os.path.join(directory, filename)) # noqa: PERF401
30	30	return file_paths
31	31
32	32
33	33	file_paths = get_audio_file_paths()
34	34
35	35
36		-def stream_audio_data(
37		- ws: WebSocketTestSession, data: bytes, *, chunk_size: int = 4000, speed: float = 1.0
38		-):
	36	+def stream_audio_data(ws: WebSocketTestSession, data: bytes, *, chunk_size: int = 4000, speed: float = 1.0) -> None:
39	37	for i in range(0, len(data), chunk_size):
40	38	ws.send_bytes(data[i : i + chunk_size])
41	39	delay = len(data[i : i + chunk_size]) / BYTES_PER_SECOND / speed
42	40	time.sleep(delay)
43	41
44	42
45		-def transcribe_audio_data(
46		- client: TestClient, data: bytes
47		-) -> TranscriptionVerboseJsonResponse:
	43	+def transcribe_audio_data(client: TestClient, data: bytes) -> TranscriptionVerboseJsonResponse:
48	44	response = client.post(
49	45	TRANSCRIBE_ENDPOINT,
50	46	files={"file": ("audio.raw", data, "audio/raw")},
51	47	)
52	48	data = json.loads(response.json()) # TODO: figure this out
53		- return TranscriptionVerboseJsonResponse(**data) # type: ignore
	49	+ return TranscriptionVerboseJsonResponse(**data) # pyright: ignore[reportCallIssue]
54	50
55	51
56	52	# @pytest.mark.parametrize("file_path", file_paths)
...	...	@@ -60,7 +56,7 @@
60	56	# with open(file_path, "rb") as file:
61	57	# data = file.read()
62	58	#
63		-# streaming_transcription: TranscriptionVerboseJsonResponse = None # type: ignore
	59	+# streaming_transcription: TranscriptionVerboseJsonResponse = None # type: ignore # noqa: PGH003
64	60	# thread = threading.Thread(
65	61	# target=stream_audio_data, args=(ws, data), kwargs={"speed": 4.0}
66	62	# )

...	...	@@ -1,18 +1,15 @@
	1	+from collections.abc import Generator
1	2	import logging
2		-import os
3		-from typing import Generator
4	3
5		-import pytest
6	4	from fastapi.testclient import TestClient
	5	+import pytest
7	6
8		-# HACK
9		-os.environ["WHISPER_MODEL"] = "Systran/faster-whisper-tiny.en"
10		-from faster_whisper_server.main import app # noqa: E402
	7	+from faster_whisper_server.main import app
11	8
12	9	disable_loggers = ["multipart.multipart", "faster_whisper"]
13	10
14	11
15		-def pytest_configure():
	12	+def pytest_configure() -> None:
16	13	for logger_name in disable_loggers:
17	14	logger = logging.getLogger(logger_name)
18	15	logger.disabled = True

...	...	@@ -1,9 +1,9 @@
1	1	import json
2	2	import os
3	3
4		-import pytest
5	4	from fastapi.testclient import TestClient
6	5	from httpx_sse import connect_sse
	6	+import pytest
7	7
8	8	from faster_whisper_server.server_models import (
9	9	TranscriptionJsonResponse,
...	...	@@ -17,15 +17,11 @@
17	17	]
18	18
19	19
20		-parameters = [
21		- (file_path, endpoint) for endpoint in ENDPOINTS for file_path in FILE_PATHS
22		-]
	20	+parameters = [(file_path, endpoint) for endpoint in ENDPOINTS for file_path in FILE_PATHS]
23	21
24	22
25		-@pytest.mark.parametrize("file_path,endpoint", parameters)
26		-def test_streaming_transcription_text(
27		- client: TestClient, file_path: str, endpoint: str
28		-):
	23	+@pytest.mark.parametrize(("file_path", "endpoint"), parameters)
	24	+def test_streaming_transcription_text(client: TestClient, file_path: str, endpoint: str) -> None:
29	25	extension = os.path.splitext(file_path)[1]
30	26	with open(file_path, "rb") as f:
31	27	data = f.read()
...	...	@@ -36,15 +32,11 @@
36	32	with connect_sse(client, "POST", endpoint, **kwargs) as event_source:
37	33	for event in event_source.iter_sse():
38	34	print(event)
39		- assert (
40		- len(event.data) > 1
41		- ) # HACK: 1 because of the space character that's always prepended
	35	+ assert len(event.data) > 1 # HACK: 1 because of the space character that's always prepended
42	36
43	37
44		-@pytest.mark.parametrize("file_path,endpoint", parameters)
45		-def test_streaming_transcription_json(
46		- client: TestClient, file_path: str, endpoint: str
47		-):
	38	+@pytest.mark.parametrize(("file_path", "endpoint"), parameters)
	39	+def test_streaming_transcription_json(client: TestClient, file_path: str, endpoint: str) -> None:
48	40	extension = os.path.splitext(file_path)[1]
49	41	with open(file_path, "rb") as f:
50	42	data = f.read()
...	...	@@ -57,10 +49,8 @@
57	49	TranscriptionJsonResponse(**json.loads(event.data))
58	50
59	51
60		-@pytest.mark.parametrize("file_path,endpoint", parameters)
61		-def test_streaming_transcription_verbose_json(
62		- client: TestClient, file_path: str, endpoint: str
63		-):
	52	+@pytest.mark.parametrize(("file_path", "endpoint"), parameters)
	53	+def test_streaming_transcription_verbose_json(client: TestClient, file_path: str, endpoint: str) -> None:
64	54	extension = os.path.splitext(file_path)[1]
65	55	with open(file_path, "rb") as f:
66	56	data = f.read()

Delete comment