Fedir Zadniprovskyi 2024-10-01
refactor: update response model names and module name
@b68af2b6a548ecd9a6a95165f0a63e74e9b3f47a
 
src/faster_whisper_server/api_models.py (added)
+++ src/faster_whisper_server/api_models.py
@@ -0,0 +1,208 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from faster_whisper_server.text_utils import Transcription, canonicalize_word, segments_to_text
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    import faster_whisper.transcribe
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10909
+class TranscriptionWord(BaseModel):
+    start: float
+    end: float
+    word: str
+    probability: float
+
+    @classmethod
+    def from_segments(cls, segments: Iterable[TranscriptionSegment]) -> list[TranscriptionWord]:
+        words: list[TranscriptionWord] = []
+        for segment in segments:
+            # NOTE: a temporary "fix" for https://github.com/fedirz/faster-whisper-server/issues/58.
+            # TODO: properly address the issue
+            assert (
+                segment.words is not None
+            ), "Segment must have words. If you are using an API ensure `timestamp_granularities[]=word` is set"
+            words.extend(segment.words)
+        return words
+
+    def offset(self, seconds: float) -> None:
+        self.start += seconds
+        self.end += seconds
+
+    @classmethod
+    def common_prefix(cls, a: list[TranscriptionWord], b: list[TranscriptionWord]) -> list[TranscriptionWord]:
+        i = 0
+        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
+            i += 1
+        return a[:i]
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10938
+class TranscriptionSegment(BaseModel):
+    id: int
+    seek: int
+    start: float
+    end: float
+    text: str
+    tokens: list[int]
+    temperature: float
+    avg_logprob: float
+    compression_ratio: float
+    no_speech_prob: float
+    words: list[TranscriptionWord] | None
+
+    @classmethod
+    def from_faster_whisper_segments(
+        cls, segments: Iterable[faster_whisper.transcribe.Segment]
+    ) -> Iterable[TranscriptionSegment]:
+        for segment in segments:
+            yield cls(
+                id=segment.id,
+                seek=segment.seek,
+                start=segment.start,
+                end=segment.end,
+                text=segment.text,
+                tokens=segment.tokens,
+                temperature=segment.temperature,
+                avg_logprob=segment.avg_logprob,
+                compression_ratio=segment.compression_ratio,
+                no_speech_prob=segment.no_speech_prob,
+                words=[
+                    TranscriptionWord(
+                        start=word.start,
+                        end=word.end,
+                        word=word.word,
+                        probability=word.probability,
+                    )
+                    for word in segment.words
+                ]
+                if segment.words is not None
+                else None,
+            )
+
+
+# https://platform.openai.com/docs/api-reference/audio/json-object
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10924
+class CreateTranscriptionResponseJson(BaseModel):
+    text: str
+
+    @classmethod
+    def from_segments(cls, segments: list[TranscriptionSegment]) -> CreateTranscriptionResponseJson:
+        return cls(text=segments_to_text(segments))
+
+    @classmethod
+    def from_transcription(cls, transcription: Transcription) -> CreateTranscriptionResponseJson:
+        return cls(text=transcription.text)
+
+
+# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L11007
+class CreateTranscriptionResponseVerboseJson(BaseModel):
+    task: str = "transcribe"
+    language: str
+    duration: float
+    text: str
+    words: list[TranscriptionWord] | None
+    segments: list[TranscriptionSegment]
+
+    @classmethod
+    def from_segment(
+        cls, segment: TranscriptionSegment, transcription_info: faster_whisper.transcribe.TranscriptionInfo
+    ) -> CreateTranscriptionResponseVerboseJson:
+        return cls(
+            language=transcription_info.language,
+            duration=segment.end - segment.start,
+            text=segment.text,
+            words=segment.words if transcription_info.transcription_options.word_timestamps else None,
+            segments=[segment],
+        )
+
+    @classmethod
+    def from_segments(
+        cls, segments: list[TranscriptionSegment], transcription_info: faster_whisper.transcribe.TranscriptionInfo
+    ) -> CreateTranscriptionResponseVerboseJson:
+        return cls(
+            language=transcription_info.language,
+            duration=transcription_info.duration,
+            text=segments_to_text(segments),
+            segments=segments,
+            words=TranscriptionWord.from_segments(segments)
+            if transcription_info.transcription_options.word_timestamps
+            else None,
+        )
+
+    @classmethod
+    def from_transcription(cls, transcription: Transcription) -> CreateTranscriptionResponseVerboseJson:
+        return cls(
+            language="english",  # FIX: hardcoded
+            duration=transcription.duration,
+            text=transcription.text,
+            words=transcription.words,
+            segments=[],  # FIX: hardcoded
+        )
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8730
+class ListModelsResponse(BaseModel):
+    data: list[Model]
+    object: Literal["list"] = "list"
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L11146
+class Model(BaseModel):
+    id: str
+    """The model identifier, which can be referenced in the API endpoints."""
+    created: int
+    """The Unix timestamp (in seconds) when the model was created."""
+    object_: Literal["model"] = Field(serialization_alias="object")
+    """The object type, which is always "model"."""
+    owned_by: str
+    """The organization that owns the model."""
+    language: list[str] = Field(default_factory=list)
+    """List of ISO 639-3 supported by the model. It's possible that the list will be empty. This field is not a part of the OpenAI API spec and is added for convenience."""  # noqa: E501
+
+    model_config = ConfigDict(
+        populate_by_name=True,
+        json_schema_extra={
+            "examples": [
+                {
+                    "id": "Systran/faster-whisper-large-v3",
+                    "created": 1700732060,
+                    "object": "model",
+                    "owned_by": "Systran",
+                },
+                {
+                    "id": "Systran/faster-distil-whisper-large-v3",
+                    "created": 1711378296,
+                    "object": "model",
+                    "owned_by": "Systran",
+                },
+                {
+                    "id": "bofenghuang/whisper-large-v2-cv11-french-ct2",
+                    "created": 1687968011,
+                    "object": "model",
+                    "owned_by": "bofenghuang",
+                },
+            ]
+        },
+    )
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10909
+TimestampGranularities = list[Literal["segment", "word"]]
+
+
+DEFAULT_TIMESTAMP_GRANULARITIES: TimestampGranularities = ["segment"]
+TIMESTAMP_GRANULARITIES_COMBINATIONS: list[TimestampGranularities] = [
+    [],  # should be treated as ["segment"]. https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities
+    ["segment"],
+    ["word"],
+    ["word", "segment"],
+    ["segment", "word"],  # same as ["word", "segment"] but order is different
+]
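
For reference, a minimal sketch of the renamed models in use (the values are illustrative; the model id comes from the `json_schema_extra` examples above):

```python
# Sketch only: constructing the renamed models and serializing `Model`
# with its `object` alias. All field values here are made up.
from faster_whisper_server.api_models import Model, TranscriptionWord

word = TranscriptionWord(start=0.0, end=0.4, word="Hello", probability=0.99)
word.offset(1.5)  # shift timestamps, as asr.py does with `audio.start`
print(word.start, word.end)  # 1.5 1.9

model = Model(
    id="Systran/faster-whisper-large-v3",
    created=1700732060,
    object_="model",
    owned_by="Systran",
)
# `object_` serializes as "object" via `serialization_alias`:
print(model.model_dump(by_alias=True)["object"])  # "model"
```
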
src/faster_whisper_server/asr.py
--- src/faster_whisper_server/asr.py
+++ src/faster_whisper_server/asr.py
@@ -1,11 +1,17 @@
+from __future__ import annotations
+
 import asyncio
 import logging
 import time
+from typing import TYPE_CHECKING
 
-from faster_whisper import transcribe
+from faster_whisper_server.api_models import TranscriptionSegment, TranscriptionWord
+from faster_whisper_server.text_utils import Transcription
 
-from faster_whisper_server.audio import Audio
-from faster_whisper_server.core import Segment, Transcription, Word
+if TYPE_CHECKING:
+    from faster_whisper import transcribe
+
+    from faster_whisper_server.audio import Audio
 
 logger = logging.getLogger(__name__)
 
@@ -31,8 +37,8 @@
             word_timestamps=True,
             **self.transcribe_opts,
         )
-        segments = Segment.from_faster_whisper_segments(segments)
-        words = Word.from_segments(segments)
+        segments = TranscriptionSegment.from_faster_whisper_segments(segments)
+        words = TranscriptionWord.from_segments(segments)
         for word in words:
             word.offset(audio.start)
         transcription = Transcription(words)
 
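The `asr.py` change above moves the `faster_whisper` and `Audio` imports behind `TYPE_CHECKING`. A generic sketch of the pattern, for anyone unfamiliar: with `from __future__ import annotations`, annotations are plain strings at runtime, so typing-only imports never execute, which avoids import cost and circular-import risk.

```python
# Sketch of the typing-only import pattern. `Iterable` is never imported at
# runtime; the annotation below is just a string thanks to the __future__
# import, so type checkers still resolve it.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Iterable  # evaluated by type checkers only


def total(xs: Iterable[float]) -> float:
    return sum(xs)
</antml_code_fence_placeholder>
```
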
src/faster_whisper_server/core.py (deleted)
--- src/faster_whisper_server/core.py
@@ -1,299 +0,0 @@
-from __future__ import annotations
-
-import re
-from typing import TYPE_CHECKING
-
-from pydantic import BaseModel
-
-from faster_whisper_server.dependencies import get_config
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable
-
-    import faster_whisper.transcribe
-
-
-class Word(BaseModel):
-    start: float
-    end: float
-    word: str
-    probability: float
-
-    @classmethod
-    def from_segments(cls, segments: Iterable[Segment]) -> list[Word]:
-        words: list[Word] = []
-        for segment in segments:
-            # NOTE: a temporary "fix" for https://github.com/fedirz/faster-whisper-server/issues/58.
-            # TODO: properly address the issue
-            assert (
-                segment.words is not None
-            ), "Segment must have words. If you are using an API ensure `timestamp_granularities[]=word` is set"
-            words.extend(segment.words)
-        return words
-
-    def offset(self, seconds: float) -> None:
-        self.start += seconds
-        self.end += seconds
-
-    @classmethod
-    def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
-        i = 0
-        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
-            i += 1
-        return a[:i]
-
-
-class Segment(BaseModel):
-    id: int
-    seek: int
-    start: float
-    end: float
-    text: str
-    tokens: list[int]
-    temperature: float
-    avg_logprob: float
-    compression_ratio: float
-    no_speech_prob: float
-    words: list[Word] | None
-
-    @classmethod
-    def from_faster_whisper_segments(cls, segments: Iterable[faster_whisper.transcribe.Segment]) -> Iterable[Segment]:
-        for segment in segments:
-            yield cls(
-                id=segment.id,
-                seek=segment.seek,
-                start=segment.start,
-                end=segment.end,
-                text=segment.text,
-                tokens=segment.tokens,
-                temperature=segment.temperature,
-                avg_logprob=segment.avg_logprob,
-                compression_ratio=segment.compression_ratio,
-                no_speech_prob=segment.no_speech_prob,
-                words=[
-                    Word(
-                        start=word.start,
-                        end=word.end,
-                        word=word.word,
-                        probability=word.probability,
-                    )
-                    for word in segment.words
-                ]
-                if segment.words is not None
-                else None,
-            )
-
-
-class Transcription:
-    def __init__(self, words: list[Word] = []) -> None:
-        self.words: list[Word] = []
-        self.extend(words)
-
-    @property
-    def text(self) -> str:
-        return " ".join(word.word for word in self.words).strip()
-
-    @property
-    def start(self) -> float:
-        return self.words[0].start if len(self.words) > 0 else 0.0
-
-    @property
-    def end(self) -> float:
-        return self.words[-1].end if len(self.words) > 0 else 0.0
-
-    @property
-    def duration(self) -> float:
-        return self.end - self.start
-
-    def after(self, seconds: float) -> Transcription:
-        return Transcription(words=[word for word in self.words if word.start > seconds])
-
-    def extend(self, words: list[Word]) -> None:
-        self._ensure_no_word_overlap(words)
-        self.words.extend(words)
-
-    def _ensure_no_word_overlap(self, words: list[Word]) -> None:
-        config = get_config()  # HACK
-        if len(self.words) > 0 and len(words) > 0:
-            if words[0].start + config.word_timestamp_error_margin <= self.words[-1].end:
-                raise ValueError(
-                    f"Words overlap: {self.words[-1]} and {words[0]}. Error margin: {config.word_timestamp_error_margin}"  # noqa: E501
-                )
-        for i in range(1, len(words)):
-            if words[i].start + config.word_timestamp_error_margin <= words[i - 1].end:
-                raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
-
-
-def is_eos(text: str) -> bool:
-    if text.endswith("..."):
-        return False
-    return any(text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
-
-
-def test_is_eos() -> None:
-    assert not is_eos("Hello")
-    assert not is_eos("Hello...")
-    assert is_eos("Hello.")
-    assert is_eos("Hello!")
-    assert is_eos("Hello?")
-    assert not is_eos("Hello. Yo")
-    assert not is_eos("Hello. Yo...")
-    assert is_eos("Hello. Yo.")
-
-
-def to_full_sentences(words: list[Word]) -> list[list[Word]]:
-    sentences: list[list[Word]] = [[]]
-    for word in words:
-        sentences[-1].append(word)
-        if is_eos(word.word):
-            sentences.append([])
-    if len(sentences[-1]) == 0 or not is_eos(sentences[-1][-1].word):
-        sentences.pop()
-    return sentences
-
-
-def tests_to_full_sentences() -> None:
-    def word(text: str) -> Word:
-        return Word(word=text, start=0.0, end=0.0, probability=0.0)
-
-    assert to_full_sentences([]) == []
-    assert to_full_sentences([word(text="Hello")]) == []
-    assert to_full_sentences([word(text="Hello..."), word(" world")]) == []
-    assert to_full_sentences([word(text="Hello..."), word(" world.")]) == [[word("Hello..."), word(" world.")]]
-    assert to_full_sentences([word(text="Hello..."), word(" world."), word(" How")]) == [
-        [word("Hello..."), word(" world.")],
-    ]
-
-
-def word_to_text(words: list[Word]) -> str:
-    return "".join(word.word for word in words)
-
-
-def words_to_text_w_ts(words: list[Word]) -> str:
-    return "".join(f"{word.word}({word.start:.2f}-{word.end:.2f})" for word in words)
-
-
-def segments_to_text(segments: Iterable[Segment]) -> str:
-    return "".join(segment.text for segment in segments).strip()
-
-
-def srt_format_timestamp(ts: float) -> str:
-    hours = ts // 3600
-    minutes = (ts % 3600) // 60
-    seconds = ts % 60
-    milliseconds = (ts * 1000) % 1000
-    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
-
-
-def test_srt_format_timestamp() -> None:
-    assert srt_format_timestamp(0.0) == "00:00:00,000"
-    assert srt_format_timestamp(1.0) == "00:00:01,000"
-    assert srt_format_timestamp(1.234) == "00:00:01,234"
-    assert srt_format_timestamp(60.0) == "00:01:00,000"
-    assert srt_format_timestamp(61.0) == "00:01:01,000"
-    assert srt_format_timestamp(61.234) == "00:01:01,234"
-    assert srt_format_timestamp(3600.0) == "01:00:00,000"
-    assert srt_format_timestamp(3601.0) == "01:00:01,000"
-    assert srt_format_timestamp(3601.234) == "01:00:01,234"
-    assert srt_format_timestamp(23423.4234) == "06:30:23,423"
-
-
-def vtt_format_timestamp(ts: float) -> str:
-    hours = ts // 3600
-    minutes = (ts % 3600) // 60
-    seconds = ts % 60
-    milliseconds = (ts * 1000) % 1000
-    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
-
-
-def test_vtt_format_timestamp() -> None:
-    assert vtt_format_timestamp(0.0) == "00:00:00.000"
-    assert vtt_format_timestamp(1.0) == "00:00:01.000"
-    assert vtt_format_timestamp(1.234) == "00:00:01.234"
-    assert vtt_format_timestamp(60.0) == "00:01:00.000"
-    assert vtt_format_timestamp(61.0) == "00:01:01.000"
-    assert vtt_format_timestamp(61.234) == "00:01:01.234"
-    assert vtt_format_timestamp(3600.0) == "01:00:00.000"
-    assert vtt_format_timestamp(3601.0) == "01:00:01.000"
-    assert vtt_format_timestamp(3601.234) == "01:00:01.234"
-    assert vtt_format_timestamp(23423.4234) == "06:30:23.423"
-
-
-def segments_to_vtt(segment: Segment, i: int) -> str:
-    start = segment.start if i > 0 else 0.0
-    result = f"{vtt_format_timestamp(start)} --> {vtt_format_timestamp(segment.end)}\n{segment.text}\n\n"
-
-    if i == 0:
-        return f"WEBVTT\n\n{result}"
-    else:
-        return result
-
-
-def segments_to_srt(segment: Segment, i: int) -> str:
-    return f"{i + 1}\n{srt_format_timestamp(segment.start)} --> {srt_format_timestamp(segment.end)}\n{segment.text}\n\n"
-
-
-def canonicalize_word(text: str) -> str:
-    text = text.lower()
-    # Remove non-alphabetic characters using regular expression
-    text = re.sub(r"[^a-z]", "", text)
-    return text.lower().strip().strip(".,?!")
-
-
-def test_canonicalize_word() -> None:
-    assert canonicalize_word("ABC") == "abc"
-    assert canonicalize_word("...ABC?") == "abc"
-    assert canonicalize_word("... AbC  ...") == "abc"
-
-
-def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
-    i = 0
-    while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
-        i += 1
-    return a[:i]
-
-
-def test_common_prefix() -> None:
-    def word(text: str) -> Word:
-        return Word(word=text, start=0.0, end=0.0, probability=0.0)
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("a"), word("b"), word("c")]
-    assert common_prefix(a, b) == [word("a"), word("b"), word("c")]
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("a"), word("b"), word("d")]
-    assert common_prefix(a, b) == [word("a"), word("b")]
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("a")]
-    assert common_prefix(a, b) == [word("a")]
-
-    a = [word("a")]
-    b = [word("a"), word("b"), word("c")]
-    assert common_prefix(a, b) == [word("a")]
-
-    a = [word("a")]
-    b = []
-    assert common_prefix(a, b) == []
-
-    a = []
-    b = [word("a")]
-    assert common_prefix(a, b) == []
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("b"), word("c")]
-    assert common_prefix(a, b) == []
-
-
-def test_common_prefix_and_canonicalization() -> None:
-    def word(text: str) -> Word:
-        return Word(word=text, start=0.0, end=0.0, probability=0.0)
-
-    a = [word("A...")]
-    b = [word("a?"), word("b"), word("c")]
-    assert common_prefix(a, b) == [word("A...")]
-
-    a = [word("A..."), word("B?"), word("C,")]
-    b = [word("a??"), word("  b"), word(" ,c")]
-    assert common_prefix(a, b) == [word("A..."), word("B?"), word("C,")]
src/faster_whisper_server/routers/list_models.py
--- src/faster_whisper_server/routers/list_models.py
+++ src/faster_whisper_server/routers/list_models.py
@@ -9,9 +9,9 @@
 )
 import huggingface_hub
 
-from faster_whisper_server.server_models import (
-    ModelListResponse,
-    ModelObject,
+from faster_whisper_server.api_models import (
+    ListModelsResponse,
+    Model,
 )
 
 if TYPE_CHECKING:
@@ -21,11 +21,11 @@
 
 
 @router.get("/v1/models")
-def get_models() -> ModelListResponse:
+def get_models() -> ListModelsResponse:
     models = huggingface_hub.list_models(library="ctranslate2", tags="automatic-speech-recognition", cardData=True)
     models = list(models)
     models.sort(key=lambda model: model.downloads, reverse=True)  # type: ignore  # noqa: PGH003
-    transformed_models: list[ModelObject] = []
+    transformed_models: list[Model] = []
     for model in models:
         assert model.created_at is not None
         assert model.card_data is not None
@@ -36,7 +36,7 @@
             language = [model.card_data.language]
         else:
             language = model.card_data.language
-        transformed_model = ModelObject(
+        transformed_model = Model(
             id=model.id,
             created=int(model.created_at.timestamp()),
             object_="model",
@@ -44,14 +44,14 @@
             language=language,
         )
         transformed_models.append(transformed_model)
-    return ModelListResponse(data=transformed_models)
+    return ListModelsResponse(data=transformed_models)
 
 
 @router.get("/v1/models/{model_name:path}")
 # NOTE: `examples` doesn't work https://github.com/tiangolo/fastapi/discussions/10537
 def get_model(
     model_name: Annotated[str, Path(example="Systran/faster-distil-whisper-large-v3")],
-) -> ModelObject:
+) -> Model:
     models = huggingface_hub.list_models(
         model_name=model_name, library="ctranslate2", tags="automatic-speech-recognition", cardData=True
     )
@@ -78,7 +78,7 @@
         language = [exact_match.card_data.language]
     else:
         language = exact_match.card_data.language
-    return ModelObject(
+    return Model(
         id=exact_match.id,
         created=int(exact_match.created_at.timestamp()),
         object_="model",
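
A quick way to exercise the renamed endpoints end to end; a sketch assuming a server listening on `localhost:8000`:

```python
# Sketch: querying the model-listing endpoints defined in the router above.
# The base URL is an assumption; the model id comes from the examples in
# api_models.py.
import httpx

resp = httpx.get("http://localhost:8000/v1/models")
resp.raise_for_status()
for model in resp.json()["data"]:  # ListModelsResponse.data
    print(model["id"], model["owned_by"])

# The `{model_name:path}` converter lets the id contain a slash:
one = httpx.get("http://localhost:8000/v1/models/Systran/faster-distil-whisper-large-v3")
print(one.json()["object"])  # "model" (serialized from `object_`)
```
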
src/faster_whisper_server/routers/stt.py
--- src/faster_whisper_server/routers/stt.py
+++ src/faster_whisper_server/routers/stt.py
@@ -20,6 +20,14 @@
 from faster_whisper.vad import VadOptions, get_speech_timestamps
 from pydantic import AfterValidator
 
+from faster_whisper_server.api_models import (
+    DEFAULT_TIMESTAMP_GRANULARITIES,
+    TIMESTAMP_GRANULARITIES_COMBINATIONS,
+    CreateTranscriptionResponseJson,
+    CreateTranscriptionResponseVerboseJson,
+    TimestampGranularities,
+    TranscriptionSegment,
+)
 from faster_whisper_server.asr import FasterWhisperASR
 from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
@@ -28,15 +36,8 @@
     ResponseFormat,
     Task,
 )
-from faster_whisper_server.core import Segment, segments_to_srt, segments_to_text, segments_to_vtt
 from faster_whisper_server.dependencies import ConfigDependency, ModelManagerDependency, get_config
-from faster_whisper_server.server_models import (
-    DEFAULT_TIMESTAMP_GRANULARITIES,
-    TIMESTAMP_GRANULARITIES_COMBINATIONS,
-    TimestampGranularities,
-    TranscriptionJsonResponse,
-    TranscriptionVerboseJsonResponse,
-)
+from faster_whisper_server.text_utils import segments_to_srt, segments_to_text, segments_to_vtt
 from faster_whisper_server.transcriber import audio_transcriber
 
 if TYPE_CHECKING:
@@ -51,7 +52,7 @@
 
 
 def segments_to_response(
-    segments: Iterable[Segment],
+    segments: Iterable[TranscriptionSegment],
     transcription_info: TranscriptionInfo,
     response_format: ResponseFormat,
 ) -> Response:
@@ -60,12 +61,12 @@
         return Response(segments_to_text(segments), media_type="text/plain")
     elif response_format == ResponseFormat.JSON:
         return Response(
-            TranscriptionJsonResponse.from_segments(segments).model_dump_json(),
+            CreateTranscriptionResponseJson.from_segments(segments).model_dump_json(),
             media_type="application/json",
         )
     elif response_format == ResponseFormat.VERBOSE_JSON:
         return Response(
-            TranscriptionVerboseJsonResponse.from_segments(segments, transcription_info).model_dump_json(),
+            CreateTranscriptionResponseVerboseJson.from_segments(segments, transcription_info).model_dump_json(),
             media_type="application/json",
         )
     elif response_format == ResponseFormat.VTT:
@@ -83,7 +84,7 @@
 
 
 def segments_to_streaming_response(
-    segments: Iterable[Segment],
+    segments: Iterable[TranscriptionSegment],
     transcription_info: TranscriptionInfo,
     response_format: ResponseFormat,
 ) -> StreamingResponse:
@@ -92,9 +93,11 @@
             if response_format == ResponseFormat.TEXT:
                 data = segment.text
             elif response_format == ResponseFormat.JSON:
-                data = TranscriptionJsonResponse.from_segments([segment]).model_dump_json()
+                data = CreateTranscriptionResponseJson.from_segments([segment]).model_dump_json()
             elif response_format == ResponseFormat.VERBOSE_JSON:
-                data = TranscriptionVerboseJsonResponse.from_segment(segment, transcription_info).model_dump_json()
+                data = CreateTranscriptionResponseVerboseJson.from_segment(
+                    segment, transcription_info
+                ).model_dump_json()
             elif response_format == ResponseFormat.VTT:
                 data = segments_to_vtt(segment, i)
             elif response_format == ResponseFormat.SRT:
@@ -121,7 +124,7 @@
 
 @router.post(
     "/v1/audio/translations",
-    response_model=str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse,
+    response_model=str | CreateTranscriptionResponseJson | CreateTranscriptionResponseVerboseJson,
 )
 def translate_file(
     config: ConfigDependency,
@@ -145,7 +148,7 @@
         temperature=temperature,
         vad_filter=True,
     )
-    segments = Segment.from_faster_whisper_segments(segments)
+    segments = TranscriptionSegment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
@@ -169,7 +172,7 @@
 # https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8915
 @router.post(
     "/v1/audio/transcriptions",
-    response_model=str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse,
+    response_model=str | CreateTranscriptionResponseJson | CreateTranscriptionResponseVerboseJson,
 )
 def transcribe_file(
     config: ConfigDependency,
@@ -211,7 +214,7 @@
         vad_filter=True,
         hotwords=hotwords,
     )
-    segments = Segment.from_faster_whisper_segments(segments)
+    segments = TranscriptionSegment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
@@ -286,9 +289,11 @@
             if response_format == ResponseFormat.TEXT:
                 await ws.send_text(transcription.text)
             elif response_format == ResponseFormat.JSON:
-                await ws.send_json(TranscriptionJsonResponse.from_transcription(transcription).model_dump())
+                await ws.send_json(CreateTranscriptionResponseJson.from_transcription(transcription).model_dump())
             elif response_format == ResponseFormat.VERBOSE_JSON:
-                await ws.send_json(TranscriptionVerboseJsonResponse.from_transcription(transcription).model_dump())
+                await ws.send_json(
+                    CreateTranscriptionResponseVerboseJson.from_transcription(transcription).model_dump()
+                )
 
     if ws.client_state != WebSocketState.DISCONNECTED:
         logger.info("Closing the connection.")
 
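For completeness, a client-side sketch against `/v1/audio/transcriptions` that gets back the renamed `CreateTranscriptionResponseVerboseJson` shape. The base URL, the audio file, and the placeholder API key are assumptions:

```python
# Sketch: requesting a verbose JSON transcription through the OpenAI client.
# "audio.wav" is a hypothetical local file; the key is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="placeholder")
with open("audio.wav", "rb") as f:
    transcript = client.audio.transcriptions.create(
        model="Systran/faster-whisper-large-v3",
        file=f,
        response_format="verbose_json",
        timestamp_granularities=["word"],  # needed for per-word timestamps
    )
print(transcript.text)
```
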
src/faster_whisper_server/server_models.py (deleted)
--- src/faster_whisper_server/server_models.py
@@ -1,122 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Literal
-
-from pydantic import BaseModel, ConfigDict, Field
-
-from faster_whisper_server.core import Segment, Transcription, Word, segments_to_text
-
-if TYPE_CHECKING:
-    from faster_whisper.transcribe import TranscriptionInfo
-
-
-# https://platform.openai.com/docs/api-reference/audio/json-object
-class TranscriptionJsonResponse(BaseModel):
-    text: str
-
-    @classmethod
-    def from_segments(cls, segments: list[Segment]) -> TranscriptionJsonResponse:
-        return cls(text=segments_to_text(segments))
-
-    @classmethod
-    def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
-        return cls(text=transcription.text)
-
-
-# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
-class TranscriptionVerboseJsonResponse(BaseModel):
-    task: str = "transcribe"
-    language: str
-    duration: float
-    text: str
-    words: list[Word] | None
-    segments: list[Segment]
-
-    @classmethod
-    def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
-        return cls(
-            language=transcription_info.language,
-            duration=segment.end - segment.start,
-            text=segment.text,
-            words=segment.words if transcription_info.transcription_options.word_timestamps else None,
-            segments=[segment],
-        )
-
-    @classmethod
-    def from_segments(
-        cls, segments: list[Segment], transcription_info: TranscriptionInfo
-    ) -> TranscriptionVerboseJsonResponse:
-        return cls(
-            language=transcription_info.language,
-            duration=transcription_info.duration,
-            text=segments_to_text(segments),
-            segments=segments,
-            words=Word.from_segments(segments) if transcription_info.transcription_options.word_timestamps else None,
-        )
-
-    @classmethod
-    def from_transcription(cls, transcription: Transcription) -> TranscriptionVerboseJsonResponse:
-        return cls(
-            language="english",  # FIX: hardcoded
-            duration=transcription.duration,
-            text=transcription.text,
-            words=transcription.words,
-            segments=[],  # FIX: hardcoded
-        )
-
-
-class ModelListResponse(BaseModel):
-    data: list[ModelObject]
-    object: Literal["list"] = "list"
-
-
-class ModelObject(BaseModel):
-    id: str
-    """The model identifier, which can be referenced in the API endpoints."""
-    created: int
-    """The Unix timestamp (in seconds) when the model was created."""
-    object_: Literal["model"] = Field(serialization_alias="object")
-    """The object type, which is always "model"."""
-    owned_by: str
-    """The organization that owns the model."""
-    language: list[str] = Field(default_factory=list)
-    """List of ISO 639-3 supported by the model. It's possible that the list will be empty. This field is not a part of the OpenAI API spec and is added for convenience."""  # noqa: E501
-
-    model_config = ConfigDict(
-        populate_by_name=True,
-        json_schema_extra={
-            "examples": [
-                {
-                    "id": "Systran/faster-whisper-large-v3",
-                    "created": 1700732060,
-                    "object": "model",
-                    "owned_by": "Systran",
-                },
-                {
-                    "id": "Systran/faster-distil-whisper-large-v3",
-                    "created": 1711378296,
-                    "object": "model",
-                    "owned_by": "Systran",
-                },
-                {
-                    "id": "bofenghuang/whisper-large-v2-cv11-french-ct2",
-                    "created": 1687968011,
-                    "object": "model",
-                    "owned_by": "bofenghuang",
-                },
-            ]
-        },
-    )
-
-
-TimestampGranularities = list[Literal["segment", "word"]]
-
-
-DEFAULT_TIMESTAMP_GRANULARITIES: TimestampGranularities = ["segment"]
-TIMESTAMP_GRANULARITIES_COMBINATIONS: list[TimestampGranularities] = [
-    [],  # should be treated as ["segment"]. https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities
-    ["segment"],
-    ["word"],
-    ["word", "segment"],
-    ["segment", "word"],  # same as ["word", "segment"] but order is different
-]
 
src/faster_whisper_server/text_utils.py (added)
+++ src/faster_whisper_server/text_utils.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from faster_whisper_server.dependencies import get_config
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from faster_whisper_server.api_models import TranscriptionSegment, TranscriptionWord
+
+
+class Transcription:
+    def __init__(self, words: list[TranscriptionWord] = []) -> None:
+        self.words: list[TranscriptionWord] = []
+        self.extend(words)
+
+    @property
+    def text(self) -> str:
+        return " ".join(word.word for word in self.words).strip()
+
+    @property
+    def start(self) -> float:
+        return self.words[0].start if len(self.words) > 0 else 0.0
+
+    @property
+    def end(self) -> float:
+        return self.words[-1].end if len(self.words) > 0 else 0.0
+
+    @property
+    def duration(self) -> float:
+        return self.end - self.start
+
+    def after(self, seconds: float) -> Transcription:
+        return Transcription(words=[word for word in self.words if word.start > seconds])
+
+    def extend(self, words: list[TranscriptionWord]) -> None:
+        self._ensure_no_word_overlap(words)
+        self.words.extend(words)
+
+    def _ensure_no_word_overlap(self, words: list[TranscriptionWord]) -> None:
+        config = get_config()  # HACK
+        if len(self.words) > 0 and len(words) > 0:
+            if words[0].start + config.word_timestamp_error_margin <= self.words[-1].end:
+                raise ValueError(
+                    f"Words overlap: {self.words[-1]} and {words[0]}. Error margin: {config.word_timestamp_error_margin}"  # noqa: E501
+                )
+        for i in range(1, len(words)):
+            if words[i].start + config.word_timestamp_error_margin <= words[i - 1].end:
+                raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
+
+
+def is_eos(text: str) -> bool:
+    if text.endswith("..."):
+        return False
+    return any(text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
+
+
+def to_full_sentences(words: list[TranscriptionWord]) -> list[list[TranscriptionWord]]:
+    sentences: list[list[TranscriptionWord]] = [[]]
+    for word in words:
+        sentences[-1].append(word)
+        if is_eos(word.word):
+            sentences.append([])
+    if len(sentences[-1]) == 0 or not is_eos(sentences[-1][-1].word):
+        sentences.pop()
+    return sentences
+
+
+def word_to_text(words: list[TranscriptionWord]) -> str:
+    return "".join(word.word for word in words)
+
+
+def words_to_text_w_ts(words: list[TranscriptionWord]) -> str:
+    return "".join(f"{word.word}({word.start:.2f}-{word.end:.2f})" for word in words)
+
+
+def segments_to_text(segments: Iterable[TranscriptionSegment]) -> str:
+    return "".join(segment.text for segment in segments).strip()
+
+
+def srt_format_timestamp(ts: float) -> str:
+    hours = ts // 3600
+    minutes = (ts % 3600) // 60
+    seconds = ts % 60
+    milliseconds = (ts * 1000) % 1000
+    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
+
+
+def vtt_format_timestamp(ts: float) -> str:
+    hours = ts // 3600
+    minutes = (ts % 3600) // 60
+    seconds = ts % 60
+    milliseconds = (ts * 1000) % 1000
+    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
+
+
+def segments_to_vtt(segment: TranscriptionSegment, i: int) -> str:
+    start = segment.start if i > 0 else 0.0
+    result = f"{vtt_format_timestamp(start)} --> {vtt_format_timestamp(segment.end)}\n{segment.text}\n\n"
+
+    if i == 0:
+        return f"WEBVTT\n\n{result}"
+    else:
+        return result
+
+
+def segments_to_srt(segment: TranscriptionSegment, i: int) -> str:
+    return f"{i + 1}\n{srt_format_timestamp(segment.start)} --> {srt_format_timestamp(segment.end)}\n{segment.text}\n\n"
+
+
+def canonicalize_word(text: str) -> str:
+    text = text.lower()
+    # Remove non-alphabetic characters using regular expression
+    text = re.sub(r"[^a-z]", "", text)
+    return text.lower().strip().strip(".,?!")
+
+
+def common_prefix(a: list[TranscriptionWord], b: list[TranscriptionWord]) -> list[TranscriptionWord]:
+    i = 0
+    while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
+        i += 1
+    return a[:i]
 
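A small sketch of the relocated helpers, showing `segments_to_srt` output (the segment values are made up):

```python
# Sketch: rendering segments into an SRT document with the helpers that now
# live in text_utils.py.
from faster_whisper_server.api_models import TranscriptionSegment
from faster_whisper_server.text_utils import segments_to_srt

segments = [
    TranscriptionSegment(
        id=i, seek=0, start=float(i), end=float(i) + 0.9, text=f" line {i}.",
        tokens=[], temperature=0.0, avg_logprob=-0.1, compression_ratio=1.0,
        no_speech_prob=0.01, words=None,
    )
    for i in range(2)
]
srt = "".join(segments_to_srt(segment, i) for i, segment in enumerate(segments))
print(srt)
# 1
# 00:00:00,000 --> 00:00:00,900
#  line 0.
# ...
```
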
src/faster_whisper_server/text_utils_test.py (added)
+++ src/faster_whisper_server/text_utils_test.py
@@ -0,0 +1,111 @@
+from faster_whisper_server.api_models import TranscriptionWord
+from faster_whisper_server.text_utils import (
+    canonicalize_word,
+    common_prefix,
+    is_eos,
+    srt_format_timestamp,
+    to_full_sentences,
+    vtt_format_timestamp,
+)
+
+
+def test_is_eos() -> None:
+    assert not is_eos("Hello")
+    assert not is_eos("Hello...")
+    assert is_eos("Hello.")
+    assert is_eos("Hello!")
+    assert is_eos("Hello?")
+    assert not is_eos("Hello. Yo")
+    assert not is_eos("Hello. Yo...")
+    assert is_eos("Hello. Yo.")
+
+
+def test_to_full_sentences() -> None:

+    def word(text: str) -> TranscriptionWord:
+        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)
+
+    assert to_full_sentences([]) == []
+    assert to_full_sentences([word(text="Hello")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world.")]) == [[word("Hello..."), word(" world.")]]
+    assert to_full_sentences([word(text="Hello..."), word(" world."), word(" How")]) == [
+        [word("Hello..."), word(" world.")],
+    ]
+
+
+def test_srt_format_timestamp() -> None:
+    assert srt_format_timestamp(0.0) == "00:00:00,000"
+    assert srt_format_timestamp(1.0) == "00:00:01,000"
+    assert srt_format_timestamp(1.234) == "00:00:01,234"
+    assert srt_format_timestamp(60.0) == "00:01:00,000"
+    assert srt_format_timestamp(61.0) == "00:01:01,000"
+    assert srt_format_timestamp(61.234) == "00:01:01,234"
+    assert srt_format_timestamp(3600.0) == "01:00:00,000"
+    assert srt_format_timestamp(3601.0) == "01:00:01,000"
+    assert srt_format_timestamp(3601.234) == "01:00:01,234"
+    assert srt_format_timestamp(23423.4234) == "06:30:23,423"
+
+
+def test_vtt_format_timestamp() -> None:
+    assert vtt_format_timestamp(0.0) == "00:00:00.000"
+    assert vtt_format_timestamp(1.0) == "00:00:01.000"
+    assert vtt_format_timestamp(1.234) == "00:00:01.234"
+    assert vtt_format_timestamp(60.0) == "00:01:00.000"
+    assert vtt_format_timestamp(61.0) == "00:01:01.000"
+    assert vtt_format_timestamp(61.234) == "00:01:01.234"
+    assert vtt_format_timestamp(3600.0) == "01:00:00.000"
+    assert vtt_format_timestamp(3601.0) == "01:00:01.000"
+    assert vtt_format_timestamp(3601.234) == "01:00:01.234"
+    assert vtt_format_timestamp(23423.4234) == "06:30:23.423"
+
+
+def test_canonicalize_word() -> None:
+    assert canonicalize_word("ABC") == "abc"
+    assert canonicalize_word("...ABC?") == "abc"
+    assert canonicalize_word("... AbC  ...") == "abc"
+
+
+def test_common_prefix() -> None:
+    def word(text: str) -> TranscriptionWord:
+        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("a"), word("b"), word("c")]
+    assert common_prefix(a, b) == [word("a"), word("b"), word("c")]
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("a"), word("b"), word("d")]
+    assert common_prefix(a, b) == [word("a"), word("b")]
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("a")]
+    assert common_prefix(a, b) == [word("a")]
+
+    a = [word("a")]
+    b = [word("a"), word("b"), word("c")]
+    assert common_prefix(a, b) == [word("a")]
+
+    a = [word("a")]
+    b = []
+    assert common_prefix(a, b) == []
+
+    a = []
+    b = [word("a")]
+    assert common_prefix(a, b) == []
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("b"), word("c")]
+    assert common_prefix(a, b) == []
+
+
+def test_common_prefix_and_canonicalization() -> None:
+    def word(text: str) -> TranscriptionWord:
+        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)
+
+    a = [word("A...")]
+    b = [word("a?"), word("b"), word("c")]
+    assert common_prefix(a, b) == [word("A...")]
+
+    a = [word("A..."), word("B?"), word("C,")]
+    b = [word("a??"), word("  b"), word(" ,c")]
+    assert common_prefix(a, b) == [word("A..."), word("B?"), word("C,")]
src/faster_whisper_server/transcriber.py
--- src/faster_whisper_server/transcriber.py
+++ src/faster_whisper_server/transcriber.py
@@ -4,11 +4,12 @@
 from typing import TYPE_CHECKING
 
 from faster_whisper_server.audio import Audio, AudioStream
-from faster_whisper_server.core import Transcription, Word, common_prefix, to_full_sentences, word_to_text
+from faster_whisper_server.text_utils import Transcription, common_prefix, to_full_sentences, word_to_text
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
+    from faster_whisper_server.api_models import TranscriptionWord
     from faster_whisper_server.asr import FasterWhisperASR
 
 logger = logging.getLogger(__name__)
@@ -18,7 +19,7 @@
     def __init__(self) -> None:
         self.unconfirmed = Transcription()
 
-    def merge(self, confirmed: Transcription, incoming: Transcription) -> list[Word]:
+    def merge(self, confirmed: Transcription, incoming: Transcription) -> list[TranscriptionWord]:
         # https://github.com/ufal/whisper_streaming/blob/main/whisper_online.py#L264
         incoming = incoming.after(confirmed.end - 0.1)
         prefix = common_prefix(incoming.words, self.unconfirmed.words)
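
`merge` builds on `common_prefix`, following the whisper_streaming approach linked in the code: a word is treated as confirmed once two consecutive transcription passes agree on it (after canonicalization). A simplified sketch of that idea, ignoring the timestamp re-alignment that `after()` performs:

```python
# Sketch of the agreement idea behind `merge`. Timestamps are zeroed out for
# brevity; the real code also trims the incoming words by time.
from faster_whisper_server.api_models import TranscriptionWord
from faster_whisper_server.text_utils import common_prefix


def w(text: str) -> TranscriptionWord:
    return TranscriptionWord(word=text, start=0.0, end=0.0, probability=1.0)


previous_pass = [w("Hello,"), w("word")]
current_pass = [w("hello"), w("world"), w("there")]
# canonicalization makes "Hello," == "hello"; "word" != "world", so only the
# first word is confirmed:
confirmed = common_prefix(current_pass, previous_pass)
print([word.word for word in confirmed])  # ['hello']
```
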
tests/api_timestamp_granularities_test.py
--- tests/api_timestamp_granularities_test.py
+++ tests/api_timestamp_granularities_test.py
@@ -1,6 +1,6 @@
 """See `tests/openai_timestamp_granularities_test.py` to understand how OpenAI handles `response_type` and `timestamp_granularities`."""  # noqa: E501
 
-from faster_whisper_server.server_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
+from faster_whisper_server.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
 from openai import AsyncOpenAI
 import pytest
 
tests/openai_timestamp_granularities_test.py
--- tests/openai_timestamp_granularities_test.py
+++ tests/openai_timestamp_granularities_test.py
@@ -1,6 +1,6 @@
 """OpenAI's handling of `response_format` and `timestamp_granularities` is a bit confusing and inconsistent. This test module exists to capture the OpenAI API's behavior with respect to these parameters."""  # noqa: E501
 
-from faster_whisper_server.server_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
+from faster_whisper_server.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
 from openai import AsyncOpenAI, BadRequestError
 import pytest
 
tests/sse_test.py
--- tests/sse_test.py
+++ tests/sse_test.py
@@ -2,9 +2,9 @@
 import os
 
 from fastapi.testclient import TestClient
-from faster_whisper_server.server_models import (
-    TranscriptionJsonResponse,
-    TranscriptionVerboseJsonResponse,
+from faster_whisper_server.api_models import (
+    CreateTranscriptionResponseJson,
+    CreateTranscriptionResponseVerboseJson,
 )
 from httpx_sse import connect_sse
 import pytest
@@ -48,7 +48,7 @@
     }
     with connect_sse(client, "POST", endpoint, **kwargs) as event_source:
         for event in event_source.iter_sse():
-            TranscriptionJsonResponse(**json.loads(event.data))
+            CreateTranscriptionResponseJson(**json.loads(event.data))
 
 
 @pytest.mark.parametrize(("file_path", "endpoint"), parameters)
@@ -62,7 +62,7 @@
     }
     with connect_sse(client, "POST", endpoint, **kwargs) as event_source:
         for event in event_source.iter_sse():
-            TranscriptionVerboseJsonResponse(**json.loads(event.data))
+            CreateTranscriptionResponseVerboseJson(**json.loads(event.data))
 
 
 def test_transcription_vtt(client: TestClient) -> None:
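
Finally, a standalone sketch of consuming the SSE stream the way `sse_test.py` does, assuming a server on `localhost:8000` and a local audio file:

```python
# Sketch: streaming JSON transcription chunks over SSE and validating each
# event with the renamed response model. "audio.wav" is hypothetical.
import json

import httpx
from httpx_sse import connect_sse

from faster_whisper_server.api_models import CreateTranscriptionResponseJson

with httpx.Client(base_url="http://localhost:8000") as client, open("audio.wav", "rb") as f:
    kwargs = {
        "files": {"file": f},
        "data": {"response_format": "json", "stream": True},
    }
    with connect_sse(client, "POST", "/v1/audio/transcriptions", **kwargs) as event_source:
        for event in event_source.iter_sse():
            chunk = CreateTranscriptionResponseJson(**json.loads(event.data))
            print(chunk.text)
```
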