Fedir Zadniprovskyi 2024-07-20
refactor
@9f134f9b35fb81ad7363b521be543f1957acdfe1
faster_whisper_server/asr.py
--- faster_whisper_server/asr.py
+++ faster_whisper_server/asr.py
@@ -1,11 +1,10 @@
 import asyncio
-from collections.abc import Iterable
 import time
 
 from faster_whisper import transcribe
 
 from faster_whisper_server.audio import Audio
-from faster_whisper_server.core import Transcription, Word
+from faster_whisper_server.core import Segment, Transcription, Word
 from faster_whisper_server.logger import logger
 
 
@@ -30,7 +29,8 @@
             word_timestamps=True,
             **self.transcribe_opts,
         )
-        words = words_from_whisper_segments(segments)
+        segments = Segment.from_faster_whisper_segments(segments)
+        words = Word.from_segments(segments)
         for word in words:
             word.offset(audio.start)
         transcription = Transcription(words)
@@ -54,19 +54,3 @@
             audio,
             prompt,
         )
-
-
-def words_from_whisper_segments(segments: Iterable[transcribe.Segment]) -> list[Word]:
-    words: list[Word] = []
-    for segment in segments:
-        assert segment.words is not None
-        words.extend(
-            Word(
-                start=word.start,
-                end=word.end,
-                text=word.word,
-                probability=word.probability,
-            )
-            for word in segment.words
-        )
-    return words
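
For reference, a minimal sketch of the new conversion path in asr.py (the model name and audio file are illustrative; Segment.from_faster_whisper_segments and Word.from_segments are the helpers added to core below):

from faster_whisper import WhisperModel

from faster_whisper_server.core import Segment, Word

model = WhisperModel("tiny.en")  # illustrative model name
whisper_segments, _info = model.transcribe("sample.wav", word_timestamps=True)

# Convert faster-whisper segments into the server's own pydantic models,
# then flatten them into a single word list, as the ASR code now does.
segments = Segment.from_faster_whisper_segments(whisper_segments)
words = Word.from_segments(segments)
print([w.word for w in words])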
faster_whisper_server/core.py
--- faster_whisper_server/core.py
+++ faster_whisper_server/core.py
@@ -1,41 +1,83 @@
-# TODO: rename module
 from __future__ import annotations
 
-from dataclasses import dataclass
 import re
+from typing import TYPE_CHECKING
+
+from pydantic import BaseModel
 
 from faster_whisper_server.config import config
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable
 
-# TODO: use the `Segment` from `faster-whisper.transcribe` instead
-@dataclass
-class Segment:
-    text: str
-    start: float = 0.0
-    end: float = 0.0
+    import faster_whisper.transcribe
 
-    @property
-    def is_eos(self) -> bool:
-        if self.text.endswith("..."):
-            return False
-        return any(self.text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
+
+class Word(BaseModel):
+    start: float
+    end: float
+    word: str
+    probability: float
+
+    @classmethod
+    def from_segments(cls, segments: Iterable[Segment]) -> list[Word]:
+        words: list[Word] = []
+        for segment in segments:
+            assert segment.words is not None
+            words.extend(segment.words)
+        return words
 
     def offset(self, seconds: float) -> None:
         self.start += seconds
         self.end += seconds
 
-
-# TODO: use the `Word` from `faster-whisper.transcribe` instead
-@dataclass
-class Word(Segment):
-    probability: float = 0.0
-
     @classmethod
     def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
         i = 0
-        while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
+        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
             i += 1
         return a[:i]
+
+
+class Segment(BaseModel):
+    id: int
+    seek: int
+    start: float
+    end: float
+    text: str
+    tokens: list[int]
+    temperature: float
+    avg_logprob: float
+    compression_ratio: float
+    no_speech_prob: float
+    words: list[Word] | None
+
+    @classmethod
+    def from_faster_whisper_segments(cls, segments: Iterable[faster_whisper.transcribe.Segment]) -> Iterable[Segment]:
+        for segment in segments:
+            yield cls(
+                id=segment.id,
+                seek=segment.seek,
+                start=segment.start,
+                end=segment.end,
+                text=segment.text,
+                tokens=segment.tokens,
+                temperature=segment.temperature,
+                avg_logprob=segment.avg_logprob,
+                compression_ratio=segment.compression_ratio,
+                no_speech_prob=segment.no_speech_prob,
+                words=[
+                    Word(
+                        start=word.start,
+                        end=word.end,
+                        word=word.word,
+                        probability=word.probability,
+                    )
+                    for word in segment.words
+                ]
+                if segment.words is not None
+                else None,
+            )
 
 
 class Transcription:
@@ -45,7 +87,7 @@
 
     @property
     def text(self) -> str:
-        return " ".join(word.text for word in self.words).strip()
+        return " ".join(word.word for word in self.words).strip()
 
     @property
     def start(self) -> float:
@@ -77,48 +119,57 @@
                 raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
 
 
-def test_segment_is_eos() -> None:
-    assert not Segment("Hello").is_eos
-    assert not Segment("Hello...").is_eos
-    assert Segment("Hello.").is_eos
-    assert Segment("Hello!").is_eos
-    assert Segment("Hello?").is_eos
-    assert not Segment("Hello. Yo").is_eos
-    assert not Segment("Hello. Yo...").is_eos
-    assert Segment("Hello. Yo.").is_eos
+def is_eos(text: str) -> bool:
+    if text.endswith("..."):
+        return False
+    return any(text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
 
 
-def to_full_sentences(words: list[Word]) -> list[Segment]:
-    sentences: list[Segment] = [Segment("")]
+def test_is_eos() -> None:
+    assert not is_eos("Hello")
+    assert not is_eos("Hello...")
+    assert is_eos("Hello.")
+    assert is_eos("Hello!")
+    assert is_eos("Hello?")
+    assert not is_eos("Hello. Yo")
+    assert not is_eos("Hello. Yo...")
+    assert is_eos("Hello. Yo.")
+
+
+def to_full_sentences(words: list[Word]) -> list[list[Word]]:
+    sentences: list[list[Word]] = [[]]
     for word in words:
-        sentences[-1] = Segment(
-            start=sentences[-1].start,
-            end=word.end,
-            text=sentences[-1].text + word.text,
-        )
-        if word.is_eos:
-            sentences.append(Segment(""))
-    if len(sentences) > 0 and not sentences[-1].is_eos:
+        sentences[-1].append(word)
+        if is_eos(word.word):
+            sentences.append([])
+    if len(sentences[-1]) == 0 or not is_eos(sentences[-1][-1].word):
         sentences.pop()
     return sentences
 
 
 def tests_to_full_sentences() -> None:
+    def word(text: str) -> Word:
+        return Word(word=text, start=0.0, end=0.0, probability=0.0)
+
     assert to_full_sentences([]) == []
-    assert to_full_sentences([Word(text="Hello")]) == []
-    assert to_full_sentences([Word(text="Hello..."), Word(" world")]) == []
-    assert to_full_sentences([Word(text="Hello..."), Word(" world.")]) == [Segment(text="Hello... world.")]
-    assert to_full_sentences([Word(text="Hello..."), Word(" world."), Word(" How")]) == [
-        Segment(text="Hello... world.")
+    assert to_full_sentences([word(text="Hello")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world.")]) == [[word("Hello..."), word(" world.")]]
+    assert to_full_sentences([word(text="Hello..."), word(" world."), word(" How")]) == [
+        [word("Hello..."), word(" world.")],
     ]
 
 
-def to_text(words: list[Word]) -> str:
-    return "".join(word.text for word in words)
+def word_to_text(words: list[Word]) -> str:
+    return "".join(word.word for word in words)
 
 
-def to_text_w_ts(words: list[Word]) -> str:
-    return "".join(f"{word.text}({word.start:.2f}-{word.end:.2f})" for word in words)
+def words_to_text_w_ts(words: list[Word]) -> str:
+    return "".join(f"{word.word}({word.start:.2f}-{word.end:.2f})" for word in words)
+
+
+def segments_to_text(segments: Iterable[Segment]) -> str:
+    return "".join(segment.text for segment in segments).strip()
 
 
 def canonicalize_word(text: str) -> str:
@@ -136,14 +187,14 @@
 
 def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
     i = 0
-    while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
+    while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
         i += 1
     return a[:i]
 
 
 def test_common_prefix() -> None:
     def word(text: str) -> Word:
-        return Word(text=text, start=0.0, end=0.0, probability=0.0)
+        return Word(word=text, start=0.0, end=0.0, probability=0.0)
 
     a = [word("a"), word("b"), word("c")]
     b = [word("a"), word("b"), word("c")]
@@ -176,7 +227,7 @@
 
 def test_common_prefix_and_canonicalization() -> None:
     def word(text: str) -> Word:
-        return Word(text=text, start=0.0, end=0.0, probability=0.0)
+        return Word(word=text, start=0.0, end=0.0, probability=0.0)
 
     a = [word("A...")]
     b = [word("a?"), word("b"), word("c")]
faster_whisper_server/main.py
--- faster_whisper_server/main.py
+++ faster_whisper_server/main.py
@@ -24,7 +24,6 @@
 import huggingface_hub
 from pydantic import AfterValidator
 
-from faster_whisper_server import utils
 from faster_whisper_server.asr import FasterWhisperASR
 from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
@@ -34,6 +33,7 @@
     Task,
     config,
 )
+from faster_whisper_server.core import Segment, segments_to_text
 from faster_whisper_server.logger import logger
 from faster_whisper_server.server_models import (
     ModelListResponse,
@@ -46,7 +46,7 @@
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
 
-    from faster_whisper.transcribe import Segment, TranscriptionInfo
+    from faster_whisper.transcribe import TranscriptionInfo
     from huggingface_hub.hf_api import ModelInfo
 
 loaded_models: OrderedDict[str, WhisperModel] = OrderedDict()
@@ -157,7 +157,7 @@
 ) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse:
     segments = list(segments)
     if response_format == ResponseFormat.TEXT:  # noqa: RET503
-        return utils.segments_text(segments)
+        return segments_to_text(segments)
     elif response_format == ResponseFormat.JSON:
         return TranscriptionJsonResponse.from_segments(segments)
     elif response_format == ResponseFormat.VERBOSE_JSON:
@@ -220,6 +220,7 @@
         temperature=temperature,
         vad_filter=True,
     )
+    segments = Segment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
@@ -258,6 +259,7 @@
         vad_filter=True,
         hotwords=hotwords,
     )
+    segments = Segment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
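
Roughly, the endpoint-side flow after this change looks as follows (a sketch; the import location of ResponseFormat is assumed, and the real segments_to_response also handles the verbose format and streaming):

from faster_whisper_server.config import ResponseFormat  # assumed import location
from faster_whisper_server.core import Segment, segments_to_text
from faster_whisper_server.server_models import TranscriptionJsonResponse


def format_transcript(whisper_segments, response_format: ResponseFormat):
    """Sketch of the post-refactor formatting path in main.py."""
    segments = list(Segment.from_faster_whisper_segments(whisper_segments))
    if response_format == ResponseFormat.TEXT:
        return segments_to_text(segments)
    if response_format == ResponseFormat.JSON:
        return TranscriptionJsonResponse.from_segments(segments)
    raise NotImplementedError(response_format)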
faster_whisper_server/server_models.py
--- faster_whisper_server/server_models.py
+++ faster_whisper_server/server_models.py
@@ -4,12 +4,10 @@
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from faster_whisper_server import utils
+from faster_whisper_server.core import Segment, Transcription, Word, segments_to_text
 
 if TYPE_CHECKING:
-    from faster_whisper.transcribe import Segment, TranscriptionInfo, Word
-
-    from faster_whisper_server.core import Transcription
+    from faster_whisper.transcribe import TranscriptionInfo
 
 
 # https://platform.openai.com/docs/api-reference/audio/json-object
@@ -18,55 +16,11 @@
 
     @classmethod
     def from_segments(cls, segments: list[Segment]) -> TranscriptionJsonResponse:
-        return cls(text=utils.segments_text(segments))
+        return cls(text=segments_to_text(segments))
 
     @classmethod
     def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
         return cls(text=transcription.text)
-
-
-class WordObject(BaseModel):
-    start: float
-    end: float
-    word: str
-    probability: float
-
-    @classmethod
-    def from_word(cls, word: Word) -> WordObject:
-        return cls(
-            start=word.start,
-            end=word.end,
-            word=word.word,
-            probability=word.probability,
-        )
-
-
-class SegmentObject(BaseModel):
-    id: int
-    seek: int
-    start: float
-    end: float
-    text: str
-    tokens: list[int]
-    temperature: float
-    avg_logprob: float
-    compression_ratio: float
-    no_speech_prob: float
-
-    @classmethod
-    def from_segment(cls, segment: Segment) -> SegmentObject:
-        return cls(
-            id=segment.id,
-            seek=segment.seek,
-            start=segment.start,
-            end=segment.end,
-            text=segment.text,
-            tokens=segment.tokens,
-            temperature=segment.temperature,
-            avg_logprob=segment.avg_logprob,
-            compression_ratio=segment.compression_ratio,
-            no_speech_prob=segment.no_speech_prob,
-        )
 
 
 # https://platform.openai.com/docs/api-reference/audio/verbose-json-object
@@ -75,8 +29,8 @@
     language: str
     duration: float
     text: str
-    words: list[WordObject]
-    segments: list[SegmentObject]
+    words: list[Word]
+    segments: list[Segment]
 
     @classmethod
     def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
@@ -84,8 +38,8 @@
             language=transcription_info.language,
             duration=segment.end - segment.start,
             text=segment.text,
-            words=([WordObject.from_word(word) for word in segment.words] if isinstance(segment.words, list) else []),
-            segments=[SegmentObject.from_segment(segment)],
+            words=(segment.words if isinstance(segment.words, list) else []),
+            segments=[segment],
         )
 
     @classmethod
@@ -95,9 +49,9 @@
         return cls(
             language=transcription_info.language,
             duration=transcription_info.duration,
-            text=utils.segments_text(segments),
-            segments=[SegmentObject.from_segment(segment) for segment in segments],
-            words=[WordObject.from_word(word) for word in utils.words_from_segments(segments)],
+            text=segments_to_text(segments),
+            segments=segments,
+            words=Word.from_segments(segments),
         )
 
     @classmethod
@@ -106,15 +60,7 @@
             language="english",  # FIX: hardcoded
             duration=transcription.duration,
             text=transcription.text,
-            words=[
-                WordObject(
-                    start=word.start,
-                    end=word.end,
-                    word=word.text,
-                    probability=word.probability,
-                )
-                for word in transcription.words
-            ],
+            words=transcription.words,
             segments=[],  # FIX: hardcoded
         )
 
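Since Segment and Word are now pydantic models, the verbose response can hold them directly instead of copying into SegmentObject/WordObject. A hand-built sketch (field values are made up; any remaining defaulted fields are left out):

from faster_whisper_server.core import Segment, Word
from faster_whisper_server.server_models import TranscriptionVerboseJsonResponse

word = Word(word=" Hello.", start=0.0, end=0.5, probability=0.99)
segment = Segment(
    id=0, seek=0, start=0.0, end=0.5, text=" Hello.", tokens=[],
    temperature=0.0, avg_logprob=-0.1, compression_ratio=1.0,
    no_speech_prob=0.01, words=[word],
)
response = TranscriptionVerboseJsonResponse(
    language="en", duration=0.5, text="Hello.", words=[word], segments=[segment],
)
print(response.model_dump_json())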
faster_whisper_server/transcriber.py
--- faster_whisper_server/transcriber.py
+++ faster_whisper_server/transcriber.py
@@ -4,12 +4,7 @@
 
 from faster_whisper_server.audio import Audio, AudioStream
 from faster_whisper_server.config import config
-from faster_whisper_server.core import (
-    Transcription,
-    Word,
-    common_prefix,
-    to_full_sentences,
-)
+from faster_whisper_server.core import Transcription, Word, common_prefix, to_full_sentences, word_to_text
 from faster_whisper_server.logger import logger
 
 if TYPE_CHECKING:
@@ -37,30 +32,16 @@
 
         return prefix
 
-    @classmethod
-    def prompt(cls, confirmed: Transcription) -> str | None:
-        sentences = to_full_sentences(confirmed.words)
-        if len(sentences) == 0:
-            return None
-        return sentences[-1].text
 
-    # TODO: better name
-    @classmethod
-    def needs_audio_after(cls, confirmed: Transcription) -> float:
-        full_sentences = to_full_sentences(confirmed.words)
-        return full_sentences[-1].end if len(full_sentences) > 0 else 0.0
-
-
+# TODO: needs a better name
 def needs_audio_after(confirmed: Transcription) -> float:
     full_sentences = to_full_sentences(confirmed.words)
-    return full_sentences[-1].end if len(full_sentences) > 0 else 0.0
+    return full_sentences[-1][-1].end if len(full_sentences) > 0 else 0.0
 
 
 def prompt(confirmed: Transcription) -> str | None:
     sentences = to_full_sentences(confirmed.words)
-    if len(sentences) == 0:
-        return None
-    return sentences[-1].text
+    return word_to_text(sentences[-1]) if len(sentences) > 0 else None
 
 
 async def audio_transcriber(
 
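To illustrate the shape change: to_full_sentences now returns a list of word lists rather than Segments, so prompt and needs_audio_after read the last sentence like this (a sketch with made-up timings):

from faster_whisper_server.core import Word, to_full_sentences, word_to_text

confirmed_words = [
    Word(word="Hi.", start=0.0, end=0.3, probability=0.99),
    Word(word=" Bye", start=0.3, end=0.6, probability=0.9),
]
sentences = to_full_sentences(confirmed_words)  # [[ the "Hi." word ]]; " Bye" is not a complete sentence yet
prompt_text = word_to_text(sentences[-1]) if len(sentences) > 0 else None  # "Hi."
cut_after = sentences[-1][-1].end if len(sentences) > 0 else 0.0  # 0.3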
faster_whisper_server/utils.py (deleted)
--- faster_whisper_server/utils.py
@@ -1,14 +0,0 @@
-from faster_whisper.transcribe import Segment, Word
-
-
-def segments_text(segments: list[Segment]) -> str:
-    return "".join(segment.text for segment in segments).strip()
-
-
-def words_from_segments(segments: list[Segment]) -> list[Word]:
-    words = []
-    for segment in segments:
-        if segment.words is None:
-            continue
-        words.extend(segment.words)
-    return words
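
The two deleted helpers now have equivalents in core. A sketch of the mapping (note that Word.from_segments asserts that word timestamps are present, whereas words_from_segments skipped segments without them):

from faster_whisper_server.core import Segment, Word, segments_to_text


def old_style_outputs(segments: list[Segment]) -> tuple[str, list[Word]]:
    # was: utils.segments_text(segments) and utils.words_from_segments(segments)
    return segments_to_text(segments), Word.from_segments(segments)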