

feat: gradio speech generation tab
@9cf1e387e26ffd749313673f0aa31470aa090ef3
--- src/faster_whisper_server/gradio_app.py
+++ src/faster_whisper_server/gradio_app.py
@@ -7,6 +7,15 @@
 from openai import OpenAI
 
 from faster_whisper_server.config import Config, Task
+from faster_whisper_server.hf_utils import PiperModel
+
+# FIX: this won't work on ARM
+from faster_whisper_server.routers.speech import (
+    DEFAULT_VOICE,
+    MAX_SAMPLE_RATE,
+    MIN_SAMPLE_RATE,
+    SUPPORTED_RESPONSE_FORMATS,
+)
 
 TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
 TRANSLATION_ENDPOINT = "/v1/audio/translations"
@@ -14,12 +23,15 @@
 TIMEOUT = httpx.Timeout(timeout=TIMEOUT_SECONDS)
 
 
-def create_gradio_demo(config: Config) -> gr.Blocks:
+def create_gradio_demo(config: Config) -> gr.Blocks:  # noqa: C901, PLR0915
     base_url = f"http://{config.host}:{config.port}"
     http_client = httpx.Client(base_url=base_url, timeout=TIMEOUT)
     openai_client = OpenAI(base_url=f"{base_url}/v1", api_key="cant-be-empty")
 
-    def handler(file_path: str, model: str, task: Task, temperature: float, stream: bool) -> Generator[str, None, None]:
+    # TODO: make async
+    def whisper_handler(
+        file_path: str, model: str, task: Task, temperature: float, stream: bool
+    ) -> Generator[str, None, None]:
         if task == Task.TRANSCRIBE:
             endpoint = TRANSCRIPTION_ENDPOINT
         elif task == Task.TRANSLATE:
@@ -65,7 +77,7 @@
             for event in event_source.iter_sse():
                 yield event.data
 
-    def update_model_dropdown() -> gr.Dropdown:
+    def update_whisper_model_dropdown() -> gr.Dropdown:
         models = openai_client.models.list().data
         model_names: list[str] = [model.id for model in models]
         assert config.whisper.model in model_names
@@ -73,37 +85,100 @@
         other_models = [model for model in model_names if model not in recommended_models]
         model_names = list(recommended_models) + other_models
         return gr.Dropdown(
-            # no idea why it's complaining
-            choices=model_names,  # pyright: ignore[reportArgumentType]
+            choices=model_names,
             label="Model",
             value=config.whisper.model,
         )
 
-    model_dropdown = gr.Dropdown(
-        choices=[config.whisper.model],
-        label="Model",
-        value=config.whisper.model,
-    )
-    task_dropdown = gr.Dropdown(
-        choices=[task.value for task in Task],
-        label="Task",
-        value=Task.TRANSCRIBE,
-    )
-    temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0)
-    stream_checkbox = gr.Checkbox(label="Stream", value=True)
-    with gr.Interface(
-        title="Whisper Playground",
-        description="""Consider supporting the project by starring the <a href="https://github.com/fedirz/faster-whisper-server">repository on GitHub</a>.""",  # noqa: E501
-        inputs=[
-            gr.Audio(type="filepath"),
-            model_dropdown,
-            task_dropdown,
-            temperature_slider,
-            stream_checkbox,
-        ],
-        fn=handler,
-        outputs="text",
-        analytics_enabled=False,  # disable telemetry
-    ) as demo:
-        demo.load(update_model_dropdown, inputs=None, outputs=model_dropdown)
+    def update_piper_voices_dropdown() -> gr.Dropdown:
+        res = http_client.get("/v1/audio/speech/voices").raise_for_status()
+        piper_models = [PiperModel.model_validate(x) for x in res.json()]
+        return gr.Dropdown(choices=[model.voice for model in piper_models], label="Voice", value=DEFAULT_VOICE)
+
+    # TODO: make async
+    def handle_audio_speech(text: str, voice: str, response_format: str, speed: float, sample_rate: int | None) -> Path:
+        res = openai_client.audio.speech.create(
+            input=text,
+            model="piper",
+            voice=voice,  # pyright: ignore[reportArgumentType]
+            response_format=response_format,  # pyright: ignore[reportArgumentType]
+            speed=speed,
+            extra_body={"sample_rate": sample_rate},
+        )
+        audio_bytes = res.response.read()
+        file_path = Path(f"audio.{response_format}")
+        with file_path.open("wb") as file:
+            file.write(audio_bytes)
+        return file_path
+
+    with gr.Blocks(title="faster-whisper-server Playground") as demo:
+        gr.Markdown(
+            "### Consider supporting the project by starring the [repository on GitHub](https://github.com/fedirz/faster-whisper-server)."
+        )
+        with gr.Tab(label="Transcribe/Translate"):
+            audio = gr.Audio(type="filepath")
+            model_dropdown = gr.Dropdown(
+                choices=[config.whisper.model],
+                label="Model",
+                value=config.whisper.model,
+            )
+            task_dropdown = gr.Dropdown(
+                choices=[task.value for task in Task],
+                label="Task",
+                value=Task.TRANSCRIBE,
+            )
+            temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label="Temperature", value=0.0)
+            stream_checkbox = gr.Checkbox(label="Stream", value=True)
+            button = gr.Button("Generate")
+
+            output = gr.Textbox()
+
+            # NOTE: the inputs order must match the `whisper_handler` signature
+            button.click(
+                whisper_handler, [audio, model_dropdown, task_dropdown, temperature_slider, stream_checkbox], output
+            )
+
+        with gr.Tab(label="Speech Generation"):
+            # TODO: add warning about ARM
+            text = gr.Textbox(label="Input Text")
+            voice_dropdown = gr.Dropdown(
+                choices=["en_US-amy-medium"],
+                label="Voice",
+                value="en_US-amy-medium",
+                info="""
+The last part of the voice name is the quality (x_low, low, medium, high).
+Each quality has a different default sample rate:
+- x_low: 16000 Hz
+- low: 16000 Hz
+- medium: 22050 Hz
+- high: 22050 Hz
+""",
+            )
+            response_format_dropdown = gr.Dropdown(
+                choices=SUPPORTED_RESPONSE_FORMATS,
+                label="Response Format",
+                value="wav",
+            )
+            speed_slider = gr.Slider(minimum=0.25, maximum=4.0, step=0.05, label="Speed", value=1.0)
+            sample_rate_slider = gr.Number(
+                minimum=MIN_SAMPLE_RATE,
+                maximum=MAX_SAMPLE_RATE,
+                label="Desired Sample Rate",
+                info="""
+Setting this will resample the generated audio to the desired sample rate.
+You may want to set this if you are going to use voices of different qualities but want to keep the same sample rate.
+Default: None (no resampling)
+""",
+                value=lambda: None,
+            )
+            button = gr.Button("Generate Speech")
+            output = gr.Audio(type="filepath")
+            button.click(
+                handle_audio_speech,
+                [text, voice_dropdown, response_format_dropdown, speed_slider, sample_rate_slider],
+                output,
+            )
+
+        demo.load(update_whisper_model_dropdown, inputs=None, outputs=model_dropdown)
+        demo.load(update_piper_voices_dropdown, inputs=None, outputs=voice_dropdown)
     return demo
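
Note for reviewers: a minimal smoke test for the new tab's backend path, mirroring `handle_audio_speech` above. The host and port are assumptions (whatever `config.host`/`config.port` resolve to on your setup), and the `res.response.read()` pattern is the same one the handler uses:

    from openai import OpenAI

    # Assumed server address; the real one comes from `config.host`/`config.port`.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="cant-be-empty")
    res = client.audio.speech.create(
        input="Hello from the speech generation tab!",
        model="piper",
        voice="en_US-amy-medium",
        response_format="wav",
        speed=1.0,
        extra_body={"sample_rate": 16000},  # optional resampling, as in the handler
    )
    # Write the raw audio bytes to disk, like `handle_audio_speech` does.
    with open("audio.wav", "wb") as f:
        f.write(res.response.read())
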
--- src/faster_whisper_server/hf_utils.py
+++ src/faster_whisper_server/hf_utils.py
@@ -1,5 +1,5 @@
 from collections.abc import Generator
-from functools import lru_cache
+from functools import cached_property, lru_cache
 import json
 import logging
 from pathlib import Path
@@ -8,7 +8,7 @@
 
 import huggingface_hub
 from huggingface_hub.constants import HF_HUB_CACHE
-from pydantic import BaseModel
+from pydantic import BaseModel, Field, computed_field
 
 from faster_whisper_server.api_models import Model
 
@@ -95,13 +95,51 @@
         yield transformed_model
 
 
+PiperVoiceQuality = Literal["x_low", "low", "medium", "high"]
+PIPER_VOICE_QUALITY_SAMPLE_RATE_MAP: dict[PiperVoiceQuality, int] = {
+    "x_low": 16000,
+    "low": 16000,
+    "medium": 22050,
+    "high": 22050,
+}
+
+
 class PiperModel(BaseModel):
-    id: str
+    """Similar structure to the GET /v1/models response but with extra fields."""
+
     object: Literal["model"] = "model"
     created: int
     owned_by: Literal["rhasspy"] = "rhasspy"
-    path: Path
-    config_path: Path
+    model_path: Path = Field(
+        examples=[
+            "/home/nixos/.cache/huggingface/hub/models--rhasspy--piper-voices/snapshots/3d796cc2f2c884b3517c527507e084f7bb245aea/en/en_US/amy/medium/en_US-amy-medium.onnx"
+        ]
+    )
+
+    @computed_field(examples=["rhasspy/piper-voices/en_US-amy-medium"])
+    @cached_property
+    def id(self) -> str:
+        return f"rhasspy/piper-voices/{self.model_path.name.removesuffix('.onnx')}"
+
+    @computed_field(examples=["en_US-amy-medium"])
+    @cached_property
+    def voice(self) -> str:
+        return self.model_path.name.removesuffix(".onnx")
+
+    @computed_field
+    @cached_property
+    def config_path(self) -> Path:
+        return Path(str(self.model_path) + ".json")
+
+    @computed_field
+    @cached_property
+    def quality(self) -> PiperVoiceQuality:
+        return self.id.split("-")[-1]  # pyright: ignore[reportReturnType]
+
+    @computed_field
+    @cached_property
+    def sample_rate(self) -> int:
+        return PIPER_VOICE_QUALITY_SAMPLE_RATE_MAP[self.quality]
 
 
 def get_model_path(model_id: str, *, cache_dir: str | Path | None = None) -> Path | None:
@@ -151,12 +189,9 @@
 def list_piper_models() -> Generator[PiperModel, None, None]:
     model_weights_files = list_model_files("rhasspy/piper-voices", glob_pattern="**/*.onnx")
     for model_weights_file in model_weights_files:
-        model_config_file = model_weights_file.with_suffix(".json")
         yield PiperModel(
-            id=model_weights_file.name,
             created=int(model_weights_file.stat().st_mtime),
-            path=model_weights_file,
-            config_path=model_config_file,
+            model_path=model_weights_file,
         )
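
Note for reviewers: a quick sanity-check sketch of the new computed fields, assuming `PiperModel` is imported from `faster_whisper_server.hf_utils`; the `model_path` and `created` values below are hypothetical:

    from pathlib import Path

    from faster_whisper_server.hf_utils import PiperModel

    model = PiperModel(
        created=1700000000,  # placeholder mtime
        model_path=Path("/models/en/en_US/amy/medium/en_US-amy-medium.onnx"),
    )
    # `voice` is the weights file name without the ".onnx" suffix.
    assert model.voice == "en_US-amy-medium"
    assert model.id == "rhasspy/piper-voices/en_US-amy-medium"
    # `quality` is the last "-"-separated part of the id.
    assert model.quality == "medium"
    assert model.sample_rate == 22050
    # Piper config files live next to the weights as `<name>.onnx.json`.
    assert model.config_path == Path("/models/en/en_US/amy/medium/en_US-amy-medium.onnx.json")
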
--- src/faster_whisper_server/routers/speech.py
+++ src/faster_whisper_server/routers/speech.py
@@ -12,7 +12,11 @@
 import soundfile as sf
 
 from faster_whisper_server.dependencies import PiperModelManagerDependency
-from faster_whisper_server.hf_utils import read_piper_voices_config
+from faster_whisper_server.hf_utils import (
+    PiperModel,
+    list_piper_models,
+    read_piper_voices_config,
+)
 
 DEFAULT_MODEL = "piper"
 # https://platform.openai.com/docs/api-reference/audio/createSpeech#audio-createspeech-response_format
@@ -126,6 +130,14 @@
         ],
     )
     voice: Voice = DEFAULT_VOICE
+    """
+The last part of the voice name is the quality (x_low, low, medium, high).
+Each quality has a different default sample rate:
+- x_low: 16000 Hz
+- low: 16000 Hz
+- medium: 22050 Hz
+- high: 22050 Hz
+    """
     response_format: ResponseFormat = Field(
         DEFAULT_RESPONSE_FORMAT,
         description=f"The format to audio in. Supported formats are {", ".join(SUPPORTED_RESPONSE_FORMATS)}. {", ".join(UNSUPORTED_RESPONSE_FORMATS)} are not supported",  # noqa: E501
@@ -136,6 +148,7 @@
     """The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default."""
     sample_rate: int | None = Field(None, ge=MIN_SAMPLE_RATE, le=MAX_SAMPLE_RATE)
     """Desired sample rate to convert the generated audio to. If not provided, the model's default sample rate will be used."""  # noqa: E501
+    # TODO: document default sample rate for each voice quality
 
     # TODO: move into `Voice`
     @model_validator(mode="after")
@@ -163,3 +176,8 @@
         )
 
     return StreamingResponse(audio_generator, media_type=f"audio/{body.response_format}")
+
+
+@router.get("/v1/audio/speech/voices")
+def list_voices() -> list[PiperModel]:
+    return list(list_piper_models())
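
Note for reviewers: pydantic's `computed_field` values are included when FastAPI serializes the response, so the new endpoint also reports each voice's quality and native sample rate. A hedged usage sketch, again assuming the server runs on `localhost:8000`:

    import httpx

    res = httpx.get("http://localhost:8000/v1/audio/speech/voices")
    res.raise_for_status()
    for voice in res.json():
        # `voice`, `quality`, and `sample_rate` are computed fields on `PiperModel`.
        print(voice["voice"], voice["quality"], voice["sample_rate"])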