Commit @e827e03d06343caa1ae9843877e9de7ba912e59c - yjyoon/whisper_server

Fedir Zadniprovskyi 01-10

fix: gradio app breaks on arm

@e827e03d06343caa1ae9843877e9de7ba912e59c

64c033f

e827e03

src/faster_whisper_server/gradio_app.py

--- src/faster_whisper_server/gradio_app.py

+++ src/faster_whisper_server/gradio_app.py


 from collections.abc import AsyncGenerator
 from pathlib import Path
+import platform
 
 import gradio as gr
 import httpx

 
 from faster_whisper_server.config import Config, Task
 from faster_whisper_server.hf_utils import PiperModel
-
-# FIX: this won't work on ARM
-from faster_whisper_server.routers.speech import (
-    DEFAULT_VOICE,
-    MAX_SAMPLE_RATE,
-    MIN_SAMPLE_RATE,
-    SUPPORTED_RESPONSE_FORMATS,
-)
 
 TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
 TRANSLATION_ENDPOINT = "/v1/audio/translations"

             )
 
         with gr.Tab(label="Speech Generation"):
-            # TODO: add warning about ARM
-            text = gr.Textbox(label="Input Text")
-            voice_dropdown = gr.Dropdown(
-                choices=["en_US-amy-medium"],
-                label="Voice",
-                value="en_US-amy-medium",
-                info="""
+            if platform.machine() != "x86_64":
+                from faster_whisper_server.routers.speech import (
+                    DEFAULT_VOICE,
+                    MAX_SAMPLE_RATE,
+                    MIN_SAMPLE_RATE,
+                    SUPPORTED_RESPONSE_FORMATS,
+                )
+
+                text = gr.Textbox(label="Input Text")
+                voice_dropdown = gr.Dropdown(
+                    choices=["en_US-amy-medium"],
+                    label="Voice",
+                    value="en_US-amy-medium",
+                    info="""
 The last part of the voice name is the quality (x_low, low, medium, high).
 Each quality has a different default sample rate:
 - x_low: 16000 Hz

 - medium: 22050 Hz
 - high: 22050 Hz
 """,
-            )
-            response_fromat_dropdown = gr.Dropdown(
-                choices=SUPPORTED_RESPONSE_FORMATS,
-                label="Response Format",
-                value="wav",
-            )
-            speed_slider = gr.Slider(minimum=0.25, maximum=4.0, step=0.05, label="Speed", value=1.0)
-            sample_rate_slider = gr.Number(
-                minimum=MIN_SAMPLE_RATE,
-                maximum=MAX_SAMPLE_RATE,
-                label="Desired Sample Rate",
-                info="""
+                )
+                response_fromat_dropdown = gr.Dropdown(
+                    choices=SUPPORTED_RESPONSE_FORMATS,
+                    label="Response Format",
+                    value="wav",
+                )
+                speed_slider = gr.Slider(minimum=0.25, maximum=4.0, step=0.05, label="Speed", value=1.0)
+                sample_rate_slider = gr.Number(
+                    minimum=MIN_SAMPLE_RATE,
+                    maximum=MAX_SAMPLE_RATE,
+                    label="Desired Sample Rate",
+                    info="""
 Setting this will resample the generated audio to the desired sample rate.
 You may want to set this if you are going to use voices of different qualities but want to keep the same sample rate.
 Default: None (No resampling)
 """,
-                value=lambda: None,
-            )
-            button = gr.Button("Generate Speech")
-            output = gr.Audio(type="filepath")
-            button.click(
-                handle_audio_speech,
-                [text, voice_dropdown, response_fromat_dropdown, speed_slider, sample_rate_slider],
-                output,
-            )
+                    value=lambda: None,
+                )
+                button = gr.Button("Generate Speech")
+                output = gr.Audio(type="filepath")
+                button.click(
+                    handle_audio_speech,
+                    [text, voice_dropdown, response_fromat_dropdown, speed_slider, sample_rate_slider],
+                    output,
+                )
+                demo.load(update_piper_voices_dropdown, inputs=None, outputs=voice_dropdown)
+            else:
+                gr.Textbox("Speech generation is only supported on x86_64 machines.")
 
         demo.load(update_whisper_model_dropdown, inputs=None, outputs=model_dropdown)
-        demo.load(update_piper_voices_dropdown, inputs=None, outputs=voice_dropdown)
     return demo

Add a comment

Open 0
Closed 0

List

...	...	@@ -1,5 +1,6 @@
1	1	from collections.abc import AsyncGenerator
2	2	from pathlib import Path
	3	+import platform
3	4
4	5	import gradio as gr
5	6	import httpx
...	...	@@ -8,14 +9,6 @@
8	9
9	10	from faster_whisper_server.config import Config, Task
10	11	from faster_whisper_server.hf_utils import PiperModel
11		-
12		-# FIX: this won't work on ARM
13		-from faster_whisper_server.routers.speech import (
14		- DEFAULT_VOICE,
15		- MAX_SAMPLE_RATE,
16		- MIN_SAMPLE_RATE,
17		- SUPPORTED_RESPONSE_FORMATS,
18		-)
19	12
20	13	TRANSCRIPTION_ENDPOINT = "/v1/audio/transcriptions"
21	14	TRANSLATION_ENDPOINT = "/v1/audio/translations"
...	...	@@ -163,13 +156,20 @@
163	156	)
164	157
165	158	with gr.Tab(label="Speech Generation"):
166		- # TODO: add warning about ARM
167		- text = gr.Textbox(label="Input Text")
168		- voice_dropdown = gr.Dropdown(
169		- choices=["en_US-amy-medium"],
170		- label="Voice",
171		- value="en_US-amy-medium",
172		- info="""
	159	+ if platform.machine() != "x86_64":
	160	+ from faster_whisper_server.routers.speech import (
	161	+ DEFAULT_VOICE,
	162	+ MAX_SAMPLE_RATE,
	163	+ MIN_SAMPLE_RATE,
	164	+ SUPPORTED_RESPONSE_FORMATS,
	165	+ )
	166	+
	167	+ text = gr.Textbox(label="Input Text")
	168	+ voice_dropdown = gr.Dropdown(
	169	+ choices=["en_US-amy-medium"],
	170	+ label="Voice",
	171	+ value="en_US-amy-medium",
	172	+ info="""
173	173	The last part of the voice name is the quality (x_low, low, medium, high).
174	174	Each quality has a different default sample rate:
175	175	- x_low: 16000 Hz
...	...	@@ -177,32 +177,34 @@
177	177	- medium: 22050 Hz
178	178	- high: 22050 Hz
179	179	""",
180		- )
181		- response_fromat_dropdown = gr.Dropdown(
182		- choices=SUPPORTED_RESPONSE_FORMATS,
183		- label="Response Format",
184		- value="wav",
185		- )
186		- speed_slider = gr.Slider(minimum=0.25, maximum=4.0, step=0.05, label="Speed", value=1.0)
187		- sample_rate_slider = gr.Number(
188		- minimum=MIN_SAMPLE_RATE,
189		- maximum=MAX_SAMPLE_RATE,
190		- label="Desired Sample Rate",
191		- info="""
	180	+ )
	181	+ response_fromat_dropdown = gr.Dropdown(
	182	+ choices=SUPPORTED_RESPONSE_FORMATS,
	183	+ label="Response Format",
	184	+ value="wav",
	185	+ )
	186	+ speed_slider = gr.Slider(minimum=0.25, maximum=4.0, step=0.05, label="Speed", value=1.0)
	187	+ sample_rate_slider = gr.Number(
	188	+ minimum=MIN_SAMPLE_RATE,
	189	+ maximum=MAX_SAMPLE_RATE,
	190	+ label="Desired Sample Rate",
	191	+ info="""
192	192	Setting this will resample the generated audio to the desired sample rate.
193	193	You may want to set this if you are going to use voices of different qualities but want to keep the same sample rate.
194	194	Default: None (No resampling)
195	195	""",
196		- value=lambda: None,
197		- )
198		- button = gr.Button("Generate Speech")
199		- output = gr.Audio(type="filepath")
200		- button.click(
201		- handle_audio_speech,
202		- [text, voice_dropdown, response_fromat_dropdown, speed_slider, sample_rate_slider],
203		- output,
204		- )
	196	+ value=lambda: None,
	197	+ )
	198	+ button = gr.Button("Generate Speech")
	199	+ output = gr.Audio(type="filepath")
	200	+ button.click(
	201	+ handle_audio_speech,
	202	+ [text, voice_dropdown, response_fromat_dropdown, speed_slider, sample_rate_slider],
	203	+ output,
	204	+ )
	205	+ demo.load(update_piper_voices_dropdown, inputs=None, outputs=voice_dropdown)
	206	+ else:
	207	+ gr.Textbox("Speech generation is only supported on x86_64 machines.")
205	208
206	209	demo.load(update_whisper_model_dropdown, inputs=None, outputs=model_dropdown)
207		- demo.load(update_piper_voices_dropdown, inputs=None, outputs=voice_dropdown)
208	210	return demo

Delete comment