Fedir Zadniprovskyi 01-10
chore: update volume names and mount points
@31537d182e79f198a6083d01495da7670d0b3e27
Dockerfile
--- Dockerfile
+++ Dockerfile
@@ -29,7 +29,7 @@
 # Creating a directory for the cache to avoid the following error:
 # PermissionError: [Errno 13] Permission denied: '/home/ubuntu/.cache/huggingface/hub'
 # This error occurs because the volume is mounted as root and the `ubuntu` user doesn't have permission to write to it. Pre-creating the directory solves this issue.
-RUN mkdir -p $HOME/.cache/huggingface
+RUN mkdir -p $HOME/.cache/huggingface/hub
 ENV WHISPER__MODEL=Systran/faster-whisper-large-v3
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
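Since the fix hinges on the pre-created `hub` directory staying writable by the non-root `ubuntu` user even when a fresh named volume is mounted over it, a quick smoke test can confirm the behavior (a sketch, assuming `sh` is available in the image, which its Ubuntu base suggests):

```bash
# Mount a named volume over the pre-created directory, then check ownership and writability.
docker run --rm --entrypoint sh \
  --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub \
  fedirz/faster-whisper-server:latest-cpu \
  -c 'ls -ld "$HOME/.cache/huggingface/hub" && touch "$HOME/.cache/huggingface/hub/.write-test"'
```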
README.md
--- README.md
+++ README.md
@@ -49,9 +49,9 @@
 
 ```bash
 # for GPU support
-docker run --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --detach fedirz/faster-whisper-server:latest-cuda
+docker run --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --detach fedirz/faster-whisper-server:latest-cuda
 # for CPU only (use this if you don't have a GPU, as the image is much smaller)
-docker run --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER__MODEL=Systran/faster-whisper-small --detach fedirz/faster-whisper-server:latest-cpu
+docker run --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=Systran/faster-whisper-small --detach fedirz/faster-whisper-server:latest-cpu
 ```
 
 ### Using Kubernetes
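Note that unlike the old `~/.cache/huggingface` bind mount, the model cache now lives in a Docker-managed named volume. It is created automatically on first run, and the usual volume commands apply:

```bash
docker volume inspect hf-hub-cache   # show where Docker keeps the cached models
docker volume rm hf-hub-cache        # delete the cache (models will be re-downloaded)
```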
audio.wav (Binary)
--- audio.wav
+++ audio.wav
Binary file is not shown
compose.cpu.yaml
--- compose.cpu.yaml
+++ compose.cpu.yaml
@@ -12,6 +12,6 @@
     environment:
       - WHISPER__MODEL=Systran/faster-whisper-small
     volumes:
-      - hugging_face_cache:/root/.cache/huggingface
+      - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
 volumes:
-  hugging_face_cache:
+  hf-hub-cache:
compose.cuda-cdi.yaml
--- compose.cuda-cdi.yaml
+++ compose.cuda-cdi.yaml
@@ -9,7 +9,7 @@
       file: compose.cuda.yaml
       service: faster-whisper-server
     volumes:
-      - hugging_face_cache:/root/.cache/huggingface
+      - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
     deploy:
       resources:
         reservations:
@@ -21,4 +21,4 @@
               device_ids:
                 - nvidia.com/gpu=all
 volumes:
-  hugging_face_cache:
+  hf-hub-cache:
compose.cuda.yaml
--- compose.cuda.yaml
+++ compose.cuda.yaml
@@ -12,11 +12,11 @@
     environment:
       - WHISPER__MODEL=Systran/faster-whisper-large-v3
     volumes:
-      - hugging_face_cache:/root/.cache/huggingface
+      - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
     deploy:
       resources:
         reservations:
           devices:
             - capabilities: ["gpu"]
 volumes:
-  hugging_face_cache:
+  hf-hub-cache:
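Usage of the compose variants is unchanged; for example:

```bash
# Pick the file matching your hardware; the hf-hub-cache volume is created on first run.
docker compose --file compose.cuda.yaml up --detach
docker compose --file compose.cuda.yaml down   # add --volumes to also drop the model cache
```

Since all three files declare the same `hf-hub-cache` volume and live in the same directory (and therefore the same default Compose project), the downloaded models should be shared between the CPU and CUDA variants.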
docs/installation.md
--- docs/installation.md
+++ docs/installation.md
@@ -13,14 +13,14 @@
         ports:
           - 8000:8000
         volumes:
-          - hugging_face_cache:/root/.cache/huggingface
+          - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
         deploy:
           resources:
             reservations:
               devices:
                 - capabilities: ["gpu"]
     volumes:
-      hugging_face_cache:
+      hf-hub-cache:
     ```
 
 === "CUDA (with CDI feature enabled)"
@@ -35,7 +35,7 @@
         ports:
           - 8000:8000
         volumes:
-          - hugging_face_cache:/root/.cache/huggingface
+          - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
         deploy:
           resources:
             reservations:
@@ -46,7 +46,7 @@
                   device_ids:
                   - nvidia.com/gpu=all
     volumes:
-      hugging_face_cache:
+      hf-hub-cache:
     ```
 
 === "CPU"
@@ -60,9 +60,9 @@
         ports:
           - 8000:8000
         volumes:
-          - hugging_face_cache:/root/.cache/huggingface
+          - hf-hub-cache:/home/ubuntu/.cache/huggingface/hub
     volumes:
-      hugging_face_cache:
+      hf-hub-cache:
     ```
 
 ## Docker
@@ -70,19 +70,19 @@
 === "CUDA"
 
     ```bash
-    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hugging_face_cache:/root/.cache/huggingface --gpus=all fedirz/faster-whisper-server:latest-cuda
+    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --gpus=all fedirz/faster-whisper-server:latest-cuda
     ```
 
 === "CUDA (with CDI feature enabled)"
 
     ```bash
-    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hugging_face_cache:/root/.cache/huggingface --device=nvidia.com/gpu=all fedirz/faster-whisper-server:latest-cuda
+    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --device=nvidia.com/gpu=all fedirz/faster-whisper-server:latest-cuda
     ```
 
 === "CPU"
 
     ```bash
-    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hugging_face_cache:/root/.cache/huggingface fedirz/faster-whisper-server:latest-cpu
+    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub fedirz/faster-whisper-server:latest-cpu
     ```
 
 ## Kubernetes
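If you already have models cached from the previous bind-mount setup, they can be seeded into the new named volume to avoid re-downloading. A one-off copy might look like this (the `alpine` helper image is an illustrative choice, not part of the project):

```bash
# Copy the host-side Hugging Face hub cache into the hf-hub-cache volume.
docker run --rm \
  --volume ~/.cache/huggingface/hub:/src:ro \
  --volume hf-hub-cache:/dst \
  alpine sh -c 'cp -a /src/. /dst/'
```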
examples/live-audio/script.sh
--- examples/live-audio/script.sh
+++ examples/live-audio/script.sh
@@ -10,9 +10,9 @@
 export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en for faster inference if you are running on a CPU.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready, or watch the logs with `docker logs -f <container_id>`.
-docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cuda
+docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cuda
 # or you can run it on a CPU
-# docker run --detach --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cpu
+# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cpu
 
 # `pv` is used to limit the rate at which the audio is streamed to the server. Audio is streamed at 32 kB/s (16000 samples per second * 16 bits per sample / 8 bits per byte = 32000 bytes per second). This emulates live audio input from a microphone: `ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le -`
 # shellcheck disable=SC2002
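Rather than waiting a fixed minute, the readiness check mentioned in the script's comment can be polled; a minimal sketch using the same `/health` endpoint:

```bash
# Block until the server reports healthy.
until curl --silent --fail localhost:8000/health > /dev/null; do
  echo "waiting for faster-whisper-server..."
  sleep 1
done
```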
examples/youtube/script.sh
--- examples/youtube/script.sh
+++ examples/youtube/script.sh
@@ -6,9 +6,9 @@
 export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en for faster inference if you are running on a CPU.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready, or watch the logs with `docker logs -f <container_id>`.
-docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cuda
+docker run --detach --gpus=all --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cuda
 # or you can run it on a CPU
-# docker run --detach --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cpu
+# docker run --detach --publish 8000:8000 --volume hf-hub-cache:/home/ubuntu/.cache/huggingface/hub --env WHISPER__MODEL=$WHISPER__MODEL fedirz/faster-whisper-server:latest-cpu
 
 # Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by the Asianometry YouTube channel. I highly recommend checking this channel out; the guy produces very high-quality content. If you don't have `youtube-dl`, you'll have to install it: https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
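For context on where the script heads next: the downloaded audio would be posted to the server for transcription. A hedged sketch, assuming the OpenAI-compatible `/v1/audio/transcriptions` route that faster-whisper-server advertises:

```bash
curl --silent localhost:8000/v1/audio/transcriptions \
  --form "file=@the-evolution-of-the-operating-system.mp3" \
  --form "model=$WHISPER__MODEL"
```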