
File name
Commit message
Commit date
01-12
01-12
File name
Commit message
Commit date
from collections.abc import AsyncGenerator
import logging
import time
from typing import Literal
from kokoro_onnx import Kokoro
import numpy as np
from speaches.audio import resample_audio
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# The default sample rate for Kokoro.
SAMPLE_RATE = 24000

# Language codes accepted by the `language` argument of `generate_audio`.
# `LANGUAGES` lists the same codes as a runtime list, in the same order.
Language = Literal["en-us", "en-gb", "fr-fr", "ja", "ko", "cmn"]
LANGUAGES: list[Language] = [
    "en-us",
    "en-gb",
    "fr-fr",
    "ja",
    "ko",
    "cmn",
]

# Known voice identifiers; the order is kept exactly as originally declared.
VOICE_IDS = [
    "af",  # Default voice is a 50-50 mix of Bella & Sarah
    "af_bella",
    "af_sarah",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bf_isabella",
    "bm_george",
    "bm_lewis",
    "af_nicole",
    "af_sky",
]
async def generate_audio(
    kokoro_tts: Kokoro,
    text: str,
    voice: str,
    *,
    language: Language = "en-us",
    speed: float = 1.0,
    sample_rate: int | None = None,
) -> AsyncGenerator[bytes, None]:
    """Stream synthesized speech for ``text`` as chunks of int16 sample bytes.

    Args:
        kokoro_tts: Kokoro instance whose ``create_stream`` performs synthesis.
        text: Text to synthesize.
        voice: Voice identifier forwarded to ``create_stream``.
        language: Language code forwarded to ``create_stream`` as ``lang``.
        speed: Speed factor forwarded to ``create_stream``.
        sample_rate: Desired output sample rate. ``None`` selects
            ``SAMPLE_RATE``; any other value that differs from ``SAMPLE_RATE``
            causes every chunk to be passed through ``resample_audio``.

    Yields:
        One ``bytes`` object per chunk produced by ``create_stream``: the
        chunk's samples converted to int16 (and resampled when requested).

    Raises:
        AssertionError: If ``sample_rate`` is not an ``int`` or a streamed
            chunk is not a float32 ``numpy.ndarray``.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    # Loop-invariant checks and values are computed once, not per chunk.
    assert isinstance(sample_rate, int)
    needs_resampling = sample_rate != SAMPLE_RATE
    int16_max = np.iinfo(np.int16).max

    start = time.perf_counter()
    async for audio_data, _ in kokoro_tts.create_stream(text, voice, lang=language, speed=speed):
        assert isinstance(audio_data, np.ndarray) and audio_data.dtype == np.float32
        # Clip before scaling: a sample outside [-1.0, 1.0] would exceed the
        # int16 range after multiplication and the cast would not saturate,
        # corrupting the waveform. In-range samples are unaffected.
        pcm_samples = (np.clip(audio_data, -1.0, 1.0) * int16_max).astype(np.int16)
        audio_bytes = pcm_samples.tobytes()
        if needs_resampling:
            audio_bytes = resample_audio(audio_bytes, SAMPLE_RATE, sample_rate)
        yield audio_bytes
    # Reached only when the stream is fully consumed.
    logger.info(f"Generated audio for {len(text)} characters in {time.perf_counter() - start}s")