"""Streaming text-to-speech helper built on top of ``kokoro_onnx``."""

from collections.abc import AsyncGenerator
import logging
import time
from typing import Literal

from kokoro_onnx import Kokoro
import numpy as np

from speaches.audio import resample_audio

logger = logging.getLogger(__name__)

SAMPLE_RATE = 24000  # the default sample rate for Kokoro
Language = Literal["en-us", "en-gb", "fr-fr", "ja", "ko", "cmn"]
LANGUAGES: list[Language] = ["en-us", "en-gb", "fr-fr", "ja", "ko", "cmn"]

VOICE_IDS = [
    "af",  # Default voice is a 50-50 mix of Bella & Sarah
    "af_bella",
    "af_sarah",
    "am_adam",
    "am_michael",
    "bf_emma",
    "bf_isabella",
    "bm_george",
    "bm_lewis",
    "af_nicole",
    "af_sky",
]


async def generate_audio(
    kokoro_tts: Kokoro,
    text: str,
    voice: str,
    *,
    language: Language = "en-us",
    speed: float = 1.0,
    sample_rate: int | None = None,
) -> AsyncGenerator[bytes, None]:
    """Stream synthesized speech for *text* as chunks of 16-bit PCM bytes.

    Each float32 chunk produced by ``kokoro_tts.create_stream`` is converted to
    ``int16`` samples and serialized with ``ndarray.tobytes()``. When the
    requested ``sample_rate`` differs from ``SAMPLE_RATE``, the bytes are passed
    through ``resample_audio`` before being yielded.

    Args:
        kokoro_tts: Kokoro instance whose ``create_stream`` produces the audio.
        text: Text to synthesize.
        voice: Voice identifier forwarded to ``create_stream``.
        language: Language code forwarded to ``create_stream`` as ``lang``.
        speed: Speed value forwarded to ``create_stream``.
        sample_rate: Output sample rate; ``None`` means ``SAMPLE_RATE``.

    Yields:
        One ``bytes`` object per chunk produced by ``create_stream``.

    Raises:
        AssertionError: If ``sample_rate`` is not an ``int`` or a chunk is not
            a float32 ``numpy.ndarray``.
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE
    # Loop-invariant check, so it is done once rather than per chunk.
    assert isinstance(sample_rate, int)

    int16_max = np.iinfo(np.int16).max
    start = time.perf_counter()
    # The second element of each yielded tuple is ignored; the audio is treated
    # as being at SAMPLE_RATE.
    async for audio_data, _ in kokoro_tts.create_stream(text, voice, lang=language, speed=speed):
        assert isinstance(audio_data, np.ndarray) and audio_data.dtype == np.float32
        # Clamp to [-1.0, 1.0] before scaling: a sample outside that range would
        # exceed the int16 range and wrap (or be undefined) in the cast below.
        clipped_audio_data = np.clip(audio_data, -1.0, 1.0)
        normalized_audio_data = (clipped_audio_data * int16_max).astype(np.int16)
        audio_bytes = normalized_audio_data.tobytes()
        if sample_rate != SAMPLE_RATE:
            audio_bytes = resample_audio(audio_bytes, SAMPLE_RATE, sample_rate)
        yield audio_bytes
    # Only reached when the stream is consumed to completion.
    logger.info(f"Generated audio for {len(text)} characters in {time.perf_counter() - start}s")