yjyoon / whisper_server_speaches
whisper_server_speaches / speaches / config.py
Repository root

File name                Commit message                            Commit date
speaches                 fix: circular import                      2024-05-26
tests                    refactor: simplify tests                  2024-05-23
.dockerignore            init                                      2024-05-20
.envrc                   init                                      2024-05-20
.gitignore               init                                      2024-05-20
.pre-commit-config.yaml  style: add ruff                           2024-05-21
Dockerfile.cpu           build: docker don't install dev deps      2024-05-25
Dockerfile.cuda          build: docker don't install dev deps      2024-05-25
LICENSE                  init                                      2024-05-20
README.md                docs: add examples, roadmap, etc.         2024-05-21
Taskfile.yaml            fix: circular import                      2024-05-26
compose.yaml             fix: docker multi-arch builds             2024-05-23
flake.lock               init                                      2024-05-20
flake.nix                chore: add more tasks                     2024-05-23
poetry.lock              deps: add youtube-dl as dev dependency    2024-05-25
pyproject.toml           build: docker don't install dev deps      2024-05-25
speaches/ directory

File name         Commit message                                          Commit date
__init__.py       init                                                    2024-05-20
asr.py            feat: further improve openai compatabilit + refactor    2024-05-25
audio.py          style: add ruff                                         2024-05-21
config.py         fix: circular import                                    2024-05-26
core.py           style: add ruff                                         2024-05-21
logger.py         init                                                    2024-05-20
main.py           fix: circular import                                    2024-05-26
server_models.py  fix: circular import                                    2024-05-26
transcriber.py    init                                                    2024-05-20
utils.py          feat: further improve openai compatabilit + refactor    2024-05-25
Latest commit 5741d7c by Fedir Zadniprovskyi (2024-05-26): fix: circular import
import enum

from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict

SAMPLES_PER_SECOND = 16000
BYTES_PER_SAMPLE = 2
BYTES_PER_SECOND = SAMPLES_PER_SECOND * BYTES_PER_SAMPLE
# 2 BYTES = 16 BITS = 1 SAMPLE
# 1 SECOND OF AUDIO = 32000 BYTES = 16000 SAMPLES


# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-response_format
class ResponseFormat(enum.StrEnum):
    TEXT = "text"
    JSON = "json"
    VERBOSE_JSON = "verbose_json"
    # VTT = "vtt"
    # SRT = "srt"


# https://huggingface.co/Systran
class Model(enum.StrEnum):
    TINY_EN = "tiny.en"
    TINY = "tiny"
    BASE_EN = "base.en"
    BASE = "base"
    SMALL_EN = "small.en"
    SMALL = "small"
    MEDIUM_EN = "medium.en"
    MEDIUM = "medium"
    LARGE = "large"
    LARGE_V1 = "large-v1"
    LARGE_V2 = "large-v2"
    LARGE_V3 = "large-v3"
    DISTIL_SMALL_EN = "distil-small.en"
    DISTIL_MEDIUM_EN = "distil-medium.en"
    DISTIL_LARGE_V2 = "distil-large-v2"
    DISTIL_LARGE_V3 = "distil-large-v3"


class Device(enum.StrEnum):
    CPU = "cpu"
    CUDA = "cuda"
    AUTO = "auto"


# https://github.com/OpenNMT/CTranslate2/blob/master/docs/quantization.md
# NOTE: `Precision` might be a better name
class Quantization(enum.StrEnum):
    INT8 = "int8"
    INT8_FLOAT16 = "int8_float16"
    INT8_BFLOAT16 = "int8_bfloat16"
    INT8_FLOAT32 = "int8_float32"
    INT16 = "int16"
    FLOAT16 = "float16"
    BFLOAT16 = "bfloat16"
    FLOAT32 = "float32"
    DEFAULT = "default"


class Language(enum.StrEnum):
    AF = "af"
    AM = "am"
    AR = "ar"
    AS = "as"
    AZ = "az"
    BA = "ba"
    BE = "be"
    BG = "bg"
    BN = "bn"
    BO = "bo"
    BR = "br"
    BS = "bs"
    CA = "ca"
    CS = "cs"
    CY = "cy"
    DA = "da"
    DE = "de"
    EL = "el"
    EN = "en"
    ES = "es"
    ET = "et"
    EU = "eu"
    FA = "fa"
    FI = "fi"
    FO = "fo"
    FR = "fr"
    GL = "gl"
    GU = "gu"
    HA = "ha"
    HAW = "haw"
    HE = "he"
    HI = "hi"
    HR = "hr"
    HT = "ht"
    HU = "hu"
    HY = "hy"
    ID = "id"
    IS = "is"
    IT = "it"
    JA = "ja"
    JW = "jw"
    KA = "ka"
    KK = "kk"
    KM = "km"
    KN = "kn"
    KO = "ko"
    LA = "la"
    LB = "lb"
    LN = "ln"
    LO = "lo"
    LT = "lt"
    LV = "lv"
    MG = "mg"
    MI = "mi"
    MK = "mk"
    ML = "ml"
    MN = "mn"
    MR = "mr"
    MS = "ms"
    MT = "mt"
    MY = "my"
    NE = "ne"
    NL = "nl"
    NN = "nn"
    NO = "no"
    OC = "oc"
    PA = "pa"
    PL = "pl"
    PS = "ps"
    PT = "pt"
    RO = "ro"
    RU = "ru"
    SA = "sa"
    SD = "sd"
    SI = "si"
    SK = "sk"
    SL = "sl"
    SN = "sn"
    SO = "so"
    SQ = "sq"
    SR = "sr"
    SU = "su"
    SV = "sv"
    SW = "sw"
    TA = "ta"
    TE = "te"
    TG = "tg"
    TH = "th"
    TK = "tk"
    TL = "tl"
    TR = "tr"
    TT = "tt"
    UK = "uk"
    UR = "ur"
    UZ = "uz"
    VI = "vi"
    YI = "yi"
    YO = "yo"
    YUE = "yue"
    ZH = "zh"


class WhisperConfig(BaseModel):
    model: Model = Field(default=Model.DISTIL_MEDIUM_EN)  # ENV: WHISPER_MODEL
    inference_device: Device = Field(default=Device.AUTO)  # ENV: WHISPER_INFERENCE_DEVICE
    compute_type: Quantization = Field(default=Quantization.DEFAULT)  # ENV: WHISPER_COMPUTE_TYPE


class Config(BaseSettings):
    model_config = SettingsConfigDict(env_nested_delimiter="_")

    log_level: str = "info"  # ENV: LOG_LEVEL
    default_language: Language | None = None  # ENV: DEFAULT_LANGUAGE
    default_response_format: ResponseFormat = ResponseFormat.JSON  # ENV: DEFAULT_RESPONSE_FORMAT
    whisper: WhisperConfig = WhisperConfig()  # ENV: WHISPER_*
    """
    Max duration for the next audio chunk before transcription is finalized and the connection is closed.
    """
    max_no_data_seconds: float = 1.0  # ENV: MAX_NO_DATA_SECONDS
    min_duration: float = 1.0  # ENV: MIN_DURATION
    word_timestamp_error_margin: float = 0.2  # ENV: WORD_TIMESTAMP_ERROR_MARGIN
    """
    Max allowed audio duration without any speech being detected before transcription is finalized and the connection is closed.
    """
    max_inactivity_seconds: float = 2.0  # ENV: MAX_INACTIVITY_SECONDS
    """
    Controls how many of the latest seconds of audio are passed through VAD.
    Should be greater than `max_inactivity_seconds`.
    """
    inactivity_window_seconds: float = 3.0  # ENV: INACTIVITY_WINDOW_SECONDS


config = Config()

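For reference, a minimal sketch of how these settings resolve from the environment, following the # ENV: comments in the file. It assumes pydantic-settings v2 semantics and that the repository root is on PYTHONPATH so speaches.config is importable; none of this is part of the file itself.

    import os

    # Variables must be exported before `speaches.config` is imported, because
    # the module instantiates `config = Config()` once at import time.
    os.environ["WHISPER_MODEL"] = "large-v3"  # env_nested_delimiter="_" maps this to whisper.model
    os.environ["DEFAULT_RESPONSE_FORMAT"] = "verbose_json"

    from speaches.config import BYTES_PER_SECOND, Model, ResponseFormat, config

    assert config.whisper.model is Model.LARGE_V3
    assert config.default_response_format is ResponseFormat.VERBOSE_JSON

    # The PCM constants: 16000 samples/s * 2 bytes/sample = 32000 bytes/s, so
    # the default 1.0 s `max_no_data_seconds` window is 32000 bytes of audio.
    print(int(config.max_no_data_seconds * BYTES_PER_SECOND))  # 32000

Because environment values take priority over field defaults in pydantic-settings, the nested WhisperConfig() default is overridden without any extra wiring.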