Add MMS TTS support and config

Introduce MMS-based Korean TTS, update dependencies and docs, and include environment settings.
This commit is contained in:
dsyoon
2026-01-30 15:47:23 +09:00
parent 6b40d23c7e
commit 1abe725480
5 changed files with 68 additions and 13 deletions

View File

@@ -2,9 +2,48 @@ import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Tuple
import pyttsx3
# Module-level cache for the loaded MMS (model, tokenizer) pair.
# Stays None until _get_mms() has loaded the model successfully once.
_MMS_CACHE: Optional[Tuple[object, object]] = None
def _get_mms():
    """Return the cached MMS ``(model, tokenizer)`` pair, loading it on first use.

    The checkpoint name is read from the ``MMS_MODEL`` environment variable.
    A missing, empty, or whitespace-only value falls back to
    ``facebook/mms-tts-kor``.

    Returns:
        A ``(model, tokenizer)`` tuple. The same tuple object is returned on
        every later call.

    Raises:
        RuntimeError: If ``transformers`` or ``torch`` cannot be imported.
    """
    global _MMS_CACHE
    if _MMS_CACHE is not None:
        return _MMS_CACHE

    try:
        from transformers import VitsModel, AutoTokenizer
        # Not used in this function; imported so that a missing torch is
        # reported through the error message below.
        import torch  # noqa: F401
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 transformers/torch 설치가 필요합니다.") from exc

    # An env-file line such as ``MMS_MODEL=`` produces "" rather than an unset
    # variable, so treat a blank value the same as "not configured".
    model_name = (os.getenv("MMS_MODEL") or "").strip() or "facebook/mms-tts-kor"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
    model.eval()  # switch the module to evaluation mode before synthesis

    _MMS_CACHE = (model, tokenizer)
    return _MMS_CACHE
def _text_to_wav_mms(text: str, wav_path: str) -> None:
    """Synthesize ``text`` with the MMS VITS model and write it as a WAV file.

    Args:
        text: Text to convert to speech.
        wav_path: Path of the WAV file to write.

    Raises:
        RuntimeError: If ``soundfile`` cannot be imported, or (raised by
            ``_get_mms``) if ``transformers``/``torch`` cannot be imported.
    """
    # Check soundfile by itself so the error names the package that is really
    # missing; a missing torch is reported by _get_mms() with its own message.
    try:
        import soundfile as sf
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 soundfile 설치가 필요합니다.") from exc

    model, tokenizer = _get_mms()
    import torch  # _get_mms() has already verified that torch is importable

    # NOTE(review): the transformers MMS-TTS documentation says checkpoints
    # whose tokenizer has ``is_uroman`` set need the text romanized with
    # uroman first. Confirm whether the configured Korean checkpoint needs it.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        audio = model(**inputs).waveform.squeeze().cpu().numpy()

    # VITS/MMS checkpoints default to 16 kHz output, so use that as the
    # fallback when the config does not expose ``sampling_rate``.
    sample_rate = getattr(model.config, "sampling_rate", 16000)
    sf.write(wav_path, audio, sample_rate)
def _select_korean_voice(engine: pyttsx3.Engine) -> None:
try:
@@ -37,21 +76,28 @@ def text_to_mp3(text: str, mp3_path: str) -> None:
mp3_target = Path(mp3_path)
mp3_target.parent.mkdir(parents=True, exist_ok=True)
engine = pyttsx3.init()
# 음질 개선: 속도/볼륨 조정 및 한국어 음성 우선 선택
try:
engine.setProperty("rate", 170)
engine.setProperty("volume", 1.0)
except Exception:
pass
_select_korean_voice(engine)
tts_engine = os.getenv("TTS_ENGINE", "pyttsx3").strip().lower()
wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
os.close(wav_fd)
try:
# pyttsx3로 wav 생성 후 ffmpeg로 mp3 변환
engine.save_to_file(text, wav_path)
engine.runAndWait()
if tts_engine == "mms":
_text_to_wav_mms(text, wav_path)
audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5"
else:
engine = pyttsx3.init()
# 음질 개선: 속도/볼륨 조정 및 한국어 음성 우선 선택
try:
# 서버 음성이 늘어지는 현상 완화
engine.setProperty("rate", 210)
engine.setProperty("volume", 1.0)
except Exception:
pass
_select_korean_voice(engine)
# pyttsx3로 wav 생성 후 ffmpeg로 mp3 변환
engine.save_to_file(text, wav_path)
engine.runAndWait()
audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5,atempo=1.15"
subprocess.run(
[
@@ -66,7 +112,7 @@ def text_to_mp3(text: str, mp3_path: str) -> None:
"-b:a",
"192k",
"-af",
"loudnorm=I=-16:LRA=11:TP=-1.5",
audio_filter,
str(mp3_target),
],
check=True,