Add MMS TTS support and config

Introduce MMS-based Korean TTS, update dependencies and docs, and include environment settings.
2026-01-30 15:47:23 +09:00
parent 6b40d23c7e
commit 1abe725480
5 changed files with 68 additions and 13 deletions
--- a/.env
+++ b/.env
@@ -3,3 +3,5 @@ DB_PORT=5432
 DB_NAME=tts
 DB_USER=ncue
 DB_PASSWORD=ncue5004!
 TTS_ENGINE=mms
 MMS_MODEL=facebook/mms-tts-kor
--- a/.env.example
+++ b/.env.example
@@ -2,4 +2,6 @@ DB_HOST=ncue.net
 DB_PORT=5432
 DB_NAME=tts
 DB_USER=ncue
-DB_PASSWORD=ncue5004!
+DB_PASSWORD=your_db_password
 TTS_ENGINE=mms
 MMS_MODEL=facebook/mms-tts-kor
--- a/README.md
+++ b/README.md
@@ -45,3 +45,5 @@ http://localhost:8000
 - 배포 스크립트 기본 포트는 `8019`이며 `PORT`로 변경할 수 있습니다.
 - ffmpeg가 설치되어 있어야 합니다.
 - mp3 파일은 `resources/` 아래에 저장됩니다.
 - 고품질 TTS를 위해 `TTS_ENGINE=mms`를 설정할 수 있습니다.
 - MMS 모델(`facebook/mms-tts-kor`)은 비상업(CC-BY-NC-4.0) 라이선스입니다.
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,6 @@ python-dotenv
 psycopg2-binary
 pyttsx3
 jinja2
 torch
 transformers
 soundfile
--- a/server/tts_service.py
+++ b/server/tts_service.py
@@ -2,9 +2,48 @@ import os
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Optional, Tuple
 import pyttsx3
 _MMS_CACHE: Optional[Tuple[object, object]] = None
 def _get_mms():
    """Return the shared ``(model, tokenizer)`` pair used for MMS synthesis.

    The pair is loaded on the first call and stored in the module-level
    ``_MMS_CACHE``; later calls return the stored tuple unchanged. The
    checkpoint name is read from the ``MMS_MODEL`` environment variable and
    defaults to ``facebook/mms-tts-kor``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        RuntimeError: If ``transformers`` or ``torch`` cannot be imported.
    """
    global _MMS_CACHE
    if _MMS_CACHE is None:
        # Heavy dependencies are imported lazily, only when MMS is requested.
        try:
            from transformers import VitsModel, AutoTokenizer
            import torch  # noqa: F401 -- imported only to verify torch is available
        except Exception as import_error:
            raise RuntimeError("MMS TTS 사용을 위해 transformers/torch 설치가 필요합니다.") from import_error
        checkpoint = os.getenv("MMS_MODEL", "facebook/mms-tts-kor")
        mms_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        mms_model = VitsModel.from_pretrained(checkpoint)
        # Switch to evaluation mode; the model is only used for inference here.
        mms_model.eval()
        _MMS_CACHE = (mms_model, mms_tokenizer)
    return _MMS_CACHE
 def _text_to_wav_mms(text: str, wav_path: str) -> None:
    """Synthesize ``text`` with the MMS model and write the audio to ``wav_path``.

    The model and tokenizer come from ``_get_mms()``. The generated waveform
    is written with ``soundfile.write`` at the sampling rate taken from the
    model config.

    Args:
        text: Text to synthesize.
        wav_path: Destination path for the audio file.

    Raises:
        RuntimeError: If ``torch`` or ``soundfile`` cannot be imported.
    """
    try:
        import torch
        import soundfile as sf
    except Exception as exc:
        # Either import may be the one that failed, so the message names both.
        raise RuntimeError("MMS TTS 사용을 위해 torch/soundfile 설치가 필요합니다.") from exc
    model, tokenizer = _get_mms()
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        audio = model(**inputs).waveform.squeeze().cpu().numpy()
    # Fall back to 22050 Hz when the config does not define a sampling rate.
    sample_rate = getattr(model.config, "sampling_rate", 22050)
    sf.write(wav_path, audio, sample_rate)
 def _select_korean_voice(engine: pyttsx3.Engine) -> None:
    try:
@@ -37,21 +76,28 @@ def text_to_mp3(text: str, mp3_path: str) -> None:
    mp3_target = Path(mp3_path)
    mp3_target.parent.mkdir(parents=True, exist_ok=True)
-    engine = pyttsx3.init()
+    tts_engine = os.getenv("TTS_ENGINE", "pyttsx3").strip().lower()
    # 음질 개선: 속도/볼륨 조정 및 한국어 음성 우선 선택
    try:
        engine.setProperty("rate", 170)
        engine.setProperty("volume", 1.0)
    except Exception:
        pass
    _select_korean_voice(engine)
    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)
    try:
        if tts_engine == "mms":
            _text_to_wav_mms(text, wav_path)
            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5"
        else:
            engine = pyttsx3.init()
            # 음질 개선: 속도/볼륨 조정 및 한국어 음성 우선 선택
            try:
                # 서버 음성이 늘어지는 현상 완화
                engine.setProperty("rate", 210)
                engine.setProperty("volume", 1.0)
            except Exception:
                pass
            _select_korean_voice(engine)
            # pyttsx3로 wav 생성 후 ffmpeg로 mp3 변환
            engine.save_to_file(text, wav_path)
            engine.runAndWait()
            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5,atempo=1.15"
        subprocess.run(
            [
@@ -66,7 +112,7 @@ def text_to_mp3(text: str, mp3_path: str) -> None:
                "-b:a",
                "192k",
                "-af",
-                "loudnorm=I=-16:LRA=11:TP=-1.5",
+                audio_filter,
                str(mp3_target),
            ],
            check=True,