diff --git a/.env b/.env
index d4aad1d..0ce2296 100644
--- a/.env
+++ b/.env
@@ -3,3 +3,5 @@ DB_PORT=5432
 DB_NAME=tts
 DB_USER=ncue
 DB_PASSWORD=ncue5004!
+TTS_ENGINE=mms
+MMS_MODEL=facebook/mms-tts-kor
diff --git a/.env.example b/.env.example
index d4aad1d..97027d5 100644
--- a/.env.example
+++ b/.env.example
@@ -2,4 +2,6 @@ DB_HOST=ncue.net
 DB_PORT=5432
 DB_NAME=tts
 DB_USER=ncue
-DB_PASSWORD=ncue5004!
+DB_PASSWORD=your_db_password
+TTS_ENGINE=mms
+MMS_MODEL=facebook/mms-tts-kor
diff --git a/README.md b/README.md
index 20f74a9..86e9369 100644
--- a/README.md
+++ b/README.md
@@ -45,3 +45,5 @@ http://localhost:8000
 - 배포 스크립트 기본 포트는 `8019`이며 `PORT`로 변경할 수 있습니다.
 - ffmpeg가 설치되어 있어야 합니다.
 - mp3 파일은 `resources/` 아래에 저장됩니다.
+- 고품질 TTS를 위해 `TTS_ENGINE=mms`를 설정할 수 있습니다.
+- MMS 모델(`facebook/mms-tts-kor`)은 비상업(CC-BY-NC-4.0) 라이선스입니다.
diff --git a/requirements.txt b/requirements.txt
index 2eb1168..15e56eb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,6 @@ python-dotenv
 psycopg2-binary
 pyttsx3
 jinja2
+torch
+transformers
+soundfile
diff --git a/server/tts_service.py b/server/tts_service.py
index e2c56af..d828a2c 100644
--- a/server/tts_service.py
+++ b/server/tts_service.py
@@ -2,9 +2,48 @@ import os
 import subprocess
 import tempfile
 from pathlib import Path
+from typing import Optional, Tuple
 
 import pyttsx3
 
+_MMS_CACHE: Optional[Tuple[object, object]] = None
+
+
+def _get_mms():
+    global _MMS_CACHE
+    if _MMS_CACHE is not None:
+        return _MMS_CACHE
+
+    try:
+        from transformers import VitsModel, AutoTokenizer
+        import torch
+    except Exception as exc:
+        raise RuntimeError("MMS TTS 사용을 위해 transformers/torch 설치가 필요합니다.") from exc
+
+    model_name = os.getenv("MMS_MODEL", "facebook/mms-tts-kor")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = VitsModel.from_pretrained(model_name)
+    model.eval()
+
+    _MMS_CACHE = (model, tokenizer)
+    return _MMS_CACHE
+
+
+def _text_to_wav_mms(text: str, wav_path: str) -> None:
+    try:
+        import torch
+        import soundfile as sf
+    except Exception as exc:
+        raise RuntimeError("MMS TTS 사용을 위해 soundfile 설치가 필요합니다.") from exc
+
+    model, tokenizer = _get_mms()
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        audio = model(**inputs).waveform.squeeze().cpu().numpy()
+
+    sample_rate = getattr(model.config, "sampling_rate", 22050)
+    sf.write(wav_path, audio, sample_rate)
+
 
 def _select_korean_voice(engine: pyttsx3.Engine) -> None:
     try:
@@ -37,21 +76,28 @@ def text_to_mp3(text: str, mp3_path: str) -> None:
     mp3_target = Path(mp3_path)
     mp3_target.parent.mkdir(parents=True, exist_ok=True)
 
-    engine = pyttsx3.init()
-    # 음질 개선: 속도/볼륨 조정 및 한국어 음성 우선 선택
-    try:
-        engine.setProperty("rate", 170)
-        engine.setProperty("volume", 1.0)
-    except Exception:
-        pass
-    _select_korean_voice(engine)
+    tts_engine = os.getenv("TTS_ENGINE", "pyttsx3").strip().lower()
 
     wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
     os.close(wav_fd)
     try:
-        # pyttsx3로 wav 생성 후 ffmpeg로 mp3 변환
-        engine.save_to_file(text, wav_path)
-        engine.runAndWait()
+        if tts_engine == "mms":
+            _text_to_wav_mms(text, wav_path)
+            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5"
+        else:
+            engine = pyttsx3.init()
+            # 음질 개선: 속도/볼륨 조정 및 한국어 음성 우선 선택
+            try:
+                # 서버 음성이 늘어지는 현상 완화
+                engine.setProperty("rate", 210)
+                engine.setProperty("volume", 1.0)
+            except Exception:
+                pass
+            _select_korean_voice(engine)
+            # pyttsx3로 wav 생성 후 ffmpeg로 mp3 변환
+            engine.save_to_file(text, wav_path)
+            engine.runAndWait()
+            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5,atempo=1.15"
 
         subprocess.run(
             [
@@ -66,7 +112,7 @@ def text_to_mp3(text: str, mp3_path: str) -> None:
                 "-b:a",
                 "192k",
                 "-af",
-                "loudnorm=I=-16:LRA=11:TP=-1.5",
+                audio_filter,
                 str(mp3_target),
             ],
             check=True,