"""Text-to-speech helpers that render text to an MP3 file.

Speech is first synthesized to a temporary WAV file, either with pyttsx3
(default) or with an MMS/VITS model from ``transformers`` (when the
``TTS_ENGINE`` environment variable is ``mms``), and then converted to MP3
with the ``ffmpeg`` command-line tool.
"""

import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import pyttsx3

# Lazily populated (model, tokenizer) pair so the MMS model loads only once.
_MMS_CACHE: Optional[Tuple[object, object]] = None

# Matches "korea"/"korean" anywhere, or "ko"/"kor" only as a standalone
# language token (e.g. "ko", "ko_KR", "ko-kr", "kor").  A plain substring test
# for "ko" would also accept unrelated voices such as the Japanese "Kyoko".
_KOREAN_VOICE_RE = re.compile(r"korea|(?<![a-z])kor?(?![a-z])")


def _get_mms():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    The model name is read from the ``MMS_MODEL`` environment variable and
    defaults to ``facebook/mms-tts-kor``.

    Raises:
        RuntimeError: If ``transformers`` or ``torch`` cannot be imported.
    """
    global _MMS_CACHE
    if _MMS_CACHE is not None:
        return _MMS_CACHE

    try:
        from transformers import VitsModel, AutoTokenizer
        import torch  # noqa: F401 - imported only to verify it is installed
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 transformers/torch 설치가 필요합니다.") from exc

    model_name = os.getenv("MMS_MODEL", "facebook/mms-tts-kor")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
    model.eval()

    _MMS_CACHE = (model, tokenizer)
    return _MMS_CACHE


def _text_to_wav_mms(text: str, wav_path: str) -> None:
    """Synthesize *text* with the MMS model and write it to *wav_path*.

    The sample rate is taken from ``model.config.sampling_rate`` and falls
    back to 22050 when the attribute is absent.

    Raises:
        RuntimeError: If ``torch`` or ``soundfile`` cannot be imported, or if
            the model dependencies are missing (see ``_get_mms``).
    """
    try:
        import torch
        import soundfile as sf
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 soundfile 설치가 필요합니다.") from exc

    model, tokenizer = _get_mms()
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        audio = model(**inputs).waveform.squeeze().cpu().numpy()

    sample_rate = getattr(model.config, "sampling_rate", 22050)
    sf.write(wav_path, audio, sample_rate)


def _voice_descriptor(voice) -> str:
    """Return a lower-cased string joining a voice's languages, id and name."""
    parts = []

    languages = getattr(voice, "languages", None)
    if languages:
        # A bare str/bytes value is one language tag, not a sequence of tags;
        # extending with it would split it into single characters.
        if isinstance(languages, (str, bytes)):
            parts.append(languages)
        else:
            parts.extend(languages)

    for attr in ("id", "name"):
        value = getattr(voice, attr, None)
        if value:
            parts.append(value)

    # Decode bytes instead of using their repr (str(b"ko") == "b'ko'"), which
    # would glue quoting/escape characters onto the language tag.
    return " ".join(
        part.decode("utf-8", "ignore") if isinstance(part, bytes) else str(part)
        for part in parts
    ).lower()


def _select_korean_voice(engine: pyttsx3.Engine) -> None:
    """Switch *engine* to the first installed voice that looks Korean.

    This is best-effort: if the voice list cannot be read, no voice matches,
    or every matching voice fails to activate, the engine is left unchanged.
    """
    try:
        voices = engine.getProperty("voices") or []
    except Exception:
        return

    for voice in voices:
        if not _KOREAN_VOICE_RE.search(_voice_descriptor(voice)):
            continue
        try:
            engine.setProperty("voice", voice.id)
            return
        except Exception:
            # This voice could not be activated; try the next candidate.
            continue


def text_to_mp3(text: str, mp3_path: str) -> None:
    """Synthesize *text* to speech and save it as an MP3 at *mp3_path*.

    The backend is chosen by the ``TTS_ENGINE`` environment variable: ``mms``
    selects the MMS model, anything else uses pyttsx3.  The parent directory
    of *mp3_path* is created if needed, and the intermediate WAV file is
    always removed afterwards.

    Args:
        text: Text to speak; must be non-empty.
        mp3_path: Destination path of the MP3 file.

    Raises:
        RuntimeError: If *text* is empty, ffmpeg exits with an error, the MP3
            file is missing after conversion, or an ``OSError`` occurs during
            synthesis/conversion.
    """
    if not text:
        raise RuntimeError("텍스트가 비어 있습니다.")

    mp3_target = Path(mp3_path)
    mp3_target.parent.mkdir(parents=True, exist_ok=True)

    tts_engine = os.getenv("TTS_ENGINE", "pyttsx3").strip().lower()

    # Only the path is needed; the synthesizer reopens the file itself.
    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)

    try:
        if tts_engine == "mms":
            _text_to_wav_mms(text, wav_path)
            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5"
        else:
            engine = pyttsx3.init()
            # Quality tweaks: adjust rate/volume and prefer a Korean voice.
            try:
                # Mitigates the dragging/slow speech observed on the server.
                engine.setProperty("rate", 210)
                engine.setProperty("volume", 1.0)
            except Exception:
                # Deliberately best-effort: keep the driver defaults.
                pass
            _select_korean_voice(engine)

            # Render a WAV with pyttsx3, then convert it to MP3 with ffmpeg.
            engine.save_to_file(text, wav_path)
            engine.runAndWait()
            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5,atempo=1.15"

        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-i",
                wav_path,
                "-ac",
                "2",
                "-ar",
                "44100",
                "-b:a",
                "192k",
                "-af",
                audio_filter,
                str(mp3_target),
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        if not mp3_target.exists():
            raise RuntimeError("mp3 파일 생성에 실패했습니다.")
    except subprocess.CalledProcessError as exc:
        raise RuntimeError("ffmpeg 변환에 실패했습니다.") from exc
    except OSError as exc:
        raise RuntimeError("파일 생성 권한 또는 경로 오류입니다.") from exc
    finally:
        # Always discard the temporary WAV; ignore it if already gone.
        try:
            os.remove(wav_path)
        except OSError:
            pass