"""Korean text-to-speech helpers.

Normalises Korean text (brand names, numbers, Latin abbreviations),
synthesises it to a temporary WAV file with either pyttsx3 or the MMS VITS
model, and encodes the result to MP3 with ffmpeg.
"""

import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Tuple

try:
    import pyttsx3
except ImportError:
    # pyttsx3 is only needed for the default engine; keep the module importable
    # for MMS-only deployments, like the other optional dependencies below.
    pyttsx3 = None

# Lazily populated (model, tokenizer) pair shared by all MMS calls.
_MMS_CACHE: Optional[Tuple[object, object]] = None

# Korean reading of each Latin capital letter, used to spell abbreviations.
_LETTER_KO = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}

# English phrases replaced by a fixed Korean transliteration (case-insensitive).
_PHRASE_MAP = [
    ("Automatic Document Feeder", "오토매틱 도큐먼트 피더"),
    ("Naver Blog", "네이버 블로그"),
    ("Brother Korea", "브라더 코리아"),
]

# Sino-Korean reading of the digits 0-9.
_NUM_KO = {
    0: "영",
    1: "일",
    2: "이",
    3: "삼",
    4: "사",
    5: "오",
    6: "육",
    7: "칠",
    8: "팔",
    9: "구",
}

# Place markers inside one four-digit group, most significant first.
_SMALL_UNITS = ((1000, "천"), (100, "백"), (10, "십"))
# Markers for successive four-digit groups (10^0, 10^4, 10^8, 10^12, 10^16).
_LARGE_UNITS = ("", "만", "억", "조", "경")
# Months whose spoken form is irregular: 6월 -> 유월, 10월 -> 시월.
_MONTH_IRREGULAR = {6: "유", 10: "시"}

# An integer, optionally written with thousands separators, that is not the
# tail of a longer number ("000" in "1,000" or "5" in "3.5" must not match).
_NUMBER = r"(?<![\d.,])(\d{1,3}(?:,\d{3})+|\d+)"
# No trailing word boundary on these patterns: Korean particles attach
# directly to the counter ("2024년에", "10명이"), and Python's Unicode-aware
# \b does not see a boundary between two Hangul syllables.
_YEAR_RE = re.compile(r"(?<!\d)(\d{4})\s*년")
_MONTH_DAY_RE = re.compile(r"(?<!\d)(\d{1,2})\s*월\s*(\d{1,2})\s*일")
_APPROX_RE = re.compile(_NUMBER + r"\s*여")
_COUNT_RE = re.compile(_NUMBER + r"\s*명")

# "ko" / "kor" only as a standalone token ("ko_KR", "ko-KR", "MS_KO-KR"), so
# that names merely containing those letters (e.g. "Kyoko") are not selected.
_KOREAN_VOICE_RE = re.compile(r"(?<![a-z])kor?(?![a-z])|korean|한국")
_FEMALE_TOKENS = ("female", "woman", "girl", "여성", "여자")


def _get_mms():
    """Return the cached ``(model, tokenizer)`` pair for MMS TTS.

    The model name is read from the ``MMS_MODEL`` environment variable
    (default ``facebook/mms-tts-kor``) on first use; later calls reuse the
    cached pair.

    Raises:
        RuntimeError: if ``transformers`` or ``torch`` cannot be imported.
    """
    global _MMS_CACHE
    if _MMS_CACHE is not None:
        return _MMS_CACHE
    try:
        from transformers import VitsModel, AutoTokenizer
        import torch  # noqa: F401  (imported only to verify it is installed)
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 transformers/torch 설치가 필요합니다.") from exc
    model_name = os.getenv("MMS_MODEL", "facebook/mms-tts-kor")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
    model.eval()
    _MMS_CACHE = (model, tokenizer)
    return _MMS_CACHE


def _text_to_wav_mms(text: str, wav_path: str) -> None:
    """Synthesise *text* with the MMS model and write it to *wav_path*.

    Raises:
        RuntimeError: if a required package is missing, the text is empty
            after stripping, or the tokenizer yields no input ids.
    """
    try:
        import torch
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 torch/numpy가 정상 설치되어야 합니다.") from exc
    try:
        import soundfile as sf
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 soundfile 설치가 필요합니다.") from exc

    model, tokenizer = _get_mms()
    text = text.strip()
    if not text:
        raise RuntimeError("MMS 입력 텍스트가 비어 있습니다.")

    # Korean input may need uroman romanisation first; this is best-effort,
    # so the raw text is used when uroman is unavailable or fails.
    try:
        from uroman import uroman

        text = uroman(text)
    except Exception:
        pass

    inputs = tokenizer(text, return_tensors="pt")
    if inputs["input_ids"].shape[1] == 0:
        raise RuntimeError("MMS 토크나이저 입력이 비어 있습니다.")
    with torch.no_grad():
        audio = model(**inputs).waveform.squeeze().cpu().numpy()
    sample_rate = getattr(model.config, "sampling_rate", 22050)
    # The model output is float audio; store it as 16-bit PCM.
    sf.write(wav_path, audio, sample_rate, subtype="PCM_16")


def _voice_info(voice) -> str:
    """Return the voice's languages, id and name as one lower-cased string."""
    fields = []
    languages = getattr(voice, "languages", None)
    if languages:
        if isinstance(languages, (str, bytes, bytearray)):
            # A bare string would otherwise be split into single characters.
            fields.append(languages)
        else:
            fields.extend(languages)
    for attr in ("id", "name"):
        value = getattr(voice, attr, None)
        if value:
            fields.append(value)
    # Some drivers report languages as bytes; decode them instead of using
    # their repr.
    return " ".join(
        item.decode("utf-8", "ignore") if isinstance(item, (bytes, bytearray)) else str(item)
        for item in fields
    ).lower()


def _is_korean_voice(info: str) -> bool:
    """Tell whether a lower-cased voice description denotes a Korean voice."""
    return bool(_KOREAN_VOICE_RE.search(info))


def _is_female_voice(info: str) -> bool:
    """Tell whether a lower-cased voice description denotes a female voice."""
    return any(token in info for token in _FEMALE_TOKENS)


def _select_korean_voice(engine: "pyttsx3.Engine", prefer_female: bool = False) -> None:
    """Switch *engine* to a Korean voice when one is installed.

    With ``prefer_female`` a Korean female voice is tried first, then any
    Korean voice. The engine is left unchanged when no voice can be set.
    """
    try:
        voices = engine.getProperty("voices") or []
    except Exception:
        return

    described = [(voice, _voice_info(voice)) for voice in voices]
    korean = [voice for voice, info in described if _is_korean_voice(info)]
    candidates = []
    if prefer_female:
        candidates.extend(
            voice
            for voice, info in described
            if _is_korean_voice(info) and _is_female_voice(info)
        )
    candidates.extend(korean)

    for voice in candidates:
        try:
            engine.setProperty("voice", voice.id)
            return
        except Exception:
            # Driver rejected this voice; try the next candidate.
            continue


def _spell_abbrev(match: re.Match) -> str:
    """Spell a matched run of capital letters out as Korean letter names."""
    return " ".join(_LETTER_KO.get(ch, ch) for ch in match.group(0))


def _sino_below_10000(num: int) -> str:
    """Read ``0 < num < 10000`` in Sino-Korean, omitting a leading 일."""
    parts = []
    for value, unit in _SMALL_UNITS:
        digit, num = divmod(num, value)
        if digit:
            # 1000 / 100 / 10 are read 천 / 백 / 십, without 일.
            if digit > 1:
                parts.append(_NUM_KO[digit])
            parts.append(unit)
    if num:
        parts.append(_NUM_KO[num])
    return "".join(parts)


def _sino_korean(num: int) -> str:
    """Return the Sino-Korean reading of an integer.

    Numbers are grouped by four digits (만, 억, 조, 경). A negative number is
    prefixed with "마이너스", and a number too large for the known group
    markers is read digit by digit.
    """
    if num == 0:
        return _NUM_KO[0]
    if num < 0:
        return "마이너스 " + _sino_korean(-num)
    if num >= 10000 ** len(_LARGE_UNITS):
        return "".join(_NUM_KO[int(digit)] for digit in str(num))

    groups = []
    for unit in _LARGE_UNITS:
        num, chunk = divmod(num, 10000)
        if chunk:
            # A leading 10000 is read "만" rather than "일만".
            leading_man = chunk == 1 and unit == "만" and num == 0
            word = "" if leading_man else _sino_below_10000(chunk)
            groups.append(word + unit)
        if not num:
            break
    return "".join(reversed(groups))


def _replace_numbers(text: str) -> str:
    """Replace digits before 년, 월/일, 여 and 명 with their Korean reading."""

    def _to_int(raw: str) -> int:
        return int(raw.replace(",", ""))

    def _year(match: re.Match) -> str:
        return f"{_sino_korean(int(match.group(1)))}년"

    def _month_day(match: re.Match) -> str:
        month_num = int(match.group(1))
        month = _MONTH_IRREGULAR.get(month_num) or _sino_korean(month_num)
        day = _sino_korean(int(match.group(2)))
        return f"{month}월 {day}일"

    def _approx(match: re.Match) -> str:
        return f"{_sino_korean(_to_int(match.group(1)))}여"

    def _count(match: re.Match) -> str:
        return f"{_sino_korean(_to_int(match.group(1)))}명"

    text = _YEAR_RE.sub(_year, text)
    text = _MONTH_DAY_RE.sub(_month_day, text)
    text = _APPROX_RE.sub(_approx, text)
    text = _COUNT_RE.sub(_count, text)
    return text


def _preprocess_text(text: str) -> str:
    """Normalise *text* so the TTS engine pronounces it naturally.

    Known English phrases are transliterated, numbers are spelled out,
    2-6 letter capital abbreviations are spelled letter by letter, and
    parentheses are replaced by spaces.
    """
    # re.ASCII makes \b treat Hangul as a non-word character, so a phrase or
    # abbreviation directly followed by a particle ("ADF는") still matches.
    for src, dst in _PHRASE_MAP:
        text = re.sub(
            rf"\b{re.escape(src)}\b",
            lambda _match, repl=dst: repl,
            text,
            flags=re.IGNORECASE | re.ASCII,
        )
    text = _replace_numbers(text)
    text = re.sub(r"\b[A-Z]{2,6}\b", _spell_abbrev, text, flags=re.ASCII)
    # Parentheses cause audible breaks; replace them with spaces.
    text = text.replace("(", " ").replace(")", " ")
    return text


def _text_to_wav_pyttsx3(text: str, wav_path: str, prefer_female: bool = False) -> None:
    """Synthesise *text* to *wav_path* with pyttsx3, preferring a Korean voice.

    Raises:
        RuntimeError: if pyttsx3 is not installed.
    """
    if pyttsx3 is None:
        raise RuntimeError("pyttsx3 TTS 사용을 위해 pyttsx3 설치가 필요합니다.")
    engine = pyttsx3.init()
    # Raise the speaking rate and volume; some drivers reject these
    # properties, which is not fatal.
    try:
        engine.setProperty("rate", 210)
        engine.setProperty("volume", 1.0)
    except Exception:
        pass
    _select_korean_voice(engine, prefer_female=prefer_female)
    engine.save_to_file(text, wav_path)
    engine.runAndWait()


def _encode_mp3(wav_path: str, mp3_target: Path, audio_filter: str) -> None:
    """Encode *wav_path* to *mp3_target* with ffmpeg (stereo, 44.1 kHz, 192k).

    Raises:
        RuntimeError: if ffmpeg is not installed or exits with an error; the
            last line of ffmpeg's stderr is appended to the message.
    """
    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        wav_path,
        "-ac",
        "2",
        "-ar",
        "44100",
        "-b:a",
        "192k",
        "-af",
        audio_filter,
        str(mp3_target),
    ]
    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        # Without this, a missing binary is reported as a path/permission error.
        raise RuntimeError("ffmpeg 실행 파일을 찾을 수 없습니다.") from exc
    except subprocess.CalledProcessError as exc:
        lines = (exc.stderr or b"").decode("utf-8", "replace").strip().splitlines()
        message = "ffmpeg 변환에 실패했습니다."
        if lines:
            message = f"{message} ({lines[-1]})"
        raise RuntimeError(message) from exc


def text_to_mp3(text: str, mp3_path: str, voice: Optional[str] = None) -> None:
    """Convert *text* to speech and save it as an MP3 file.

    The engine is chosen with the ``TTS_ENGINE`` environment variable:
    ``mms`` uses the MMS model, anything else uses pyttsx3. The parent
    directory of *mp3_path* is created when missing.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        mp3_path: Destination path of the MP3 file.
        voice: ``"female"`` (case-insensitive) asks pyttsx3 for a female
            Korean voice; other values have no effect.

    Raises:
        RuntimeError: if the text is empty, synthesis produces no audio,
            ffmpeg is missing or fails, or a file cannot be written.
    """
    if not text or not text.strip():
        raise RuntimeError("텍스트가 비어 있습니다.")
    text = _preprocess_text(text)
    if not text.strip():
        # e.g. the input consisted of parentheses only.
        raise RuntimeError("텍스트가 비어 있습니다.")

    mp3_target = Path(mp3_path)
    mp3_target.parent.mkdir(parents=True, exist_ok=True)
    tts_engine = os.getenv("TTS_ENGINE", "pyttsx3").strip().lower()
    voice = (voice or "").strip().lower() or None

    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(wav_fd)
    try:
        if tts_engine == "mms":
            _text_to_wav_mms(text, wav_path)
            audio_filter = "highpass=f=80,lowpass=f=12000"
        else:
            _text_to_wav_pyttsx3(text, wav_path, prefer_female=voice == "female")
            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5,atempo=1.15"

        if os.path.getsize(wav_path) == 0:
            # Report a synthesis failure as such, not as an ffmpeg error.
            raise RuntimeError("음성 합성 결과가 비어 있습니다.")

        _encode_mp3(wav_path, mp3_target, audio_filter)
        if not mp3_target.exists():
            raise RuntimeError("mp3 파일 생성에 실패했습니다.")
    except OSError as exc:
        raise RuntimeError("파일 생성 권한 또는 경로 오류입니다.") from exc
    finally:
        try:
            os.remove(wav_path)
        except OSError:
            pass