# tts/server/tts_service.py

import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import pyttsx3

# Lazily populated (model, tokenizer) pair for the MMS TTS backend; filled in
# by _get_mms() on first use and reused afterwards.
_MMS_CACHE: Optional[Tuple[object, object]] = None
# Hangul reading of each uppercase Latin letter, used to spell out
# abbreviations letter by letter. Note that "G" and "Z" share the reading "지".
_LETTER_KO = {
    "A": "에이",
    "B": "비",
    "C": "씨",
    "D": "디",
    "E": "이",
    "F": "에프",
    "G": "지",
    "H": "에이치",
    "I": "아이",
    "J": "제이",
    "K": "케이",
    "L": "엘",
    "M": "엠",
    "N": "엔",
    "O": "오",
    "P": "피",
    "Q": "큐",
    "R": "알",
    "S": "에스",
    "T": "티",
    "U": "유",
    "V": "브이",
    "W": "더블유",
    "X": "엑스",
    "Y": "와이",
    "Z": "지",
}
# (English phrase, Hangul transliteration) pairs that are substituted as whole
# phrases before abbreviations are spelled out.
_PHRASE_MAP = [
    ("Automatic Document Feeder", "오토매틱 도큐먼트 피더"),
    ("Naver Blog", "네이버 블로그"),
    ("Brother Korea", "브라더 코리아"),
]
# Sino-Korean reading of the single digits 0-9.
_NUM_KO = {
    0: "영",
    1: "일",
    2: "이",
    3: "삼",
    4: "사",
    5: "오",
    6: "육",
    7: "칠",
    8: "팔",
    9: "구",
}


def _get_mms():
    """Return the shared ``(model, tokenizer)`` pair for MMS TTS.

    The pair is built on the first call and stored in ``_MMS_CACHE``; later
    calls return the stored tuple. The checkpoint name is read from the
    ``MMS_MODEL`` environment variable and defaults to
    ``facebook/mms-tts-kor``.

    Raises:
        RuntimeError: If ``transformers`` or ``torch`` cannot be imported.
    """
    global _MMS_CACHE

    if _MMS_CACHE is None:
        try:
            from transformers import VitsModel, AutoTokenizer
            import torch  # noqa: F401 - imported only to verify it is installed
        except Exception as exc:
            raise RuntimeError("MMS TTS 사용을 위해 transformers/torch 설치가 필요합니다.") from exc

        checkpoint = os.getenv("MMS_MODEL", "facebook/mms-tts-kor")
        mms_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        mms_model = VitsModel.from_pretrained(checkpoint)
        # Switch the model to evaluation mode before it is cached.
        mms_model.eval()

        _MMS_CACHE = (mms_model, mms_tokenizer)

    return _MMS_CACHE


def _text_to_wav_mms(text: str, wav_path: str) -> None:
    try:
        import torch
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 torch/numpy가 정상 설치되어야 합니다.") from exc

    try:
        import soundfile as sf
    except Exception as exc:
        raise RuntimeError("MMS TTS 사용을 위해 soundfile 설치가 필요합니다.") from exc

    model, tokenizer = _get_mms()

    text = text.strip()
    if not text:
        raise RuntimeError("MMS 입력 텍스트가 비어 있습니다.")

    # 한국어 입력은 uroman 전처리가 필요할 수 있음
    try:
        from uroman import uroman

        text = uroman(text)
    except Exception:
        pass

    inputs = tokenizer(text, return_tensors="pt")
    if inputs["input_ids"].shape[1] == 0:
        raise RuntimeError("MMS 토크나이저 입력이 비어 있습니다.")
    with torch.no_grad():
        audio = model(**inputs).waveform.squeeze().cpu().numpy()

    sample_rate = getattr(model.config, "sampling_rate", 22050)
    # MMS 출력은 float이므로 PCM16으로 저장해 왜곡을 줄입니다.
    sf.write(wav_path, audio, sample_rate, subtype="PCM_16")


def _select_korean_voice(engine: pyttsx3.Engine, prefer_female: bool = False) -> None:
    try:
        voices = engine.getProperty("voices") or []
    except Exception:
        return

    def _voice_info(v):
        values = []
        if getattr(v, "languages", None):
            values.extend(v.languages)
        if getattr(v, "id", None):
            values.append(v.id)
        if getattr(v, "name", None):
            values.append(v.name)
        return " ".join(str(x) for x in values).lower()

    def _is_korean(info: str) -> bool:
        return "ko" in info or "korean" in info

    def _is_female(info: str) -> bool:
        return any(token in info for token in ["female", "woman", "girl", "여성", "여자"])

    if prefer_female:
        for voice in voices:
            info = _voice_info(voice)
            if _is_korean(info) and _is_female(info):
                try:
                    engine.setProperty("voice", voice.id)
                    return
                except Exception:
                    continue

    for voice in voices:
        info = _voice_info(voice)
        if _is_korean(info):
            try:
                engine.setProperty("voice", voice.id)
                return
            except Exception:
                continue


def _spell_abbrev(match: re.Match) -> str:
    """Spell the matched abbreviation letter by letter in Hangul.

    Each character of the whole match is looked up in ``_LETTER_KO``;
    characters without an entry are kept unchanged. The readings are joined
    with single spaces.
    """
    letters = match.group(0)
    readings = [_LETTER_KO.get(letter, letter) for letter in letters]
    return " ".join(readings)


def _sino_korean(num: int) -> str:
    """Return the Sino-Korean reading of a non-negative integer.

    The number is read in groups of four digits (만, 억, 조, 경). A leading
    "일" is omitted before 십/백/천 and before 만 (10000 -> "만"), while
    억 and above keep it (100000000 -> "일억"). Values of 10**20 and above
    are outside the unit table and are read digit by digit.

    Args:
        num: Integer to convert. Zero yields "영"; negative values yield an
            empty string, so callers should pass non-negative numbers only.

    Returns:
        The reading as a Hangul string without spaces.
    """
    if num == 0:
        return _NUM_KO[0]

    small_units = ((1000, "천"), (100, "백"), (10, "십"))
    large_units = ("", "만", "억", "조", "경")

    def _below_10000(value: int) -> str:
        """Read a value in the range 1..9999."""
        parts = []
        for base, unit in small_units:
            digit, value = divmod(value, base)
            if not digit:
                continue
            if digit > 1:
                parts.append(_NUM_KO[digit])
            parts.append(unit)
        if value:
            parts.append(_NUM_KO[value])
        return "".join(parts)

    if num >= 10000 ** len(large_units):
        # Too large for the unit table: fall back to reading the digits.
        return "".join(_NUM_KO[int(digit)] for digit in str(num))

    groups = []
    unit_index = 0
    while num > 0:
        num, chunk = divmod(num, 10000)
        if chunk:
            unit = large_units[unit_index]
            if chunk == 1 and unit == "만":
                groups.append(unit)
            else:
                groups.append(_below_10000(chunk) + unit)
        unit_index += 1
    # Groups were collected from the least significant end.
    return "".join(reversed(groups))


def _replace_numbers(text: str) -> str:
    """Rewrite digits in common Korean number expressions as Sino-Korean words.

    Handled forms: four-digit years ("2024년"), month/day pairs
    ("3월 5일"), approximations ("20여") and head counts ("100명").

    Compared with a plain ``\\b...\\b`` match this also:

    * converts expressions followed directly by a particle or another
      Hangul syllable ("2024년에", "100명이", "20여명"). Hangul is a word
      character, so a trailing ``\\b`` never matches there;
    * accepts thousands separators in counts ("1,000명");
    * reads 6월 / 10월 as "유월" / "시월";
    * leaves an expression untouched when its number cannot be converted,
      instead of raising.

    When whitespace separates the number from its suffix, the suffix must
    still end at a word boundary, so text such as "2 여행" is not rewritten.

    Args:
        text: Input text.

    Returns:
        The text with the matched expressions rewritten; whitespace between
        a number and its suffix is dropped.
    """
    # Months whose reading differs from the plain numeral.
    irregular_months = {6: "유", 10: "시"}
    number = r"(\d{1,3}(?:,\d{3})+|\d+)"

    def _suffix(marker: str) -> str:
        # Attached suffix: anything may follow. Spaced suffix: must end a word.
        return rf"(?:{marker}|\s+{marker}\b)"

    def _spoken(digits: str) -> Optional[str]:
        """Return the reading of *digits*, or None when it cannot be converted."""
        try:
            return _sino_korean(int(digits.replace(",", "")))
        except (KeyError, ValueError):
            return None

    def _with_suffix(marker: str):
        def _replace(match: re.Match) -> str:
            spoken = _spoken(match.group(1))
            return match.group(0) if spoken is None else f"{spoken}{marker}"

        return _replace

    def _month_day(match: re.Match) -> str:
        month = irregular_months.get(int(match.group(1))) or _spoken(match.group(1))
        day = _spoken(match.group(2))
        if month is None or day is None:
            return match.group(0)
        return f"{month}월 {day}일"

    text = re.sub(rf"\b(\d{{4}}){_suffix('년')}", _with_suffix("년"), text)
    text = re.sub(rf"\b(\d{{1,2}})\s*월\s*(\d{{1,2}}){_suffix('일')}", _month_day, text)
    text = re.sub(rf"\b{number}{_suffix('여')}", _with_suffix("여"), text)
    text = re.sub(rf"\b{number}{_suffix('명')}", _with_suffix("명"), text)
    return text


def _preprocess_text(text: str) -> str:
    """Normalize *text* so the TTS engines pronounce it more naturally.

    Steps, in order:

    1. Replace the known English phrases in ``_PHRASE_MAP`` with their
       Hangul transliteration (case-insensitive).
    2. Rewrite number expressions via ``_replace_numbers``.
    3. Spell out uppercase abbreviations of 2-6 letters letter by letter.
    4. Replace parentheses with spaces to soften pauses.

    Word boundaries are evaluated with ``re.ASCII`` so that Hangul next to
    a Latin word counts as a boundary. With the default Unicode rules,
    "ADF는" or "Naver Blog에서" would not match because the particle is a
    word character.

    Args:
        text: Raw input text.

    Returns:
        The normalized text.
    """
    # Pronunciation fixes for English phrases / brand names.
    for src, dst in _PHRASE_MAP:
        text = re.sub(
            rf"\b{re.escape(src)}\b",
            # A callable keeps the replacement literal (no backslash parsing).
            lambda _match, replacement=dst: replacement,
            text,
            flags=re.IGNORECASE | re.ASCII,
        )
    text = _replace_numbers(text)
    text = re.sub(r"\b[A-Z]{2,6}\b", _spell_abbrev, text, flags=re.ASCII)
    # Soften the breaks caused by parentheses.
    return text.replace("(", " ").replace(")", " ")


def text_to_mp3(text: str, mp3_path: str, voice: Optional[str] = None) -> None:
    """Synthesize *text* to speech and save it as an MP3 file.

    The text is preprocessed, rendered to a temporary WAV file by the
    engine selected through the ``TTS_ENGINE`` environment variable
    (``"mms"`` for the MMS model, anything else for pyttsx3), and then
    converted to a 44.1 kHz stereo 192 kbit/s MP3 with ffmpeg. The
    temporary WAV file is always removed.

    Args:
        text: Text to speak.
        mp3_path: Destination path; missing parent directories are created.
        voice: Optional voice hint. ``"female"`` (case-insensitive) asks the
            pyttsx3 path to prefer a female Korean voice.

    Raises:
        RuntimeError: If the text is empty or blank (before or after
            preprocessing), if ffmpeg fails, if an OS-level error occurs
            during synthesis or conversion, or if no MP3 file was produced.
    """
    if not text or not text.strip():
        raise RuntimeError("텍스트가 비어 있습니다.")

    text = _preprocess_text(text)
    if not text.strip():
        # Preprocessing can blank the text, e.g. input made only of parentheses.
        raise RuntimeError("텍스트가 비어 있습니다.")

    mp3_target = Path(mp3_path)
    mp3_target.parent.mkdir(parents=True, exist_ok=True)

    tts_engine = os.getenv("TTS_ENGINE", "pyttsx3").strip().lower()
    voice = (voice or "").strip().lower() or None
    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
    # Only the path is needed; the engines open the file themselves.
    os.close(wav_fd)

    try:
        if tts_engine == "mms":
            _text_to_wav_mms(text, wav_path)
            audio_filter = "highpass=f=80,lowpass=f=12000"
        else:
            engine = pyttsx3.init()
            # Quality tweaks: adjust rate/volume and prefer a Korean voice.
            try:
                # Counteracts the drawn-out speech observed on the server.
                engine.setProperty("rate", 210)
                engine.setProperty("volume", 1.0)
            except Exception:
                # Best-effort tuning; keep the driver defaults on failure.
                pass
            _select_korean_voice(engine, prefer_female=voice == "female")
            # Render a WAV with pyttsx3, then convert it to MP3 with ffmpeg.
            engine.save_to_file(text, wav_path)
            engine.runAndWait()
            audio_filter = "loudnorm=I=-16:LRA=11:TP=-1.5,atempo=1.15"

        subprocess.run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",
                "-i",
                wav_path,
                "-ac",
                "2",
                "-ar",
                "44100",
                "-b:a",
                "192k",
                "-af",
                audio_filter,
                str(mp3_target),
            ],
            check=True,
            # Never let ffmpeg read from the server's stdin.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            # Captured so a failure carries ffmpeg's diagnostics on
            # CalledProcessError.stderr (reachable via __cause__).
            stderr=subprocess.PIPE,
        )

        if not mp3_target.exists():
            raise RuntimeError("mp3 파일 생성에 실패했습니다.")
    except subprocess.CalledProcessError as exc:
        raise RuntimeError("ffmpeg 변환에 실패했습니다.") from exc
    except OSError as exc:
        raise RuntimeError("파일 생성 권한 또는 경로 오류입니다.") from exc
    finally:
        try:
            os.remove(wav_path)
        except OSError:
            # The temporary file may already be gone; nothing else to clean up.
            pass