From e240153e3fb0de012ec144795f2fd23a7ddddc86 Mon Sep 17 00:00:00 2001
From: dsyoon
Date: Fri, 30 Jan 2026 20:42:36 +0900
Subject: [PATCH] Handle Korean numeric dates in TTS

Convert common date and count numerals to Korean readings so MMS outputs
month/day and attendee counts correctly.
---
Review notes (ignored by git am):
- _sino_korean no longer raises KeyError for values >= 10000; it groups
  digits by 만/억/조/경 and spells digits beyond that range.
- The trailing \b after 년/일/여/명 is removed. Hangul is \w, so suffixes
  followed by a particle ("2026년에", "100여명") never matched.
- "1,200명" is read as one number; decimals such as "3.5명" are left alone.
- 6월 and 10월 are read "유월" and "시월".
- Open question: small head counts are normally read with native numerals
  ("세 명"); this patch keeps Sino-Korean readings for all counts.
- The index line below is kept from the original patch and no longer
  matches the post-image blob.

 server/tts_service.py | 104 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/server/tts_service.py b/server/tts_service.py
index 14a681e..b5493ec 100644
--- a/server/tts_service.py
+++ b/server/tts_service.py
@@ -41,6 +41,18 @@ _PHRASE_MAP = [
     ("Naver Blog", "네이버 블로그"),
     ("Brother Korea", "브라더 코리아"),
 ]
+_NUM_KO = {
+    0: "영",
+    1: "일",
+    2: "이",
+    3: "삼",
+    4: "사",
+    5: "오",
+    6: "육",
+    7: "칠",
+    8: "팔",
+    9: "구",
+}
 
 
 def _get_mms():
@@ -127,10 +139,102 @@ def _spell_abbrev(match: re.Match) -> str:
     return " ".join(_LETTER_KO.get(ch, ch) for ch in match.group(0))
 
 
+# Unit suffix for each digit position inside one four-digit group.
+_SMALL_UNITS = ("", "십", "백", "천")
+# Unit suffix for each successive four-digit group (10^0, 10^4, 10^8, ...).
+_LARGE_UNITS = ("", "만", "억", "조", "경")
+# Months whose spoken form is irregular: 6월 is "유월", 10월 is "시월".
+_MONTH_KO = {6: "유", 10: "시"}
+
+# A number must not continue an ASCII word or a longer number ("A380", "3.5").
+_NUM_START = r"(?<![\dA-Za-z_])(?<!\d[.,])"
+# Plain digits, or thousands grouped with commas such as "1,200".
+_COUNT_NUM = r"(\d{1,3}(?:,\d{3})+|\d+)"
+# No trailing \b after the Hangul suffix: Hangul syllables are \w in str
+# patterns, so "2026년에" or "100여명" has no word boundary at that point.
+_YEAR_RE = re.compile(_NUM_START + r"(\d{4})\s*년")
+_MONTH_DAY_RE = re.compile(_NUM_START + r"(\d{1,2})\s*월\s*(\d{1,2})\s*일")
+_APPROX_RE = re.compile(_NUM_START + _COUNT_NUM + r"\s*여")
+_COUNT_RE = re.compile(_NUM_START + _COUNT_NUM + r"\s*명")
+
+
+def _sino_korean(num: int) -> str:
+    """Return the Sino-Korean reading of a non-negative integer.
+
+    Digits are grouped in fours (만, 억, 조, 경). "일" is dropped before 십,
+    백 and 천 and before a leading 만, so 12000 reads "만이천". Values too
+    large for the unit table are read digit by digit. Negative input yields
+    an empty string.
+    """
+    if num == 0:
+        return _NUM_KO[0]
+    if num >= 10 ** (4 * len(_LARGE_UNITS)):
+        # No unit name is available past 경; fall back to spelling digits.
+        return "".join(_NUM_KO[int(digit)] for digit in str(num))
+
+    groups = []
+    rest = num
+    for large in _LARGE_UNITS:
+        if rest <= 0:
+            break
+        rest, chunk = divmod(rest, 10000)
+        if chunk == 0:
+            continue
+        if chunk == 1 and large == "만" and rest == 0:
+            # A leading 10,000 is read "만", not "일만".
+            groups.append(large)
+            continue
+        parts = []
+        for pos in range(3, -1, -1):
+            digit = chunk // 10**pos % 10
+            if digit == 0:
+                continue
+            if digit > 1 or pos == 0:
+                parts.append(_NUM_KO[digit])
+            parts.append(_SMALL_UNITS[pos])
+        groups.append("".join(parts) + large)
+    return "".join(reversed(groups))
+
+
+def _replace_numbers(text: str) -> str:
+    """Replace numerals in dates and head counts with Korean readings.
+
+    Handles "NNNN년", "M월 D일", "N여" and "N명"; any other digits are left
+    untouched. Whitespace between a number and its suffix is dropped.
+    """
+
+    def _read(raw: str) -> str:
+        # Thousands separators are not part of the spoken number.
+        return _sino_korean(int(raw.replace(",", "")))
+
+    def _year(match: re.Match) -> str:
+        return f"{_read(match.group(1))}년"
+
+    def _month_day(match: re.Match) -> str:
+        month_num = int(match.group(1))
+        month = _MONTH_KO.get(month_num) or _sino_korean(month_num)
+        day = _sino_korean(int(match.group(2)))
+        return f"{month}월 {day}일"
+
+    def _approx(match: re.Match) -> str:
+        return f"{_read(match.group(1))}여"
+
+    def _count(match: re.Match) -> str:
+        return f"{_read(match.group(1))}명"
+
+    text = _YEAR_RE.sub(_year, text)
+    text = _MONTH_DAY_RE.sub(_month_day, text)
+    # "여" runs before "명" so that "100여명" becomes "백여명".
+    text = _APPROX_RE.sub(_approx, text)
+    text = _COUNT_RE.sub(_count, text)
+    return text
+
+
 def _preprocess_text(text: str) -> str:
     # 영어 약어/브랜드 발음 보정
     for src, dst in _PHRASE_MAP:
         text = re.sub(rf"\b{re.escape(src)}\b", dst, text, flags=re.IGNORECASE)
+    text = _replace_numbers(text)
     text = re.sub(r"\b[A-Z]{2,6}\b", _spell_abbrev, text)
     # 괄호/구두점으로 인한 끊김을 완화
     text = text.replace("(", " ").replace(")", " ")