Handle Korean numeric dates in TTS

Convert common date and count numerals to Korean readings so MMS outputs month/day and attendee counts correctly.
This commit is contained in:
dsyoon
2026-01-30 20:42:36 +09:00
parent 35dae473ec
commit e240153e3f

View File

@@ -41,6 +41,18 @@ _PHRASE_MAP = [
("Naver Blog", "네이버 블로그"), ("Naver Blog", "네이버 블로그"),
("Brother Korea", "브라더 코리아"), ("Brother Korea", "브라더 코리아"),
] ]
# Sino-Korean reading of each decimal digit, keyed by the digit's integer
# value. Every entry must be a non-empty Hangul syllable: an empty value makes
# the digit silently vanish from the text handed to the TTS model.
_NUM_KO = {
    0: "영",
    1: "일",
    2: "이",
    3: "삼",
    4: "사",
    5: "오",
    6: "육",
    7: "칠",
    8: "팔",
    9: "구",
}
def _get_mms(): def _get_mms():
@@ -127,10 +139,61 @@ def _spell_abbrev(match: re.Match) -> str:
return " ".join(_LETTER_KO.get(ch, ch) for ch in match.group(0)) return " ".join(_LETTER_KO.get(ch, ch) for ch in match.group(0))
def _sino_korean(num: int) -> str:
    """Spell a non-negative integer with Sino-Korean place-value markers.

    Digit readings are looked up in ``_NUM_KO``. Within each block of four
    digits the markers 천 (1000), 백 (100) and 십 (10) are appended after the
    digit, and a digit of 1 in those places is left unspoken, so 1110 yields
    just the three markers. Blocks above the lowest one are suffixed with
    만, 억, 조 and 경 in ascending order.

    Args:
        num: Value to spell out.

    Returns:
        The concatenated reading without spaces. Zero yields ``_NUM_KO[0]``
        and a negative value yields an empty string.
    """
    if num == 0:
        return _NUM_KO[0]

    small_units = ("", "십", "백", "천")
    large_units = ("", "만", "억", "조", "경")

    def _read_group(value: int) -> str:
        # Spell one four-digit block (1..9999), most significant place first.
        out = []
        for power in (3, 2, 1, 0):
            digit = value // 10**power % 10
            if digit == 0:
                continue
            # A leading 1 is not pronounced before 십/백/천; the ones place
            # always needs its digit.
            if digit > 1 or power == 0:
                out.append(_NUM_KO[digit])
            out.append(small_units[power])
        return "".join(out)

    # Split into four-digit blocks, least significant first.
    groups = []
    remaining = num
    while remaining > 0:
        remaining, chunk = divmod(remaining, 10000)
        groups.append(chunk)

    if len(groups) > len(large_units):
        # Larger than the named units cover: read the digits one by one
        # instead of failing.
        return "".join(_NUM_KO[int(ch)] for ch in str(num))

    top = len(groups) - 1
    parts = []
    for idx in range(top, -1, -1):
        chunk = groups[idx]
        if chunk == 0:
            continue
        if chunk == 1 and idx == 1 and idx == top:
            # Exactly one leading 만 is spoken as a bare marker (10000 -> 만).
            parts.append(large_units[idx])
        else:
            parts.append(_read_group(chunk) + large_units[idx])
    return "".join(parts)
def _replace_numbers(text: str) -> str:
    """Rewrite numerals in common Korean date/count phrases as Hangul.

    Four substitutions run in order: four-digit years followed by 년,
    month/day pairs (``N월 N일``), approximate amounts followed by 여, and
    head counts followed by 명. Each numeral is converted with
    ``_sino_korean`` and the unit marker is written back after it, attached
    to the reading.

    A marker attached directly to the digits matches whatever follows it, so
    phrases with particles or counters ("30일에", "100명이", "100여명")
    are converted. A marker separated from the digits by whitespace must end
    at a word boundary, so unrelated words such as "2024 여름" are left
    alone.

    Args:
        text: Input sentence(s).

    Returns:
        The text with the matched numerals replaced; everything else is
        returned unchanged.
    """
    # 6월 and 10월 are pronounced 유월 / 시월, not 육월 / 십월.
    irregular_months = {6: "유", 10: "시"}

    def _year(match: re.Match) -> str:
        return f"{_sino_korean(int(match.group(1)))}년"

    def _month_day(match: re.Match) -> str:
        month_num = int(match.group(1))
        month = irregular_months.get(month_num) or _sino_korean(month_num)
        day = _sino_korean(int(match.group(2)))
        return f"{month}월 {day}일"

    def _approx(match: re.Match) -> str:
        return f"{_sino_korean(int(match.group(1)))}여"

    def _count(match: re.Match) -> str:
        return f"{_sino_korean(int(match.group(1)))}명"

    # (?<!\d) replaces the leading \b so digits glued to preceding Hangul
    # ("총30명") still match while the tail of a longer digit run does not.
    text = re.sub(r"(?<!\d)(\d{4})(?:년|\s+년\b)", _year, text)
    text = re.sub(
        r"(?<!\d)(\d{1,2})\s*월\s*(\d{1,2})(?:일|\s+일\b)", _month_day, text
    )
    text = re.sub(r"(?<!\d)(\d+)(?:여|\s+여\b)", _approx, text)
    text = re.sub(r"(?<!\d)(\d+)(?:명|\s+명\b)", _count, text)
    return text
def _preprocess_text(text: str) -> str: def _preprocess_text(text: str) -> str:
# 영어 약어/브랜드 발음 보정 # 영어 약어/브랜드 발음 보정
for src, dst in _PHRASE_MAP: for src, dst in _PHRASE_MAP:
text = re.sub(rf"\b{re.escape(src)}\b", dst, text, flags=re.IGNORECASE) text = re.sub(rf"\b{re.escape(src)}\b", dst, text, flags=re.IGNORECASE)
text = _replace_numbers(text)
text = re.sub(r"\b[A-Z]{2,6}\b", _spell_abbrev, text) text = re.sub(r"\b[A-Z]{2,6}\b", _spell_abbrev, text)
# 괄호/구두점으로 인한 끊김을 완화 # 괄호/구두점으로 인한 끊김을 완화
text = text.replace("(", " ").replace(")", " ") text = text.replace("(", " ").replace(")", " ")