From 710b1bb649ed1f127885ce7fc07c4b413c743f97 Mon Sep 17 00:00:00 2001
From: dsyoon
Date: Sat, 17 Jan 2026 20:11:59 +0900
Subject: [PATCH] perf: cache+parallelize OG fetch; fix encoding

---
 backend/app.py | 57 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/backend/app.py b/backend/app.py
index 1ee269f..0de7215 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
     author_id: Optional[str] = None
     author_email: Optional[str] = None
 
+def _pick_html_encoding(resp: requests.Response) -> str:
+    """HTML 인코딩 추정 순서(우선순위 높음 → 낮음)
+    1) Content-Type 헤더 charset
+    2) requests가 파싱한 resp.encoding (단, iso-8859-1 디폴트는 제외)
+    3) HTML 내부 <meta charset> / <meta http-equiv="Content-Type"> 태그
+    4) resp.apparent_encoding (chardet/charset_normalizer)
+    5) utf-8
+    """
+    # 1) header charset
+    ct = (resp.headers.get("content-type") or "").lower()
+    m = re.search(r"charset\s*=\s*([a-z0-9_\-]+)", ct, re.IGNORECASE)
+    if m:
+        return m.group(1)
+
+    # 2) requests chosen encoding (but ignore the common default)
+    enc = resp.encoding
+    if enc and enc.lower() != "iso-8859-1":
+        return enc
+
+    # 3) meta charset sniff (scan first 64KB)
+    try:
+        head = resp.content[:65536]
+        # decode as latin-1 to preserve byte values 0-255 one-to-one
+        head_text = head.decode("latin-1", errors="ignore")
+        m1 = re.search(r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?", head_text, re.IGNORECASE)
+        if m1:
+            return m1.group(1)
+        m2 = re.search(r"charset\s*=\s*([a-z0-9_\-]+)", head_text, re.IGNORECASE)
+        if m2:
+            return m2.group(1)
+    except Exception:
+        pass
+
+    # 4) heuristic
+    try:
+        if resp.apparent_encoding:
+            return resp.apparent_encoding
+    except Exception:
+        pass
+
+    # 5) fallback
+    return "utf-8"
+
+
 def _decode_html(resp: requests.Response) -> str:
-    encoding = resp.encoding
-    if not encoding or encoding.lower() == "iso-8859-1":
-        encoding = resp.apparent_encoding
+    encoding = _pick_html_encoding(resp)
     try:
         return resp.content.decode(encoding or "utf-8", errors="replace")
     except Exception:
@@ -440,18 +482,23 @@
 _OG_CACHE_LOCK = threading.Lock()
 _OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
 _OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
 _OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
+_OG_CACHE_VERSION = 2
 
 def _extract_og_cached(url: str) -> dict:
     now = time.time()
     with _OG_CACHE_LOCK:
         ent = _OG_CACHE.get(url)
-        if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC:
+        if (
+            ent
+            and ent.get("v") == _OG_CACHE_VERSION
+            and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
+        ):
             return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
 
     meta = _extract_og(url)
     with _OG_CACHE_LOCK:
-        _OG_CACHE[url] = {"ts": now, "meta": meta}
+        _OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
         # 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
         if len(_OG_CACHE) > _OG_CACHE_MAX:
             items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))