perf: cache+parallelize OG fetch; fix encoding

2026-01-17 20:11:59 +09:00
parent 8628b25b9b
commit 710b1bb649
1 changed files with 52 additions and 5 deletions
--- a/backend/app.py
+++ b/backend/app.py
@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
    author_id: Optional[str] = None
    author_email: Optional[str] = None
 def _pick_html_encoding(resp: "requests.Response") -> str:
    """Choose the text encoding used to decode an HTML response body.

    Candidates are tried in priority order (highest first); the first one
    that names a codec known to Python wins:

    1) ``charset`` parameter of the Content-Type header (bare or quoted)
    2) ``resp.encoding`` as parsed by requests, unless it is the
       ``iso-8859-1`` default
    3) ``<meta charset=...>`` / ``<meta http-equiv="content-type" ...
       charset=...>`` found in the first 64KB of the body
    4) ``resp.apparent_encoding`` (chardet/charset_normalizer heuristic)
    5) ``"utf-8"``

    Args:
        resp: Response whose ``headers``, ``encoding``, ``content`` and
            ``apparent_encoding`` are inspected.

    Returns:
        An encoding label; always a non-empty string.
    """
    import codecs  # function-scope import: only this helper needs it

    def _known(label):
        # An unknown label would only make the later bytes.decode() fail,
        # so treat it as "no answer" and let the next source decide.
        if not label:
            return False
        try:
            codecs.lookup(label)
        except (LookupError, TypeError, ValueError):
            return False
        return True

    # 1) header charset (the value may be a quoted string)
    ct = (resp.headers.get("content-type") or "").lower()
    m = re.search(r"charset\s*=\s*[\"']?\s*([a-z0-9_\-]+)", ct)
    if m and _known(m.group(1)):
        return m.group(1)

    # 2) requests chosen encoding (but ignore the common default)
    enc = resp.encoding
    if enc and enc.lower() != "iso-8859-1" and _known(enc):
        return enc

    # 3) meta charset sniff (scan first 64KB)
    try:
        head = resp.content[:65536]
        # latin-1 maps byte values 0-255 one-to-one, so the ASCII markup stays
        # searchable whatever the real encoding is
        head_text = head.decode("latin-1", errors="ignore")
        for pattern in (
            # charset attribute inside a <meta ...> tag
            r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?",
            # last resort: any bare charset=... in the head
            r"charset\s*=\s*([a-z0-9_\-]+)",
        ):
            found = re.search(pattern, head_text, re.IGNORECASE)
            if found and _known(found.group(1)):
                return found.group(1)
    except Exception:
        # best effort: an unreadable body just means there is nothing to sniff
        pass

    # 4) heuristic -- read the attribute once; it may run charset detection
    #    over the whole body each time it is accessed
    try:
        guessed = resp.apparent_encoding
    except Exception:
        guessed = None
    if _known(guessed):
        return guessed

    # 5) fallback
    return "utf-8"
 def _decode_html(resp: requests.Response) -> str:
-    encoding = resp.encoding
+    encoding = _pick_html_encoding(resp)
    if not encoding or encoding.lower() == "iso-8859-1":
        encoding = resp.apparent_encoding
    try:
        return resp.content.decode(encoding or "utf-8", errors="replace")
    except Exception:
@@ -440,18 +482,23 @@ _OG_CACHE_LOCK = threading.Lock()
 # In-memory cache of extracted OG metadata, keyed by URL.
 _OG_CACHE: Dict[str, Dict[str, object]] = {}  # url -> {"ts": float, "meta": dict, "v": int}
 # Seconds a cached entry stays fresh; read from the OG_CACHE_TTL_SEC env var.
 _OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600"))  # default 1h
 # Entry count above which the cache is trimmed; read from the OG_CACHE_MAX env var (default 2000).
 _OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
 # Stamped on each entry as "v"; entries carrying a different value are not served from cache.
 _OG_CACHE_VERSION = 2
 def _extract_og_cached(url: str) -> dict:
    now = time.time()
    with _OG_CACHE_LOCK:
        ent = _OG_CACHE.get(url)
-        if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC:
+        if (
            ent
            and ent.get("v") == _OG_CACHE_VERSION
            and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
        ):
            return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
    meta = _extract_og(url)
    with _OG_CACHE_LOCK:
-        _OG_CACHE[url] = {"ts": now, "meta": meta}
+        _OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
        # 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
        if len(_OG_CACHE) > _OG_CACHE_MAX:
            items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))