From 710b1bb649ed1f127885ce7fc07c4b413c743f97 Mon Sep 17 00:00:00 2001
From: dsyoon
Date: Sat, 17 Jan 2026 20:11:59 +0900
Subject: [PATCH] perf: cache+parallelize OG fetch; fix encoding

---
 backend/app.py | 57 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/backend/app.py b/backend/app.py
index 1ee269f..0de7215 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
     author_id: Optional[str] = None
     author_email: Optional[str] = None
 
+def _pick_html_encoding(resp: requests.Response) -> str:
+    """HTML 인코딩 추정 순서(우선순위 높음 → 낮음)
+    1) Content-Type 헤더 charset
+    2) requests가 파싱한 resp.encoding (단, iso-8859-1 디폴트는 제외)
+    3) HTML 내부 <meta charset> / <meta http-equiv="Content-Type"> 태그
+    4) resp.apparent_encoding (chardet/charset_normalizer)
+    5) utf-8
+    """
+    # 1) header charset
+    ct = (resp.headers.get("content-type") or "").lower()
+    m = re.search(r"charset\s*=\s*([a-z0-9_\-]+)", ct, re.IGNORECASE)
+    if m:
+        return m.group(1)
+
+    # 2) requests chosen encoding (but ignore the common default)
+    enc = resp.encoding
+    if enc and enc.lower() != "iso-8859-1":
+        return enc
+
+    # 3) meta charset sniff (scan first 64KB)
+    try:
+        head = resp.content[:65536]
+        # decode as latin-1 to preserve byte values 0-255 one-to-one
+        head_text = head.decode("latin-1", errors="ignore")
+        m1 = re.search(r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?", head_text, re.IGNORECASE)
+        if m1:
+            return m1.group(1)
+        m2 = re.search(r"charset\s*=\s*([a-z0-9_\-]+)", head_text, re.IGNORECASE)
+        if m2:
+            return m2.group(1)
+    except Exception:
+        pass
+
+    # 4) heuristic
+    try:
+        if resp.apparent_encoding:
+            return resp.apparent_encoding
+    except Exception:
+        pass
+
+    # 5) fallback
+    return "utf-8"
+
+
 def _decode_html(resp: requests.Response) -> str:
-    encoding = resp.encoding
-    if not encoding or encoding.lower() == "iso-8859-1":
-        encoding = resp.apparent_encoding
+    encoding = _pick_html_encoding(resp)
     try:
         return resp.content.decode(encoding or "utf-8", errors="replace")
     except Exception:
@@ -440,18 +482,23 @@
 _OG_CACHE_LOCK = threading.Lock()
 _OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
 _OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
 _OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
+_OG_CACHE_VERSION = 2
 
 def _extract_og_cached(url: str) -> dict:
     now = time.time()
     with _OG_CACHE_LOCK:
         ent = _OG_CACHE.get(url)
-        if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC:
+        if (
+            ent
+            and ent.get("v") == _OG_CACHE_VERSION
+            and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
+        ):
             return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
 
     meta = _extract_og(url)
     with _OG_CACHE_LOCK:
-        _OG_CACHE[url] = {"ts": now, "meta": meta}
+        _OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
         # 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
         if len(_OG_CACHE) > _OG_CACHE_MAX:
             items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))