diff --git a/backend/app.py b/backend/app.py
index 1ee269f..0de7215 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
author_id: Optional[str] = None
author_email: Optional[str] = None
+def _pick_html_encoding(resp: requests.Response) -> str:
+ """HTML 인코딩 추정 순서(우선순위 높음 → 낮음)
+ 1) Content-Type 헤더 charset
+ 2) requests가 파싱한 resp.encoding (단, iso-8859-1 디폴트는 제외)
+    3) HTML 내부 <meta charset> / <meta http-equiv="Content-Type"> 선언
+ 4) resp.apparent_encoding (chardet/charset_normalizer)
+ 5) utf-8
+ """
+ # 1) header charset
+ ct = (resp.headers.get("content-type") or "").lower()
+ m = re.search(r"charset\s*=\s*([a-z0-9_\-]+)", ct, re.IGNORECASE)
+ if m:
+ return m.group(1)
+
+ # 2) requests chosen encoding (but ignore the common default)
+ enc = resp.encoding
+ if enc and enc.lower() != "iso-8859-1":
+ return enc
+
+ # 3) meta charset sniff (scan first 64KB)
+ try:
+ head = resp.content[:65536]
+ # decode as latin-1 to preserve byte values 0-255 one-to-one
+ head_text = head.decode("latin-1", errors="ignore")
+        m1 = re.search(r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?", head_text, re.IGNORECASE)
+ if m1:
+ return m1.group(1)
+ m2 = re.search(r"charset\s*=\s*([a-z0-9_\-]+)", head_text, re.IGNORECASE)
+ if m2:
+ return m2.group(1)
+ except Exception:
+ pass
+
+ # 4) heuristic
+ try:
+ if resp.apparent_encoding:
+ return resp.apparent_encoding
+ except Exception:
+ pass
+
+ # 5) fallback
+ return "utf-8"
+
+
def _decode_html(resp: requests.Response) -> str:
- encoding = resp.encoding
- if not encoding or encoding.lower() == "iso-8859-1":
- encoding = resp.apparent_encoding
+ encoding = _pick_html_encoding(resp)
try:
return resp.content.decode(encoding or "utf-8", errors="replace")
except Exception:
@@ -440,18 +482,23 @@ _OG_CACHE_LOCK = threading.Lock()
_OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
_OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
_OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
+_OG_CACHE_VERSION = 2
def _extract_og_cached(url: str) -> dict:
now = time.time()
with _OG_CACHE_LOCK:
ent = _OG_CACHE.get(url)
- if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC:
+ if (
+ ent
+ and ent.get("v") == _OG_CACHE_VERSION
+ and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
+ ):
return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
meta = _extract_og(url)
with _OG_CACHE_LOCK:
- _OG_CACHE[url] = {"ts": now, "meta": meta}
+ _OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
# 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
if len(_OG_CACHE) > _OG_CACHE_MAX:
items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))