perf: cache+parallelize OG fetch; fix encoding

This commit is contained in:
dsyoon
2026-01-17 20:11:59 +09:00
parent 8628b25b9b
commit 710b1bb649

View File

@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
author_id: Optional[str] = None author_id: Optional[str] = None
author_email: Optional[str] = None author_email: Optional[str] = None
def _pick_html_encoding(resp: requests.Response) -> str:
    """Pick the encoding name to use when decoding an HTML response body.

    Candidates are tried in priority order (highest first); the first one
    that names a codec Python can actually load is returned:

    1. The ``charset`` parameter of the ``Content-Type`` header
       (bare or quoted value).
    2. ``resp.encoding`` as parsed by requests, except ``iso-8859-1``,
       which is treated here as a library default rather than a real
       declaration.
    3. A charset declared inside the first 64 KiB of the body: first in a
       ``<meta ...>`` tag, then any ``charset=`` token.
    4. ``resp.apparent_encoding`` (statistical detection).
    5. ``"utf-8"`` as the final fallback.

    Args:
        resp: Response whose ``headers``, ``encoding``, ``content`` and
            ``apparent_encoding`` are inspected.

    Returns:
        The encoding name as declared/detected (not normalized), or
        ``"utf-8"`` when no usable candidate is found.
    """
    # Local import so this block does not depend on the module's import list.
    import codecs

    def _is_known_codec(name: object) -> bool:
        # A declared charset that Python cannot load would only make the
        # caller's ``bytes.decode`` raise LookupError, so skip such values
        # and let a lower-priority candidate win instead.
        if not name or not isinstance(name, str):
            return False
        try:
            codecs.lookup(name)
        except (LookupError, TypeError, ValueError):
            return False
        return True

    # 1) header charset (the value may legally be quoted: charset="utf-8")
    content_type = (resp.headers.get("content-type") or "").lower()
    header_match = re.search(
        r"charset\s*=\s*[\"']?\s*([a-z0-9_\-]+)", content_type
    )
    if header_match and _is_known_codec(header_match.group(1)):
        return header_match.group(1)

    # 2) encoding chosen by requests (but ignore the common default)
    declared = resp.encoding
    if declared and declared.lower() != "iso-8859-1" and _is_known_codec(declared):
        return declared

    # 3) meta charset sniff (scan first 64KB)
    try:
        head = resp.content[:65536]
        # decode as latin-1 to preserve byte values 0-255 one-to-one
        head_text = head.decode("latin-1", errors="ignore")
        sniff_patterns = (
            r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?",
            r"charset\s*=\s*([a-z0-9_\-]+)",
        )
        for pattern in sniff_patterns:
            found = re.search(pattern, head_text, re.IGNORECASE)
            if found and _is_known_codec(found.group(1)):
                return found.group(1)
    except Exception:
        # Deliberate best-effort: failing to read or scan the body must not
        # prevent falling back to the remaining candidates.
        pass

    # 4) heuristic detection
    try:
        guessed = resp.apparent_encoding
        if _is_known_codec(guessed):
            return guessed
    except Exception:
        # Deliberate best-effort: detection failure falls through to utf-8.
        pass

    # 5) fallback
    return "utf-8"
def _decode_html(resp: requests.Response) -> str: def _decode_html(resp: requests.Response) -> str:
encoding = resp.encoding encoding = _pick_html_encoding(resp)
if not encoding or encoding.lower() == "iso-8859-1":
encoding = resp.apparent_encoding
try: try:
return resp.content.decode(encoding or "utf-8", errors="replace") return resp.content.decode(encoding or "utf-8", errors="replace")
except Exception: except Exception:
@@ -440,18 +482,23 @@ _OG_CACHE_LOCK = threading.Lock()
_OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict} _OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
_OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h _OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
_OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000")) _OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
_OG_CACHE_VERSION = 2
def _extract_og_cached(url: str) -> dict: def _extract_og_cached(url: str) -> dict:
now = time.time() now = time.time()
with _OG_CACHE_LOCK: with _OG_CACHE_LOCK:
ent = _OG_CACHE.get(url) ent = _OG_CACHE.get(url)
if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC: if (
ent
and ent.get("v") == _OG_CACHE_VERSION
and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
):
return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url} return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
meta = _extract_og(url) meta = _extract_og(url)
with _OG_CACHE_LOCK: with _OG_CACHE_LOCK:
_OG_CACHE[url] = {"ts": now, "meta": meta} _OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
# 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리) # 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
if len(_OG_CACHE) > _OG_CACHE_MAX: if len(_OG_CACHE) > _OG_CACHE_MAX:
items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0))) items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))