perf: cache+parallelize OG fetch; fix encoding
This commit is contained in:
@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
|
||||
author_id: Optional[str] = None
|
||||
author_email: Optional[str] = None
|
||||
|
||||
def _pick_html_encoding(resp: requests.Response) -> str:
    """Pick the most plausible text encoding for an HTML response.

    Candidates are tried in priority order (highest first):

    1. The ``charset`` parameter of the Content-Type header.
    2. ``resp.encoding`` as set on the response, except the ``iso-8859-1``
       value, which is treated as an uninformative default.
    3. ``<meta charset=...>`` or
       ``<meta http-equiv="content-type" ... charset=...>`` found in the
       first 64 KiB of the body.
    4. ``resp.apparent_encoding`` (the detector's guess).
    5. ``"utf-8"``.

    A candidate is returned only when Python's codec registry recognises it.
    An unknown label (for example ``charset=none``) is skipped so that a
    lower-priority candidate can still be used, instead of handing the caller
    a name that makes ``bytes.decode`` raise ``LookupError``.

    Args:
        resp: Response object whose ``headers``, ``encoding``, ``content``
            and ``apparent_encoding`` attributes are inspected.

    Returns:
        An encoding label known to the codec registry; ``"utf-8"`` when no
        candidate qualifies.
    """
    # Function-scope import keeps this block independent of the module's
    # top-level import list.
    import codecs

    def _is_known_codec(label: object) -> bool:
        """Return True when Python's codec registry recognises *label*."""
        if not isinstance(label, str) or not label:
            return False
        try:
            codecs.lookup(label)
        except (LookupError, ValueError):
            return False
        return True

    # 1) Header charset. The value may legally be quoted (charset="utf-8"),
    #    so an optional quote is tolerated before the label.
    content_type = (resp.headers.get("content-type") or "").lower()
    header_match = re.search(
        r"charset\s*=\s*[\"']?\s*([a-z0-9_\-]+)", content_type
    )
    if header_match and _is_known_codec(header_match.group(1)):
        return header_match.group(1)

    # 2) Encoding already set on the response, ignoring the common default.
    declared = resp.encoding
    if declared and declared.lower() != "iso-8859-1" and _is_known_codec(declared):
        return declared

    # 3) Sniff <meta> declarations in the first 64 KiB of the body.
    try:
        # latin-1 maps bytes 0-255 one-to-one, so this decode cannot fail and
        # leaves the ASCII markup searchable whatever the real encoding is.
        head_text = resp.content[:65536].decode("latin-1")
    except Exception:
        # Best effort: an unreadable body must not prevent the fallbacks below.
        head_text = ""
    sniff_patterns = (
        # charset attribute (or content="...; charset=...") inside a <meta> tag
        r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?",
        # bare "charset=" anywhere in the scanned prefix
        r"charset\s*=\s*([a-z0-9_\-]+)",
    )
    for pattern in sniff_patterns:
        for found in re.finditer(pattern, head_text, re.IGNORECASE):
            if _is_known_codec(found.group(1)):
                return found.group(1)

    # 4) Heuristic detection; kept best-effort because it reads the body.
    try:
        guessed = resp.apparent_encoding
    except Exception:
        guessed = None
    if guessed and _is_known_codec(guessed):
        return guessed

    # 5) Fallback.
    return "utf-8"
|
||||
|
||||
|
||||
def _decode_html(resp: requests.Response) -> str:
|
||||
encoding = resp.encoding
|
||||
if not encoding or encoding.lower() == "iso-8859-1":
|
||||
encoding = resp.apparent_encoding
|
||||
encoding = _pick_html_encoding(resp)
|
||||
try:
|
||||
return resp.content.decode(encoding or "utf-8", errors="replace")
|
||||
except Exception:
|
||||
@@ -440,18 +482,23 @@ _OG_CACHE_LOCK = threading.Lock()
|
||||
_OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
|
||||
_OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
|
||||
_OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
|
||||
_OG_CACHE_VERSION = 2
|
||||
|
||||
def _extract_og_cached(url: str) -> dict:
|
||||
now = time.time()
|
||||
with _OG_CACHE_LOCK:
|
||||
ent = _OG_CACHE.get(url)
|
||||
if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC:
|
||||
if (
|
||||
ent
|
||||
and ent.get("v") == _OG_CACHE_VERSION
|
||||
and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
|
||||
):
|
||||
return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
|
||||
|
||||
meta = _extract_og(url)
|
||||
|
||||
with _OG_CACHE_LOCK:
|
||||
_OG_CACHE[url] = {"ts": now, "meta": meta}
|
||||
_OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
|
||||
# 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
|
||||
if len(_OG_CACHE) > _OG_CACHE_MAX:
|
||||
items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))
|
||||
|
||||
Reference in New Issue
Block a user