perf: cache+parallelize OG fetch; fix encoding
This commit is contained in:
@@ -405,10 +405,52 @@ class AiNewsCreateDTO(BaseModel):
|
|||||||
author_id: Optional[str] = None
|
author_id: Optional[str] = None
|
||||||
author_email: Optional[str] = None
|
author_email: Optional[str] = None
|
||||||
|
|
||||||
|
def _pick_html_encoding(resp: requests.Response) -> str:
    """Pick the most plausible text encoding for an HTML response.

    Candidates are tried in priority order (highest first):

    1. ``charset`` parameter of the ``Content-Type`` header.
    2. ``resp.encoding`` as reported by the response object, unless it is
       ``iso-8859-1`` (treated here as an uninformative default).
    3. A charset declared in the markup itself, i.e.
       ``<meta charset=...>`` or
       ``<meta http-equiv="content-type" ... charset=...>``, searched in the
       first 64 KiB of the body.
    4. ``resp.apparent_encoding`` (content-based detection).
    5. ``"utf-8"``.

    A candidate is accepted only if Python has a codec registered for it, so
    a misspelled or unknown label falls through to the next source instead of
    being handed to ``bytes.decode`` where it would raise ``LookupError``.

    Args:
        resp: Response object exposing ``headers``, ``encoding``, ``content``
            and ``apparent_encoding``.

    Returns:
        An encoding label usable with ``bytes.decode``. The label is returned
        as found (the header value is lower-cased); it is not normalised to
        the canonical codec name.
    """
    # Function-scope import keeps this block self-contained.
    import codecs

    def _is_known(label) -> bool:
        """Return True when *label* names a codec Python can look up."""
        try:
            codecs.lookup(label)
        except (LookupError, TypeError, ValueError):
            return False
        return True

    # 1) Header charset. Quotes around the value are legal in HTTP
    #    (charset="utf-8"), so they are skipped before capturing the label.
    content_type = (resp.headers.get("content-type") or "").lower()
    header_match = re.search(
        r"charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)", content_type
    )
    if header_match and _is_known(header_match.group(1)):
        return header_match.group(1)

    # 2) Encoding chosen by the response object, ignoring the common default.
    enc = resp.encoding
    if enc and enc.lower() != "iso-8859-1" and _is_known(enc):
        return enc

    # 3) Charset declared inside the document (first 64 KiB only).
    #    Accessing the body is best-effort: any failure simply skips this
    #    step rather than aborting encoding detection.
    try:
        # latin-1 maps bytes 0-255 one-to-one, so decoding never fails and
        # ASCII markup stays searchable regardless of the real encoding.
        head_text = (resp.content or b"")[:65536].decode("latin-1")
    except Exception:
        head_text = ""

    sniff_patterns = (
        # Charset attribute/parameter inside a <meta ...> tag.
        r"<meta[^>]+charset\s*=\s*['\"]?\s*([a-z0-9_\-]+)\s*['\"]?",
        # Looser fallback: any unquoted charset=... in the scanned prefix.
        r"charset\s*=\s*([a-z0-9_\-]+)",
    )
    for pattern in sniff_patterns:
        for found in re.finditer(pattern, head_text, re.IGNORECASE):
            if _is_known(found.group(1)):
                return found.group(1)

    # 4) Content-based detection; also best-effort because it needs the body.
    try:
        guessed = resp.apparent_encoding
    except Exception:
        guessed = None
    if guessed and _is_known(guessed):
        return guessed

    # 5) Fallback.
    return "utf-8"
|
||||||
|
|
||||||
|
|
||||||
def _decode_html(resp: requests.Response) -> str:
|
def _decode_html(resp: requests.Response) -> str:
|
||||||
encoding = resp.encoding
|
encoding = _pick_html_encoding(resp)
|
||||||
if not encoding or encoding.lower() == "iso-8859-1":
|
|
||||||
encoding = resp.apparent_encoding
|
|
||||||
try:
|
try:
|
||||||
return resp.content.decode(encoding or "utf-8", errors="replace")
|
return resp.content.decode(encoding or "utf-8", errors="replace")
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -440,18 +482,23 @@ _OG_CACHE_LOCK = threading.Lock()
|
|||||||
_OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
|
_OG_CACHE: Dict[str, Dict[str, object]] = {} # url -> {"ts": float, "meta": dict}
|
||||||
_OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
|
_OG_CACHE_TTL_SEC = float(os.getenv("OG_CACHE_TTL_SEC", "3600")) # default 1h
|
||||||
_OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
|
_OG_CACHE_MAX = int(os.getenv("OG_CACHE_MAX", "2000"))
|
||||||
|
_OG_CACHE_VERSION = 2
|
||||||
|
|
||||||
def _extract_og_cached(url: str) -> dict:
|
def _extract_og_cached(url: str) -> dict:
|
||||||
now = time.time()
|
now = time.time()
|
||||||
with _OG_CACHE_LOCK:
|
with _OG_CACHE_LOCK:
|
||||||
ent = _OG_CACHE.get(url)
|
ent = _OG_CACHE.get(url)
|
||||||
if ent and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC:
|
if (
|
||||||
|
ent
|
||||||
|
and ent.get("v") == _OG_CACHE_VERSION
|
||||||
|
and (now - float(ent.get("ts", 0))) < _OG_CACHE_TTL_SEC
|
||||||
|
):
|
||||||
return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
|
return ent.get("meta") or {"title": "", "description": "", "image": "", "url": url}
|
||||||
|
|
||||||
meta = _extract_og(url)
|
meta = _extract_og(url)
|
||||||
|
|
||||||
with _OG_CACHE_LOCK:
|
with _OG_CACHE_LOCK:
|
||||||
_OG_CACHE[url] = {"ts": now, "meta": meta}
|
_OG_CACHE[url] = {"ts": now, "meta": meta, "v": _OG_CACHE_VERSION}
|
||||||
# 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
|
# 단순한 크기 제한 (초과 시 오래된 엔트리부터 정리)
|
||||||
if len(_OG_CACHE) > _OG_CACHE_MAX:
|
if len(_OG_CACHE) > _OG_CACHE_MAX:
|
||||||
items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))
|
items = sorted(_OG_CACHE.items(), key=lambda kv: float(kv[1].get("ts", 0)))
|
||||||
|
|||||||
Reference in New Issue
Block a user