"""Link-board Flask app.

Stores submitted URLs in PostgreSQL and serves them back with Open Graph /
Twitter-card metadata scraped on demand (with an in-process TTL cache).
"""

import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from urllib.parse import urlparse

import psycopg2
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from flask import Flask, jsonify, render_template, request
from psycopg2 import sql

load_dotenv()

app = Flask(__name__, static_folder="static", template_folder="templates")

DEFAULT_DESCRIPTION = "설명 없음"
DEFAULT_IMAGE = "/static/placeholder.svg"
CACHE_TTL_SECONDS = int(os.getenv("CACHE_TTL_SECONDS", "3600"))
FAILED_TTL_SECONDS = int(os.getenv("FAILED_TTL_SECONDS", "300"))

# In-process metadata cache: url -> {"data": ..., "expires_at": ..., "ok": ...}.
# Guarded by _CACHE_LOCK because /links populates it from a thread pool, and
# bounded so a long-running process cannot grow it without limit.
METADATA_CACHE = {}
_CACHE_LOCK = threading.Lock()
_CACHE_MAX_ENTRIES = int(os.getenv("CACHE_MAX_ENTRIES", "2048"))

# Inline SVG fallback the frontend can show when no og:image is available.
# NOTE(review): the original literal had its SVG markup stripped, leaving an
# invalid data URI ("data:image/svg+xml;utf8,No%20Image"); reconstructed as a
# minimal valid, URL-encoded placeholder with the same "No Image" label.
PLACEHOLDER_DATA_URI = (
    "data:image/svg+xml;utf8,"
    "%3Csvg xmlns='http://www.w3.org/2000/svg' width='320' height='180' "
    "viewBox='0 0 320 180'%3E"
    "%3Crect width='320' height='180' fill='%23e9ecef'/%3E"
    "%3Ctext x='160' y='95' text-anchor='middle' fill='%23868e96' "
    "font-family='sans-serif' font-size='16'%3ENo%20Image%3C/text%3E"
    "%3C/svg%3E"
)

DEFAULT_PAGE_SIZE = int(os.getenv("DEFAULT_PAGE_SIZE", "30"))
MAX_PAGE_SIZE = int(os.getenv("MAX_PAGE_SIZE", "60"))


def get_db_connection():
    """Open a new PostgreSQL connection from DB_* environment variables."""
    return psycopg2.connect(
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT"),
        dbname=os.getenv("DB_NAME"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
    )


def normalize_url(raw_url: str) -> str:
    """Return *raw_url* with an https:// scheme prepended if it has none."""
    if not raw_url:
        return raw_url
    parsed = urlparse(raw_url)
    if parsed.scheme:
        return raw_url
    return f"https://{raw_url}"


def extract_meta(soup: BeautifulSoup, property_name: str, name: str) -> str:
    """Return the content of a <meta property=...> tag, falling back to
    <meta name=...>; empty string when neither is present."""
    tag = soup.find("meta", property=property_name)
    if tag and tag.get("content"):
        return tag.get("content").strip()
    tag = soup.find("meta", attrs={"name": name})
    if tag and tag.get("content"):
        return tag.get("content").strip()
    return ""


def extract_fallback_description(soup: BeautifulSoup) -> str:
    """Return the first reasonably long <p> text (truncated to 180 chars),
    used when a page declares no description meta tags."""
    for paragraph in soup.find_all("p"):
        text = paragraph.get_text(" ", strip=True)
        if len(text) >= 40:
            return text[:180]
    return ""


def _cache_get(url: str):
    """Return the cached metadata for *url* if present and unexpired."""
    now = time.time()
    with _CACHE_LOCK:
        cached = METADATA_CACHE.get(url)
        if cached and cached["expires_at"] > now:
            return cached["data"]
    return None


def _cache_put(url: str, data: dict, ttl: int, ok: bool) -> None:
    """Store a metadata result, evicting expired entries when full.

    If pruning expired entries is not enough, the cache is cleared outright —
    crude, but keeps memory bounded for a long-running process.
    """
    now = time.time()
    with _CACHE_LOCK:
        if len(METADATA_CACHE) >= _CACHE_MAX_ENTRIES:
            for key in [k for k, v in METADATA_CACHE.items() if v["expires_at"] <= now]:
                METADATA_CACHE.pop(key, None)
            if len(METADATA_CACHE) >= _CACHE_MAX_ENTRIES:
                METADATA_CACHE.clear()
        METADATA_CACHE[url] = {"data": data, "expires_at": now + ttl, "ok": ok}


def fetch_metadata(url: str) -> dict:
    """Fetch title/description/image metadata for *url*.

    Successful lookups are cached for CACHE_TTL_SECONDS; failures are cached
    (with fallback values) for the shorter FAILED_TTL_SECONDS so a flaky site
    is retried soon. Never raises: any error yields the fallback dict.
    """
    fallback = {
        "title": url,
        "description": DEFAULT_DESCRIPTION,
        "image": DEFAULT_IMAGE,
    }
    cached = _cache_get(url)
    if cached is not None:
        return cached
    try:
        response = requests.get(
            url,
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/121.0.0.0 Safari/537.36"
                )
            },
            timeout=6,
        )
        response.raise_for_status()
        # Parse raw bytes so BeautifulSoup can honor the document's declared
        # charset instead of requests' header-based guess.
        soup = BeautifulSoup(response.content, "html.parser")
        resolved_url = response.url or url
        title = (
            extract_meta(soup, "og:title", "twitter:title")
            or extract_meta(soup, "twitter:title", "title")
            or (soup.title.string.strip() if soup.title and soup.title.string else "")
            or resolved_url
        )
        description = extract_meta(soup, "og:description", "description") or extract_meta(
            soup, "twitter:description", "description"
        )
        if not description:
            description = extract_fallback_description(soup) or DEFAULT_DESCRIPTION
        image = (
            extract_meta(soup, "og:image", "twitter:image")
            or extract_meta(soup, "twitter:image", "image")
            or DEFAULT_IMAGE
        )
        data = {"title": title, "description": description, "image": image}
        _cache_put(url, data, CACHE_TTL_SECONDS, ok=True)
        return data
    except Exception:
        # Deliberate best-effort: scraping failures must never break the API.
        _cache_put(url, fallback, FAILED_TTL_SECONDS, ok=False)
        return fallback


def _clamp_int(value, default: int, minimum: int, maximum: int) -> int:
    """Parse *value* as an int, clamped to [minimum, maximum]; *default* on failure."""
    try:
        parsed = int(value)
    except Exception:
        return default
    return max(minimum, min(parsed, maximum))


def _serialize_created_at(created_at) -> str:
    """Render a created_at DB value as an ISO string (str() for non-datetimes)."""
    if isinstance(created_at, datetime):
        return created_at.isoformat()
    return str(created_at)


def _links_table() -> sql.Identifier:
    """Return the links table name (TABLE env var) as a safe SQL identifier."""
    return sql.Identifier(os.getenv("TABLE", "news_link"))


def fetch_links_page_from_db(limit: int, offset: int):
    """Return up to *limit* (id, url, created_at) rows, newest first.

    The table name comes from the TABLE env var; it is interpolated via
    sql.Identifier rather than string formatting to avoid SQL injection.
    """
    query = sql.SQL(
        "SELECT id, url, created_at FROM {} ORDER BY created_at DESC OFFSET %s LIMIT %s"
    ).format(_links_table())
    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute(query, (offset, limit))
            return cur.fetchall()


@app.get("/")
def index():
    """Render the main page with the first page of links (metadata left blank;
    the frontend fills it in via /links)."""
    links = []
    error_message = ""
    try:
        rows = fetch_links_page_from_db(DEFAULT_PAGE_SIZE, 0)
        for link_id, url, created_at in rows:
            links.append(
                {
                    "id": link_id,
                    "url": url,
                    "created_at": _serialize_created_at(created_at),
                    "title": "",
                    "description": "",
                    "image": "",
                }
            )
    except Exception as exc:
        error_message = f"DB 조회 실패: {exc}"
    return render_template(
        "index.html",
        links=links,
        error_message=error_message,
        placeholder_data_uri=PLACEHOLDER_DATA_URI,
        default_image=DEFAULT_IMAGE,
    )


@app.get("/links")
def get_links():
    """Return a JSON page of links with scraped metadata.

    Query params: limit (1..MAX_PAGE_SIZE), offset (>= 0). Fetches limit+1
    rows to compute has_more without a COUNT query; metadata is scraped
    concurrently with a small thread pool.
    """
    limit = _clamp_int(
        request.args.get("limit"), DEFAULT_PAGE_SIZE, minimum=1, maximum=MAX_PAGE_SIZE
    )
    offset = _clamp_int(request.args.get("offset"), 0, minimum=0, maximum=10_000_000)
    try:
        rows_plus_one = fetch_links_page_from_db(limit + 1, offset)
    except Exception as exc:
        return jsonify({"error": "DB 조회 실패", "detail": str(exc)}), 500
    has_more = len(rows_plus_one) > limit
    rows = rows_plus_one[:limit]
    urls = [url for _, url, _ in rows]
    metas = []
    if urls:
        with ThreadPoolExecutor(max_workers=min(8, len(urls))) as executor:
            metas = list(executor.map(fetch_metadata, urls))
    results = []
    for (link_id, url, created_at), meta in zip(rows, metas):
        results.append(
            {
                "id": link_id,
                "url": url,
                "created_at": _serialize_created_at(created_at),
                **meta,
            }
        )
    return jsonify(
        {
            "items": results,
            "limit": limit,
            "offset": offset,
            "next_offset": offset + len(results),
            "has_more": has_more,
        }
    )


@app.post("/links")
def add_link():
    """Store a new URL (normalized to https:// if schemeless) and return it
    with freshly scraped metadata."""
    data = request.get_json(silent=True) or {}
    raw_url = (data.get("url") or "").strip()
    if not raw_url:
        return jsonify({"error": "URL을 입력해주세요."}), 400
    url = normalize_url(raw_url)
    query = sql.SQL("INSERT INTO {} (url) VALUES (%s) RETURNING id, created_at").format(
        _links_table()
    )
    try:
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(query, (url,))
                link_id, created_at = cur.fetchone()
            conn.commit()
    except Exception as exc:
        return jsonify({"error": "DB 저장 실패", "detail": str(exc)}), 500
    meta = fetch_metadata(url)
    return jsonify(
        {
            "id": link_id,
            "url": url,
            "created_at": _serialize_created_at(created_at),
            **meta,
        }
    )


if __name__ == "__main__":
    # debug=True is for local development only; disable it behind a real server.
    app.run(host="0.0.0.0", port=8021, debug=True)