# DeepLottery/DataCrawler.py

# Import the libraries used for web requests.
import os
import time
import requests
# Import the library for working with the JSON format.
import json
from datetime import datetime, timedelta
import random
import socket

import urllib3

# Suppress urllib3's InsecureRequestWarning: _fetch_draw falls back to
# requests made with verify=False, and each of those would otherwise warn.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

try:
    from TelegramBot import TelegramBot
except ModuleNotFoundError:
    # The TelegramBot module is not importable: fall back to a stand-in
    # exposing the same constructor and sendMsg() but doing nothing.
    class TelegramBot:
        """No-op substitute used when the TelegramBot module is missing."""

        def __init__(self, enable=True):
            """Accept the ``enable`` flag and ignore it."""

        def sendMsg(self, msg):
            """Discard ``msg`` without sending anything."""

# API endpoints tried for each draw; "{}" is replaced by the draw number.
_LOTTO_URLS = (
    "https://www.dhlottery.co.kr/common.do?method=getLottoNumber&drwNo={}",
    "https://dhlottery.co.kr/common.do?method=getLottoNumber&drwNo={}",
)

# Tunables; each one can be overridden through the named environment variable.
# Per-request timeout passed to the HTTP session.
_REQUEST_TIMEOUT = float(os.getenv("LOTTO_REQUEST_TIMEOUT", "12"))
# Number of retry rounds attempted for a single draw.
_FETCH_RETRIES_PER_DRAW = int(os.getenv("LOTTO_FETCH_RETRIES", "3"))
# Base delay of the exponential backoff between retry rounds.
_BACKOFF_BASE_SECONDS = float(os.getenv("LOTTO_BACKOFF_BASE", "0.7"))
# Consecutive failed draws after which a sync run stops fetching.
_MAX_CONSECUTIVE_FETCH_FAILURES = int(os.getenv("LOTTO_MAX_CONSEC_FAIL", "8"))
# Timeout of the TCP reachability probe.
_CONNECTION_PROBE_TIMEOUT = float(os.getenv("LOTTO_PROBE_TIMEOUT", "3"))

# Headers installed on the HTTP session so requests look like browser XHR calls.
_BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    ),
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://www.dhlottery.co.kr/gameResult.do?method=byWin",
    "X-Requested-With": "XMLHttpRequest",
}


def _ssl_verify_arg():
    """Return the value to pass as ``verify=`` for HTTPS requests.

    Yields the CA bundle path reported by ``certifi`` when that package can
    be imported, otherwise ``True``.
    """
    try:
        import certifi as ca_store

        verify = ca_store.where()
    except ImportError:
        verify = True
    return verify

# Declare the Python class that collects lottery data.
class DataCrawler:
    """Collect lottery draw results from the dhlottery API into local files.

    Two sibling files are maintained: ``<base>.json`` holds one JSON record
    per line and ``<base>.txt`` holds one comma-separated line per draw (the
    draw number followed by the ``drwtNo1``..``drwtNo6`` and ``bnusNo``
    fields of the record).
    """

    bot = None

    # Record fields written to the txt file after the draw number, in order.
    _NUMBER_KEYS = (
        "drwtNo1",
        "drwtNo2",
        "drwtNo3",
        "drwtNo4",
        "drwtNo5",
        "drwtNo6",
        "bnusNo",
    )
    _TXT_LINE_FORMAT = "%d,%d,%d,%d,%d,%d,%d,%d\n"

    def __init__(self):
        """Create the notifier and an HTTP session with browser-like headers."""
        self.bot = TelegramBot()
        self._session = requests.Session()
        self._session.headers.update(_BROWSER_HEADERS)
        # Reason of the most recent failed _fetch_draw call ("" after success).
        self._last_fetch_error = ""

    @classmethod
    def _format_txt_line(cls, drw_no, record):
        """Return the txt line for ``record`` stored under draw ``drw_no``.

        Raises KeyError when a number field is missing and TypeError /
        ValueError when a value cannot be formatted as an integer.
        """
        values = (drw_no,) + tuple(record[key] for key in cls._NUMBER_KEYS)
        return cls._TXT_LINE_FORMAT % values

    @classmethod
    def _is_writable_record(cls, record):
        """Return True when ``record`` can be rendered as a txt line."""
        try:
            cls._format_txt_line(0, record)
        except (KeyError, TypeError, ValueError, OverflowError):
            return False
        return True

    @staticmethod
    def _replace_file(path, lines):
        """Write ``lines`` to ``path`` via a temporary file and ``os.replace``.

        The previous content of ``path`` stays in place until the new content
        has been written completely.
        """
        tmp_path = path + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as fp:
            fp.writelines(lines)
        os.replace(tmp_path, path)

    def _notify(self, msg):
        """Send ``msg`` through the bot; a failing notifier never aborts a run."""
        try:
            self.bot.sendMsg(msg)
        except Exception as ex:
            # Notification is best effort, but the failure is made visible.
            print("[Lottery Crawler] notification failed: {}".format(ex))

    def _can_reach_lottery_host(self):
        """
        Quickly check whether a TCP connection to the API host can be opened.

        This is not a full guarantee, but it detects a completely blocked
        network early and avoids needless waiting. Returns True as soon as
        one host accepts a connection on port 443, False otherwise.
        """
        for host in ("www.dhlottery.co.kr", "dhlottery.co.kr"):
            try:
                with socket.create_connection((host, 443), timeout=_CONNECTION_PROBE_TIMEOUT):
                    return True
            except OSError:
                continue
        return False

    def _request_draw(self, method, url, verify):
        """Perform one HTTP attempt for a draw.

        Returns ``(record, None)`` on success and ``(None, reason)`` when the
        request failed, the body was not a JSON object, the API did not
        report success, or the record lacks usable number fields.
        """
        try:
            res = self._session.request(
                method,
                url,
                timeout=_REQUEST_TIMEOUT,
                verify=verify,
            )
            if res.status_code != 200:
                return None, "http {}".format(res.status_code)
            text = res.text.strip()
            if not text.startswith("{"):
                return None, "non-json response"
            result = json.loads(text)
        except (requests.RequestException, ValueError) as ex:
            # json.JSONDecodeError is a ValueError subclass.
            return None, str(ex)
        if not isinstance(result, dict) or result.get("returnValue") != "success":
            rv = result.get("returnValue") if isinstance(result, dict) else "unknown"
            return None, "api returnValue={}".format(rv)
        if not self._is_writable_record(result):
            # A "success" record without its number fields cannot be stored.
            return None, "malformed record"
        return result, None

    def _fetch_draw(self, drw_no):
        """Fetch a single draw from the dhlottery API; return None on failure.

        Every retry round tries each URL with each TLS verification option
        and each HTTP method. After a failure the last reason is available
        in ``self._last_fetch_error``.
        """
        self._last_fetch_error = ""
        draw_no = int(drw_no)
        # NOTE(security): the second option disables TLS certificate
        # verification as a fallback, so a response obtained that way is not
        # authenticated. Kept because callers rely on the fallback.
        verify_options = (_ssl_verify_arg(), False)
        last_error = "unknown"
        for attempt in range(1, _FETCH_RETRIES_PER_DRAW + 1):
            for raw_url in _LOTTO_URLS:
                url = raw_url.format(draw_no)
                for verify in verify_options:
                    for method in ("POST", "GET"):
                        record, error = self._request_draw(method, url, verify)
                        if record is not None:
                            return record
                        last_error = error
            if attempt < _FETCH_RETRIES_PER_DRAW:
                # Exponential backoff plus jitter to ease transient congestion.
                delay = _BACKOFF_BASE_SECONDS * (2 ** (attempt - 1)) + random.uniform(0, 0.25)
                time.sleep(delay)
        self._last_fetch_error = last_error
        return None

    def _append_draw_files(self, lottoHistoryFile, result):
        """Append one successful response to the txt and json files.

        Both lines are rendered before either file is opened, so a record
        with missing fields raises without leaving the two files out of step.
        """
        txt_line = self._format_txt_line(result["drwNo"], result)
        json_line = json.dumps(result, ensure_ascii=False) + "\n"
        with open(lottoHistoryFile + ".json", "a", encoding="utf-8") as json_fp:
            json_fp.write(json_line)
        with open(lottoHistoryFile + ".txt", "a", encoding="utf-8") as text_fp:
            text_fp.write(txt_line)

    def _read_last_draw_from_json(self, json_path):
        """Return the ``drwNo`` of the last parsable JSONL line, or None.

        None is returned when the file is missing or empty, or when the last
        parsable line is not a JSON object with ``returnValue == "success"``.
        """
        if not os.path.isfile(json_path) or os.path.getsize(json_path) == 0:
            return None
        last_json = None
        with open(json_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    last_json = json.loads(line)
                except json.JSONDecodeError:
                    continue
        # A line such as "[1, 2]" parses but has no .get(); treat it as invalid.
        if not isinstance(last_json, dict) or last_json.get("returnValue") != "success":
            return None
        return last_json.get("drwNo")

    def _read_draw_map_from_json(self, json_path):
        """
        Read the whole JSONL file into a ``drwNo -> record`` map.

        Broken lines, non-success records and records that could not be
        written back to the txt file are dropped; for duplicated draw numbers
        the last valid record wins. A missing or empty file gives ``{}``.
        """
        draw_map = {}
        if not os.path.isfile(json_path) or os.path.getsize(json_path) == 0:
            return draw_map

        with open(json_path, "r", encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if (
                    isinstance(data, dict)
                    and data.get("returnValue") == "success"
                    and isinstance(data.get("drwNo"), int)
                    and self._is_writable_record(data)
                ):
                    draw_map[data["drwNo"]] = data

        return draw_map

    def _write_draw_map_files(self, lottoHistoryFile, draw_map):
        """
        Regenerate the json and txt files in ascending ``drwNo`` order.

        All lines are rendered in memory first and each file is then swapped
        in with ``os.replace``; a record that cannot be rendered raises
        before the existing files are touched.
        """
        json_lines = []
        txt_lines = []
        for drw_no in sorted(draw_map):
            record = draw_map[drw_no]
            json_lines.append(json.dumps(record, ensure_ascii=False) + "\n")
            txt_lines.append(self._format_txt_line(drw_no, record))
        self._replace_file(lottoHistoryFile + ".json", json_lines)
        self._replace_file(lottoHistoryFile + ".txt", txt_lines)

    def _get_last_week_draw_date(self, now=None):
        """
        Return the date of the most recent Saturday counted as drawn.

        Example: run on Friday 2026-05-08, the result is 2026-05-02. On a
        Saturday before 20:00 the previous Saturday is returned.

        ``now`` defaults to ``datetime.now()``.
        NOTE(review): that is the host's local clock; presumably the host is
        expected to run in the lottery's time zone - confirm.
        """
        if now is None:
            now = datetime.now()
        days_since_saturday = (now.weekday() - 5) % 7
        latest_saturday = now.date() - timedelta(days=days_since_saturday)
        # On a Saturday before 20:00, target the previous Saturday instead.
        if days_since_saturday == 0 and now.hour < 20:
            latest_saturday -= timedelta(days=7)
        return latest_saturday

    def _estimate_target_draw_no(self, draw_map, now=None):
        """
        Estimate the newest draw number this run should hold.

        The ``drwNoDate`` of the highest stored draw is compared with the
        date from ``_get_last_week_draw_date`` and one draw is added per full
        week in between. Returns None for an empty map, and the highest
        stored number when its date is missing, null or unparsable.
        """
        if not draw_map:
            return None

        last_no = max(draw_map)
        last_date_str = draw_map[last_no].get("drwNoDate", "")
        try:
            last_date = datetime.strptime(last_date_str, "%Y-%m-%d").date()
        except (TypeError, ValueError):
            # strptime raises TypeError for a null / non-string date.
            return last_no

        target_date = self._get_last_week_draw_date(now)
        week_gap = (target_date - last_date).days // 7
        return last_no + max(week_gap, 0)

    # Collect lottery results and save them to files.
    # lottoHistoryFile: path of the history files, without extension.
    def craw(self, lottoHistoryFile, drwNo=None):
        """Fetch draw results and store them.

        With ``drwNo`` given, that single draw is fetched and appended to the
        existing files. Without it, both files are rebuilt from draw 1 up to
        the first draw that cannot be fetched.

        Returns True when at least the requested / first draw was stored and
        False otherwise; on False the files are left untouched.
        """
        if drwNo is not None:
            result = self._fetch_draw(drwNo)
            if result is None:
                return False
            self._append_draw_files(lottoHistoryFile, result)
            return True

        # Fetch the first draw before opening the files in "w" mode, so an
        # unreachable API does not wipe an existing history.
        idx = 1
        result = self._fetch_draw(idx)
        if result is None:
            return False

        json_path = lottoHistoryFile + ".json"
        text_path = lottoHistoryFile + ".txt"
        with open(json_path, "w", encoding="utf-8") as json_fp, open(
            text_path, "w", encoding="utf-8"
        ) as text_fp:
            while result is not None:
                json_fp.write(json.dumps(result, ensure_ascii=False) + "\n")
                text_fp.write(self._format_txt_line(idx, result))
                idx += 1
                time.sleep(0.5)
                result = self._fetch_draw(idx)
        return True

    def _rebuild_history(self, lottoHistoryFile):
        """Run a full crawl and report the outcome; return True on success."""
        try:
            rebuilt = self.craw(lottoHistoryFile)
            reason = self._last_fetch_error or "unknown"
        except Exception as ex:
            # Kept broad so a failed rebuild never crashes the caller.
            rebuilt = False
            reason = str(ex)
        if rebuilt:
            self._notify("[Lottery Crawler] full history rebuilt (no valid json).")
            return True
        msg = "[Lottery Crawler] full history rebuild failed: {}".format(reason)
        print(msg)
        self._notify(msg)
        return False

    def _fetch_missing_draws(self, draw_map, missing_nos):
        """Fetch ``missing_nos`` into ``draw_map`` (mutated in place).

        Stops after ``_MAX_CONSECUTIVE_FETCH_FAILURES`` failures in a row.
        Returns ``(added, failed, fail_reasons, aborted_count)``: the number
        of draws added, the failed draw numbers, a reason -> count dict and
        the number of draws not attempted because of the early stop.
        """
        added = 0
        failed = []
        fail_reasons = {}
        aborted_count = 0
        consecutive_failures = 0
        for position, no in enumerate(missing_nos):
            result = self._fetch_draw(no)
            if result is not None:
                draw_map[no] = result
                added += 1
                consecutive_failures = 0
                time.sleep(0.2)
                continue
            failed.append(no)
            reason = self._last_fetch_error or "unknown"
            fail_reasons[reason] = fail_reasons.get(reason, 0) + 1
            consecutive_failures += 1
            if consecutive_failures >= _MAX_CONSECUTIVE_FETCH_FAILURES:
                aborted_count = len(missing_nos) - position - 1
                break
        return added, failed, fail_reasons, aborted_count

    def excute(self, resource_path):
        """
        Synchronise ``<resource_path>/lotto_history.*`` up to last week's draw.

        - Rather than checking only "last draw + 1", every draw number from 1
          to the target is checked and the missing ones are fetched.
        - Duplicate / broken lines are dropped and json/txt are regenerated.

        When no valid history exists a full rebuild is attempted instead.
        Returns False when the API host cannot be reached or the full rebuild
        stored nothing; True otherwise, including runs where some draws
        failed (those are reported through the bot).
        """
        lottoHistoryFile = os.path.join(resource_path, "lotto_history")
        draw_map = self._read_draw_map_from_json(lottoHistoryFile + ".json")

        # Empty or unusable history: rebuild everything from draw 1.
        if not draw_map:
            return self._rebuild_history(lottoHistoryFile)

        # draw_map is non-empty here, so the estimate is never None.
        target_no = self._estimate_target_draw_no(draw_map)

        if not self._can_reach_lottery_host():
            msg = "[Lottery Crawler] network blocked: cannot reach dhlottery host."
            print(msg)
            self._notify(msg)
            return False

        missing_nos = [no for no in range(1, target_no + 1) if no not in draw_map]
        added, failed, fail_reasons, aborted_count = self._fetch_missing_draws(
            draw_map, missing_nos
        )

        # Always regenerate: this also removes duplicate / broken lines.
        self._write_draw_map_files(lottoHistoryFile, draw_map)

        last_no = max(draw_map)
        if failed:
            sample = ",".join(str(x) for x in failed[:10])
            reason_items = sorted(fail_reasons.items(), key=lambda x: x[1], reverse=True)
            reason_str = "; ".join(
                "{} x{}".format(reason, count) for reason, count in reason_items[:3]
            )
            if aborted_count:
                reason_str += " | aborted {} pending draws due to consecutive failures".format(
                    aborted_count
                )
            self._notify(
                "[Lottery Crawler] appended {}, failed {} draw(s): {}{} | {}".format(
                    added,
                    len(failed),
                    sample,
                    "..." if len(failed) > 10 else "",
                    reason_str or "no reason",
                )
            )
        elif added == 0:
            self._notify(
                "[Lottery Crawler] up to date (last drwNo={}, target={}).".format(
                    last_no, target_no
                )
            )
        else:
            self._notify(
                "[Lottery Crawler] appended {} draw(s), last drwNo={}, target={}.".format(
                    added, last_no, target_no
                )
            )
        return True

    # Spelling-compatible alias: existing code calls the misspelled excute.
    execute = excute

if __name__ == "__main__":
    # Script entry point: synchronise the history kept under ./resources.
    PROJECT_HOME = '.'

    # Create the crawler class and run one synchronisation pass.
    DataCrawler().excute(os.path.join(PROJECT_HOME, 'resources'))