# 실행 순서 * FilterFeature.py를 실행한다. * lotto_history.json을 읽어서 all_filter_[1-100].[cluster,csv,feature] 파일을 생성한다. * FilterFeatureCluster.py를 실행한다. * 첫수는 1~10까지만 허용한다. * random_state 전체 내 각 cluster에 대해서 당첨 회수를 파악하여 ./resources/cluster_win_info.csv 파일을 생성한다. * 생성 파일 * filtertest_1.csv: random_state 내 cluster 개수를 파악한다. * filtertest_2.csv: random_state 내 cluster 개수 별 전체 당첨 회수를 파악한다. * filtertest_3.csv: random_state 내 cluster 개수 별 최초 당첨 번호만 파악한다. * 실행할 random_state와 cluster 번호 파악 * filtertest_2.csv과 answer_pattern_analsys.xlsx을 이용하여 선별한다. * cluster_info.json 파일 업데이트 * 실행할 random_state와 cluster 번호를 json 형태로 등록한다. * Util_filegen.py 실행 * m1, amd, intel 컴퓨터에서 실행할 sh, bat 파일을 생성한다. * 파이썬 내에서 아래 두 부분만 수정하면 된다. * m1_file_max, amd_file_max, intel_file_max = 8,12,7 * m1_proc_limit, amd_proc_limit, intel_proc_limit = 124,125,110 * 각 장비에서 sh와 bat 파일 실행 ## Ruleset(임계값 설정) 기반으로 운영하기 `filter_model.BallFilter`의 주요 임계값(합/평균/앞3합/뒤3합/간격 등)을 **JSON ruleset**으로 외부화했습니다. 이제 “코드 수정 없이” ruleset 파일만 바꿔서 실험/튜닝을 자동화할 수 있습니다. - **기본 ruleset 경로**: `resources/rulesets/default.json` - **주의/한계**: 로또는 본질적으로 랜덤(독립/균등 가설)이며, ruleset은 “구매 조합 수를 줄이기 위한 필터”입니다. **당첨 보장/예측을 주장하지 않습니다.** ### valid 성능 확인 예시 ```bash python scripts/eval_filters.py \ --data valid \ --resources resources \ --ruleset resources/rulesets/default.json \ --start-no 801 --end-no 1000 \ --survivors-samples 0 ``` ### survivors(생존 조합 수) 근사 포함 예시 ```bash python scripts/eval_filters.py \ --data valid \ --resources resources \ --ruleset resources/rulesets/default.json \ --start-no 801 --end-no 1000 \ --survivors-samples 3000 ``` ## 자동 튜닝 → ruleset 생성 → 일괄 평가 파이프라인 ### 1) train 기반 자동 튜닝(후보 ruleset 생성) 아래 스크립트는 **train 구간에서만** 임계값을 랜덤 탐색으로 튜닝한 뒤, `resources/rulesets/`에 `Balanced.json`, `Coverage-First.json`을 저장합니다. ```bash python scripts/tune_ruleset.py \ --resources resources \ --base-ruleset resources/rulesets/default.json \ --out-dir resources/rulesets \ --train-start 21 --train-end 800 \ --hit-rate-min 0.01 \ --iters 200 \ --mc-samples 40000 ``` - **Coverage-First**: survivors(생존 조합 수) 최소화를 우선 - **Balanced**: survivors를 줄이되 hit-rate도 함께 고려 > 주의: survivors는 전수(8,145,060조합) 대신 **풀링 Monte Carlo**로 근사하므로 오차가 있습니다. ### 2) valid/train 구간에서 ruleset 일괄 평가 ```bash python scripts/eval_rulesets.py \ --resources resources \ --rulesets-dir resources/rulesets \ --data valid \ --start-no 801 --end-no 1000 \ --survivors-samples 0 ``` # Query ```SQL ##### 전체 진행 확인 ##### with source_count as ( select source, count(*) as source_count from cluster_info where priority not in (99) and source in (1,3) group by 1 ), ball_count as ( # 1) random_state, cluster 별 추천볼 개수 select source, random_state, cluster, ball_cnt from ( SELECT source, random_state, cluster, count(*) as ball_cnt from recommend_ball where no=1136 and b1 > 0 group by 1,2,3 union all SELECT source, random_state, cluster, 0 as ball_cnt from recommend_ball where no=1136 and b1 = 0 group by 1,2,3 ) lj ), source_rc_cluster_list as ( select ci.source, ci.random_state, ci.cluster, ci.cluster_count, ci.win_count, ci.priority, rc.source_count, bc.ball_cnt from cluster_info ci left join source_count rc on ci.source = rc.source left join ball_count bc on ci.source = bc.source and ci.random_state = bc.random_state and ci.cluster = bc.cluster where ci.priority not in (99) and ci.source in (1,3) ), source_process as ( select source, "done" as type, count(*) as cnt from source_rc_cluster_list where ball_cnt is not NULL group by 1,2 union all select source, "yet" as type, count(*) as cnt from source_rc_cluster_list where ball_cnt is NULL group by 1,2 ) select source, type, cnt, case when source=1 then concat(round(100.0 * cnt / (select source_count from source_count where source=1),2), '%') when source=3 then concat(round(100.0 * cnt / (select source_count from source_count where source=3),2), '%') end as rate from source_process order by 1,2 ; ### 미진행 클러스터 확인 ### SELECT ci.source, ci.random_state, ci.cluster, lj.cnt from cluster_info ci left join (select source, random_state, cluster, count(*) as cnt from recommend_ball rb where no=1136 group by 1,2,3) lj on ci.source=lj.source and ci.random_state=lj.random_state and ci.cluster=lj.cluster where priority not in (99) and lj.cnt is null order by 1,2,3 ; ##### 전체 cluster 확인 ##### with raw_data as ( select rb.source, ci.priority, rb.random_state, rb.cluster, ci.cluster_count, ci.win_count, b1, count(*) as ball_cnt from recommend_ball rb left join cluster_info ci on rb.source=ci.source and rb.random_state = ci.random_state and rb.cluster = ci.cluster where no=1136 group by 1,2,3,4,5,6,7 ), all_cluster as ( select source, priority, random_state, cluster, ball_cnt from raw_data where ( (source = 1 and priority in (1,2)) or (source = 3 and priority in (1,2)) ) group by 1,2,3,4 ), valid_total_cluster as ( select source, priority, random_state, cluster, ball_cnt from raw_data where ( (source = 1 and priority = 1 and ball_cnt BETWEEN 50 and 80 ) or (source = 1 and priority = 2 and ( win_count = 12 and ball_cnt BETWEEN 50 and 80) ) or (source = 3 and priority = 1 and (ball_cnt BETWEEN 1 and 30 or ball_cnt BETWEEN 50 and 100) ) or (source = 3 and priority = 2 and ( win_count=13 and (ball_cnt BETWEEN 1 and 30 or ball_cnt BETWEEN 50 and 100)) ) or (source = 1 and ((win_count between 5 and 10) and ball_cnt BETWEEN 1 and 20) ) ) group by 1,2,3,4 ), valid_none_0_cluster as ( select source, priority, random_state, cluster, ball_cnt from raw_data where b1 <> 0 AND ( (source = 1 and priority = 1 and ball_cnt BETWEEN 50 and 80 ) or (source = 1 and priority = 2 and ( win_count = 12 and ball_cnt BETWEEN 50 and 80) ) or (source = 3 and priority = 1 and (ball_cnt BETWEEN 1 and 30 or ball_cnt BETWEEN 50 and 100) ) or (source = 3 and priority = 2 and ( win_count=13 and (ball_cnt BETWEEN 1 and 30 or ball_cnt BETWEEN 50 and 100)) ) or (source = 1 and ((win_count between 5 and 10) and ball_cnt BETWEEN 1 and 20) ) ) group by 1,2,3,4 ) # 전체 클러스터 개수 select 1 as col, count(*) from all_cluster union all # 조건에 해당하는 클러스터 개수 select 2 as col, count(*) from valid_total_cluster union all # 조건에 해당하는 클러스터 중 추천이 0이 아닌 유효한 클러스터 개수 select 3 as col, count(*) from valid_none_0_cluster ; ##### 가장 많은 추천볼 ##### select b1,b2,b3,b4,b5,b6,count(*) as ball_cnt from recommend_ball where no=1136 and b1>0 group by 1,2,3,4,5,6 order by 7 desc; ##### 추천볼 분석 ##### with priority as ( select source, random_state, cluster, cluster_count, win_count, priority from cluster_info where priority not in (99) ), recommend as ( select source, random_state, cluster, b1,b2,b3,b4,b5,b6 from recommend_ball where b1 > 0 and no=1136 ), recommend_count as ( select source, random_state, cluster, count(*) as ball_cnt from recommend_ball where b1 > 0 and no=1136 group by 1,2,3 ), raw_data as ( select r.source, r.random_state, r.cluster, p.cluster_count, p.win_count, p.priority, r.b1,r.b2,r.b3,r.b4,r.b5,r.b6, rc.ball_cnt from recommend r left join priority p on r.source=p.source and r.random_state=p.random_state and r.cluster=p.cluster left join recommend_count rc on r.source=rc.source and r.random_state=rc.random_state and r.cluster=rc.cluster ), candidate as ( select source, random_state, cluster, cluster_count, win_count, priority, b1,b2,b3,b4,b5,b6, ball_cnt from raw_data where ( (source = 0 and b1=7) or (source = 1 and priority=-1 and ball_cnt<=140 and ( b1 not in (13, 19, 28) and b2 not in (13, 19, 28) and b3 not in (13, 19, 28) and b4 not in (13, 19, 28) and b5 not in (13, 19, 28) and b6 not in (13, 19, 28) ) ) or (source = 3 and priority=-1 and ball_cnt<=150 and ( b1 not in (13, 19, 28) and b2 not in (13, 19, 28) and b3 not in (13, 19, 28) and b4 not in (13, 19, 28) and b5 not in (13, 19, 28) and b6 not in (13, 19, 28) ) ) ) ) #select source, random_state,cluster,b1,b2,b3,b4,b5,b6 from candidate order by 4,5,6,7,8,9; , duplication as ( # 히스트업한 추천볼에서 중복을 제거함34 select source, random_state, cluster, cluster_count, win_count, priority, b1,b2,b3,b4,b5,b6, ball_cnt from ( select source, random_state, cluster, cluster_count, win_count, priority, b1,b2,b3,b4,b5,b6, ball_cnt, ROW_NUMBER() OVER(PARTITION BY b1,b2,b3,b4,b5,b6 ORDER BY b1,b2,b3,b4,b5,b6) AS rnk from candidate ) a where rnk=1 order by source,random_state,cluster,b1,b2,b3,b4,b5,b6 ) select count(*) as cnt from duplication; #select source, priority, random_state, cluster, win_count, count(*) as cnt from duplication group by 1,2,3; #select b1, count(*) as ball_cnt from duplication group by 1 #select b6, count(*) as ball_cnt from duplication group by 1 #select source,random_state,cluster,b1,b2,b3,b4,b5,b6 from duplication order by 4,5,6,7,8,9; ```