""" GT 타점 MTF 프로필 반복 보강 — 스냅샷 recall·총자산 비율 90% 목표. """ from __future__ import annotations import json from pathlib import Path from typing import Any import numpy as np import pandas as pd from config import ( GENERAL_ANALYSIS_INTERVALS, MATCH_PROFILE_MIN_SAMPLES, MATCH_PROFILE_MIN_SEPARATION, ) from deepcoin.analysis.general_analysis_core import interval_tf_prefix from deepcoin.matching.config import ANALYSIS_TRADES_CSV from deepcoin.matching.gt_asset_calibration import ( evaluate_gt_snapshot_recall, portfolio_asset_ratio, ) from deepcoin.matching.gt_mtf_profile import ( analyze_gt_mtf_profile, discover_profile_columns, ) from deepcoin.matching.profile_rules import ( _condition_from_series, _feature_separation, build_rule_candidates, ) from deepcoin.matching.rule_eval import eval_rule_mask from deepcoin.paths import ( ANALYSIS_GT_CALIBRATION_JSON, ANALYSIS_GT_MTF_PROFILE_JSON, resolve_ground_truth_file, ) from deepcoin.ground_truth.ground_truth import load_ground_truth def _condition_or_group( series: pd.Series, side: str, quantile_lo: float = 0.15, quantile_hi: float = 0.85, ) -> dict[str, Any] | None: """ 한 컬럼 GT 분포에서 between 조건. Args: series: side GT 값. side: buy | sell. quantile_lo: 하한 분위. quantile_hi: 상한 분위. Returns: 조건 dict. """ col_name = series.name if series.dtype == object or not pd.api.types.is_numeric_dtype(series): mode = series.dropna().astype(str).mode() if mode.empty: return None return {"col": col_name, "op": "eq", "value": str(mode.iloc[0])} s = pd.to_numeric(series, errors="coerce").dropna() if len(s) < MATCH_PROFILE_MIN_SAMPLES: return None lo = float(s.quantile(quantile_lo)) hi = float(s.quantile(quantile_hi)) if lo >= hi: return None return {"col": col_name, "op": "between", "lo": lo, "hi": hi} def build_or_tf_rules( buy: pd.DataFrame, sell: pd.DataFrame, ranked_cols: list[str], *, per_tf: int = 4, ) -> list[dict[str, Any]]: """ TF별 OR 복합 규칙 (해당 TF 상위 분리 컬럼 중 하나만 충족). Args: buy: 매수 GT. sell: 매도 GT. ranked_cols: 분리도 순 컬럼. per_tf: TF당 OR 조건 수. Returns: rule dict 리스트. """ rules: list[dict[str, Any]] = [] for side, subset in (("buy", buy), ("sell", sell)): for iv in GENERAL_ANALYSIS_INTERVALS: pfx = interval_tf_prefix(iv) iv_cols = [ c for c in ranked_cols if c.startswith(f"{pfx}_") and c in subset.columns ] iv_cols = sorted( iv_cols, key=lambda c: _feature_separation(buy, sell, c), reverse=True, )[:per_tf] conds: list[dict[str, Any]] = [] for col in iv_cols: c = _condition_or_group(subset[col], side, 0.20, 0.80) if c: conds.append(c) if len(conds) >= 2 and pfx not in ("m240",): rules.append( { "rule_id": f"{side}_or_{pfx}", "side": side, "kind": "or_tf", "logic": "or", "conditions": conds, } ) return rules def build_unmatched_atomic_rules( trades_df: pd.DataFrame, rules: list[dict[str, Any]], side: str, *, max_new: int = 12, ) -> list[dict[str, Any]]: """ 스냅샷 미매칭 GT 행에서 분리도 큰 컬럼 atomic 규칙 추가. Args: trades_df: 03b CSV. rules: 기존 규칙. side: buy | sell. Returns: 신규 atomic rule dict. """ gt = trades_df[trades_df["action"] == side] buy_all = trades_df[trades_df["action"] == "buy"] sell_all = trades_df[trades_df["action"] == "sell"] side_rules = [r for r in rules if r.get("side") == side] unmatched_idx: list[int] = [] for idx, row in gt.iterrows(): fr = pd.DataFrame([row]) if not any(bool(eval_rule_mask(fr, r).iloc[0]) for r in side_rules): unmatched_idx.append(idx) if not unmatched_idx: return [] unmatched = gt.loc[unmatched_idx] matched = gt.drop(index=unmatched_idx, errors="ignore") other = sell_all if side == "buy" else buy_all cols = discover_profile_columns(trades_df) scores: list[tuple[float, str]] = [] for col in cols: if col not in unmatched.columns: continue if not pd.api.types.is_numeric_dtype(unmatched[col]): continue u = pd.to_numeric(unmatched[col], errors="coerce").dropna() m = pd.to_numeric(matched[col], errors="coerce").dropna() if len(matched) >= 5 else pd.to_numeric(gt[col], errors="coerce").dropna() o = pd.to_numeric(other[col], errors="coerce").dropna() if len(u) < 3 or len(o) < 5: continue sep = abs(float(u.mean() - o.mean())) / (np.sqrt((u.var() + o.var()) / 2) + 1e-9) scores.append((sep, col)) scores.sort(reverse=True) new_rules: list[dict[str, Any]] = [] existing_cols = { c["col"] for r in rules if r.get("side") == side for c in r.get("conditions", []) } for sep, col in scores[: max_new * 3]: if col in existing_cols: continue if sep < MATCH_PROFILE_MIN_SEPARATION * 0.5: continue cond = _condition_from_series(unmatched[col], side) if cond is None: cond = _condition_or_group(unmatched[col], side, 0.10, 0.90) if cond is None: continue rid = f"{side}_cal_{col}" new_rules.append( { "rule_id": rid, "side": side, "kind": "calibration_atomic", "logic": "and", "conditions": [cond], "profile_col": col, "calibration_sep": round(sep, 4), } ) existing_cols.add(col) if len(new_rules) >= max_new: break return new_rules def _feature_separation_df( buy: pd.DataFrame, sell: pd.DataFrame, col: str, ) -> float: """DataFrame 컬럼 분리도.""" if col not in buy.columns: return 0.0 a = pd.to_numeric(buy[col], errors="coerce").dropna() b = pd.to_numeric(sell[col], errors="coerce").dropna() if len(a) < 5 or len(b) < 5: return 0.0 pooled = np.sqrt((a.var() + b.var()) / 2) if pooled < 1e-9: return abs(float(a.mean() - b.mean())) return abs(float(a.mean() - b.mean())) / pooled def run_profile_calibration_loop( trades_csv: Path | None = None, *, target_recall: float = 0.90, target_asset_ratio: float = 0.90, max_iterations: int = 5, ) -> dict[str, Any]: """ 03b·GT 기준 반복 규칙 보강 및 검증. Args: trades_csv: 03b CSV. target_recall: 매수·매도 스냅샷 recall 목표. target_asset_ratio: GT 총자산 대비 subset 비율 목표. max_iterations: 최대 반복. Returns: calibration 리포트 dict. """ path = trades_csv or ANALYSIS_TRADES_CSV df = pd.read_csv(path) buy = df[df["action"] == "buy"] sell = df[df["action"] == "sell"] analysis = analyze_gt_mtf_profile(df) ANALYSIS_GT_MTF_PROFILE_JSON.parent.mkdir(parents=True, exist_ok=True) ANALYSIS_GT_MTF_PROFILE_JSON.write_text( json.dumps(analysis, ensure_ascii=False, indent=2), encoding="utf-8", ) numeric_ranked = sorted( [ f["col"] for f in analysis["features"] if f["dtype"] == "numeric" ], key=lambda c: next( (x["separation"] for x in analysis["global_top_separation"] if x["col"] == c), _feature_separation_df(buy, sell, c), ), reverse=True, ) base = build_rule_candidates(path) rules: list[dict[str, Any]] = list(base.get("rules", [])) for r in rules: if "logic" not in r: r["logic"] = "and" rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:80])) history: list[dict[str, Any]] = [] best_rules: list[dict[str, Any]] = list(rules) best_asset_ratio = -1.0 gt_data = load_ground_truth(resolve_ground_truth_file()) or {} gt_trades = gt_data.get("trades") or [] mark = (gt_data.get("summary") or {}).get("mark_price") for it in range(max_iterations): recall = evaluate_gt_snapshot_recall(df, rules) buy_rec = recall["buy"]["recall"] sell_rec = recall["sell"]["recall"] buy_legs = {int(t["leg_id"]) for t in gt_trades if t["action"] == "buy"} sell_legs = {int(t["leg_id"]) for t in gt_trades if t["action"] == "sell"} all_legs = buy_legs | sell_legs included_legs = set() gt_df = pd.DataFrame(gt_trades) for lid in all_legs: leg = gt_df[gt_df["leg_id"] == lid] leg_buy_ok = True leg_sell_ok = True for _, row in leg[leg["action"] == "buy"].iterrows(): sub = df[(df["dt"] == row["dt"]) & (df["action"] == "buy")] if sub.empty: leg_buy_ok = False break fr = pd.DataFrame([sub.iloc[0]]) if not any( bool(eval_rule_mask(fr, r).iloc[0]) for r in rules if r.get("side") == "buy" ): leg_buy_ok = False break for _, row in leg[leg["action"] == "sell"].iterrows(): sub = df[(df["dt"] == row["dt"]) & (df["action"] == "sell")] if sub.empty: leg_sell_ok = False break fr = pd.DataFrame([sub.iloc[0]]) if not any( bool(eval_rule_mask(fr, r).iloc[0]) for r in rules if r.get("side") == "sell" ): leg_sell_ok = False break if leg_buy_ok and leg_sell_ok: included_legs.add(int(lid)) asset = portfolio_asset_ratio(gt_trades, included_legs, mark) row_hist = { "iteration": it, "rule_count": len(rules), "buy_recall": buy_rec, "sell_recall": sell_rec, **asset, } history.append(row_hist) print( f"[cal {it}] rules={len(rules)} " f"buy_rec={buy_rec:.2%} sell_rec={sell_rec:.2%} " f"asset_ratio={asset['asset_ratio']:.2%} legs={asset['legs_covered']}/{asset['legs_total']}" ) if asset["asset_ratio"] > best_asset_ratio: best_asset_ratio = asset["asset_ratio"] best_rules = list(rules) if ( buy_rec >= target_recall and sell_rec >= target_recall and asset["asset_ratio"] >= target_asset_ratio ): break added = 0 for side in ("buy", "sell"): rec = recall[side]["recall"] if rec >= target_recall: continue new_rules = build_unmatched_atomic_rules(df, rules, side, max_new=15) rules.extend(new_rules) added += len(new_rules) if added == 0: rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:120])) for side in ("buy", "sell"): rules.extend( build_unmatched_atomic_rules(df, rules, side, max_new=20) ) if len(rules) > 200: break final_recall = evaluate_gt_snapshot_recall(df, rules) final_legs: set[int] = set() gt_df = pd.DataFrame(gt_trades) for lid in gt_df["leg_id"].unique(): leg = gt_df[gt_df["leg_id"] == lid] ok_b = ok_s = True for _, row in leg[leg["action"] == "buy"].iterrows(): sub = df[(df["dt"] == row["dt"]) & (df["action"] == "buy")] if sub.empty or not any( bool(eval_rule_mask(pd.DataFrame([sub.iloc[0]]), r).iloc[0]) for r in rules if r.get("side") == "buy" ): ok_b = False for _, row in leg[leg["action"] == "sell"].iterrows(): sub = df[(df["dt"] == row["dt"]) & (df["action"] == "sell")] if sub.empty or not any( bool(eval_rule_mask(pd.DataFrame([sub.iloc[0]]), r).iloc[0]) for r in rules if r.get("side") == "sell" ): ok_s = False if ok_b and ok_s: final_legs.add(int(lid)) final_asset = portfolio_asset_ratio(gt_trades, final_legs, mark) out = { "target_recall": target_recall, "target_asset_ratio": target_asset_ratio, "iterations": history, "final": { "rule_count": len(rules), "snapshot_recall": final_recall, "portfolio": final_asset, "targets_met": ( final_recall["buy"]["recall"] >= target_recall and final_recall["sell"]["recall"] >= target_recall and final_asset["asset_ratio"] >= target_asset_ratio ), }, "calibrated_rules": rules, } deduped: list[dict[str, Any]] = [] seen_rid: set[str] = set() for r in best_rules: rid = r.get("rule_id", "") if rid in seen_rid: continue seen_rid.add(rid) deduped.append(r) rules = _greedy_recall_cover(df, deduped, target_recall=target_recall) out["final"]["rule_count_after_greedy"] = len(rules) out["calibrated_rules"] = rules out["final"]["snapshot_recall"] = evaluate_gt_snapshot_recall(df, rules) final_legs_g: set[int] = set() gt_df = pd.DataFrame(gt_trades) for lid in gt_df["leg_id"].unique(): leg = gt_df[gt_df["leg_id"] == lid] ok_b = ok_s = True for _, row in leg[leg["action"] == "buy"].iterrows(): sub = df[(df["dt"] == row["dt"]) & (df["action"] == "buy")] if sub.empty or not any( bool(eval_rule_mask(pd.DataFrame([sub.iloc[0]]), r).iloc[0]) for r in rules if r.get("side") == "buy" ): ok_b = False for _, row in leg[leg["action"] == "sell"].iterrows(): sub = df[(df["dt"] == row["dt"]) & (df["action"] == "sell")] if sub.empty or not any( bool(eval_rule_mask(pd.DataFrame([sub.iloc[0]]), r).iloc[0]) for r in rules if r.get("side") == "sell" ): ok_s = False if ok_b and ok_s: final_legs_g.add(int(lid)) out["final"]["portfolio"] = portfolio_asset_ratio( gt_trades, final_legs_g, mark ) fr = out["final"]["snapshot_recall"] pa = out["final"]["portfolio"] out["final"]["targets_met"] = ( fr["buy"]["recall"] >= target_recall and fr["sell"]["recall"] >= target_recall and pa["asset_ratio"] >= target_asset_ratio ) ANALYSIS_GT_CALIBRATION_JSON.parent.mkdir(parents=True, exist_ok=True) ANALYSIS_GT_CALIBRATION_JSON.write_text( json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8", ) return out def _greedy_recall_cover( trades_df: pd.DataFrame, rules: list[dict[str, Any]], *, target_recall: float = 0.90, max_per_side: int = 40, ) -> list[dict[str, Any]]: """ 측면별 recall 목표까지 greedy로 규칙 축소. Args: trades_df: 03b CSV. rules: 후보 규칙 전체. target_recall: 목표 recall. Returns: 축소된 규칙 + 기존 compound/mtf_cross 유지. """ keep_kinds = { "compound_tight", "compound", "contrast", "mtf_cross", "or_tf", } kept = [r for r in rules if r.get("kind") in keep_kinds] pool = [r for r in rules if r not in kept] for side in ("buy", "sell"): gt = trades_df[trades_df["action"] == side] if gt.empty: continue uncovered = set(gt.index) side_pool = [r for r in pool if r.get("side") == side] picked: list[dict[str, Any]] = [] while uncovered and len(picked) < max_per_side: best_rule = None best_new = 0 for rule in side_pool: if rule in picked: continue new_hit = 0 for idx in list(uncovered): row = gt.loc[idx] if bool(eval_rule_mask(pd.DataFrame([row]), rule).iloc[0]): new_hit += 1 if new_hit > best_new: best_new = new_hit best_rule = rule if best_rule is None or best_new == 0: break picked.append(best_rule) still = set() for idx in uncovered: row = gt.loc[idx] if not any( bool(eval_rule_mask(pd.DataFrame([row]), r).iloc[0]) for r in picked + [x for x in kept if x.get("side") == side] ): still.add(idx) uncovered = still rec = 1.0 - len(uncovered) / len(gt) if rec >= target_recall: break kept.extend(picked) return kept