GT MTF 프로필·캘리브레이션과 04 매칭/시뮬/실거래 파이프라인을 추가한다.

3분~일봉 GT 타점 분석(03c), leg 체결 순서 수정, 총자산 90% 검증 루프, walk-forward Go/No-Go 시뮬, monitor·live_trader 및 reference 문서를 포함한다.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-31 11:27:50 +09:00
parent b52d61b777
commit 2cb67c42b3
47 changed files with 5956 additions and 209 deletions
--- a/deepcoin/matching/gt_profile_iterate.py
+++ b/deepcoin/matching/gt_profile_iterate.py
@@ -0,0 +1,539 @@
+"""
+Iteratively reinforce the GT (ground-truth) trade-point MTF profile rules.
+
+The calibration loop in this module aims, by default, at 90% snapshot recall
+and a 90% total-asset ratio.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+
+from config import (
+    GENERAL_ANALYSIS_INTERVALS,
+    MATCH_PROFILE_MIN_SAMPLES,
+    MATCH_PROFILE_MIN_SEPARATION,
+)
+from deepcoin.analysis.general_analysis_core import interval_tf_prefix
+from deepcoin.matching.config import ANALYSIS_TRADES_CSV
+from deepcoin.matching.gt_asset_calibration import (
+    evaluate_gt_snapshot_recall,
+    portfolio_asset_ratio,
+)
+from deepcoin.matching.gt_mtf_profile import (
+    analyze_gt_mtf_profile,
+    discover_profile_columns,
+)
+from deepcoin.matching.profile_rules import (
+    _condition_from_series,
+    _feature_separation,
+    build_rule_candidates,
+)
+from deepcoin.matching.rule_eval import eval_rule_mask
+from deepcoin.paths import (
+    ANALYSIS_GT_CALIBRATION_JSON,
+    ANALYSIS_GT_MTF_PROFILE_JSON,
+    resolve_ground_truth_file,
+)
+from deepcoin.ground_truth.ground_truth import load_ground_truth
+
+
+def _condition_or_group(
+    series: pd.Series,
+    side: str,
+    quantile_lo: float = 0.15,
+    quantile_hi: float = 0.85,
+) -> dict[str, Any] | None:
+    """
+    Derive a single rule condition from one column of GT values.
+
+    Non-numeric columns yield an ``eq`` condition on the most frequent value
+    (compared as a string); numeric columns yield a ``between`` condition
+    spanning the requested quantile band.
+
+    Args:
+        series: GT values of one column; its ``name`` becomes the ``col`` key.
+        side: buy | sell. Accepted for call-site symmetry; not read here.
+        quantile_lo: Lower quantile of the band.
+        quantile_hi: Upper quantile of the band.
+
+    Returns:
+        A condition dict, or None when there is no usable value, too few
+        numeric samples, or the band is empty (lower bound >= upper bound).
+    """
+    is_categorical = series.dtype == object or not pd.api.types.is_numeric_dtype(
+        series
+    )
+    if is_categorical:
+        labels = series.dropna().astype(str).mode()
+        if labels.empty:
+            return None
+        # mode() may return several values on a tie; the first one is used.
+        return {"col": series.name, "op": "eq", "value": str(labels.iloc[0])}
+
+    values = pd.to_numeric(series, errors="coerce").dropna()
+    if len(values) < MATCH_PROFILE_MIN_SAMPLES:
+        return None
+
+    lower = float(values.quantile(quantile_lo))
+    upper = float(values.quantile(quantile_hi))
+    if lower >= upper:
+        return None
+    return {"col": series.name, "op": "between", "lo": lower, "hi": upper}
+
+
+def build_or_tf_rules(
+    buy: pd.DataFrame,
+    sell: pd.DataFrame,
+    ranked_cols: list[str],
+    *,
+    per_tf: int = 4,
+) -> list[dict[str, Any]]:
+    """
+    Build one OR-composite rule per side and timeframe.
+
+    For each timeframe prefix, the candidate columns are re-sorted by
+    buy/sell separation, the top ``per_tf`` are turned into conditions over
+    the 20%-80% quantile band, and a rule is emitted when at least two
+    conditions could be built. The rule is tagged ``logic: "or"``.
+
+    Args:
+        buy: Buy-side GT rows.
+        sell: Sell-side GT rows.
+        ranked_cols: Candidate column names (callers pass them ranked by
+            separation).
+        per_tf: Maximum number of columns considered per timeframe.
+
+    Returns:
+        List of rule dicts with ``kind == "or_tf"``.
+    """
+    built: list[dict[str, Any]] = []
+    frames = {"buy": buy, "sell": sell}
+    for side, frame in frames.items():
+        for interval in GENERAL_ANALYSIS_INTERVALS:
+            prefix = interval_tf_prefix(interval)
+            marker = f"{prefix}_"
+            candidates = [
+                name
+                for name in ranked_cols
+                if name.startswith(marker) and name in frame.columns
+            ]
+            candidates.sort(
+                key=lambda name: _feature_separation(buy, sell, name),
+                reverse=True,
+            )
+
+            conditions: list[dict[str, Any]] = []
+            for name in candidates[:per_tf]:
+                condition = _condition_or_group(frame[name], side, 0.20, 0.80)
+                if condition:
+                    conditions.append(condition)
+
+            # The "m240" timeframe never gets an OR rule.
+            if len(conditions) < 2 or prefix in ("m240",):
+                continue
+            built.append(
+                {
+                    "rule_id": f"{side}_or_{prefix}",
+                    "side": side,
+                    "kind": "or_tf",
+                    "logic": "or",
+                    "conditions": conditions,
+                }
+            )
+    return built
+
+
+def build_unmatched_atomic_rules(
+    trades_df: pd.DataFrame,
+    rules: list[dict[str, Any]],
+    side: str,
+    *,
+    max_new: int = 12,
+) -> list[dict[str, Any]]:
+    """
+    Propose single-condition rules for GT rows that no existing rule matches.
+
+    Rows of ``side`` that satisfy none of the current same-side rules are
+    collected; every numeric profile column is then scored by the
+    standardized mean gap between those unmatched rows and the opposite side,
+    and the best-scoring columns not already referenced by a same-side rule
+    become new ``calibration_atomic`` rules.
+
+    Args:
+        trades_df: 03b trades CSV content; must contain an ``action`` column.
+        rules: Rules already in play (all sides).
+        side: buy | sell.
+        max_new: Maximum number of rules to return. Only the top
+            ``max_new * 3`` scored columns are examined.
+
+    Returns:
+        New atomic rule dicts; empty when every GT row is already matched.
+    """
+    gt = trades_df[trades_df["action"] == side]
+    opposite = "sell" if side == "buy" else "buy"
+    other = trades_df[trades_df["action"] == opposite]
+    side_rules = [r for r in rules if r.get("side") == side]
+
+    unmatched_idx: list[int] = []
+    for idx, row in gt.iterrows():
+        # Rules are evaluated on a one-row frame, one GT row at a time.
+        frame = pd.DataFrame([row])
+        if not any(bool(eval_rule_mask(frame, r).iloc[0]) for r in side_rules):
+            unmatched_idx.append(idx)
+
+    if not unmatched_idx:
+        return []
+
+    unmatched = gt.loc[unmatched_idx]
+
+    scores: list[tuple[float, str]] = []
+    for col in discover_profile_columns(trades_df):
+        if col not in unmatched.columns:
+            continue
+        if not pd.api.types.is_numeric_dtype(unmatched[col]):
+            continue
+        u = pd.to_numeric(unmatched[col], errors="coerce").dropna()
+        o = pd.to_numeric(other[col], errors="coerce").dropna()
+        if len(u) < 3 or len(o) < 5:
+            continue
+        # Mean gap over the pooled standard deviation; the epsilon keeps
+        # constant columns from dividing by zero.
+        sep = abs(float(u.mean() - o.mean())) / (
+            np.sqrt((u.var() + o.var()) / 2) + 1e-9
+        )
+        # A non-finite score (e.g. from inf values) would slip past the
+        # threshold check below and end up as NaN in the rule output.
+        if not np.isfinite(sep):
+            continue
+        scores.append((sep, col))
+
+    scores.sort(reverse=True)
+    existing_cols = {
+        c["col"]
+        for r in rules
+        if r.get("side") == side
+        for c in r.get("conditions", [])
+    }
+    min_sep = MATCH_PROFILE_MIN_SEPARATION * 0.5
+    new_rules: list[dict[str, Any]] = []
+    for sep, col in scores[: max_new * 3]:
+        if col in existing_cols or sep < min_sep:
+            continue
+        cond = _condition_from_series(unmatched[col], side)
+        if cond is None:
+            # Fall back to a wide 10%-90% band when the primary builder
+            # yields nothing.
+            cond = _condition_or_group(unmatched[col], side, 0.10, 0.90)
+        if cond is None:
+            continue
+        new_rules.append(
+            {
+                "rule_id": f"{side}_cal_{col}",
+                "side": side,
+                "kind": "calibration_atomic",
+                "logic": "and",
+                "conditions": [cond],
+                "profile_col": col,
+                "calibration_sep": round(sep, 4),
+            }
+        )
+        existing_cols.add(col)
+        if len(new_rules) >= max_new:
+            break
+    return new_rules
+
+
+def _feature_separation_df(
+    buy: pd.DataFrame,
+    sell: pd.DataFrame,
+    col: str,
+) -> float:
+    """
+    Score how far apart the buy and sell distributions of one column are.
+
+    The score is the absolute mean difference divided by the pooled standard
+    deviation. When the pooled deviation is effectively zero the raw mean
+    difference is returned instead.
+
+    Args:
+        buy: Buy-side rows.
+        sell: Sell-side rows.
+        col: Column to score.
+
+    Returns:
+        Separation score; 0.0 when the column is missing from either frame
+        or either side has fewer than 5 numeric values.
+    """
+    # Check both frames: a column absent from only one side used to raise.
+    if col not in buy.columns or col not in sell.columns:
+        return 0.0
+    a = pd.to_numeric(buy[col], errors="coerce").dropna()
+    b = pd.to_numeric(sell[col], errors="coerce").dropna()
+    if len(a) < 5 or len(b) < 5:
+        return 0.0
+    gap = abs(float(a.mean() - b.mean()))
+    pooled = float(np.sqrt((a.var() + b.var()) / 2))
+    if pooled < 1e-9:
+        return gap
+    return gap / pooled
+
+
+def _gt_trade_matched(
+    df: pd.DataFrame,
+    dt: Any,
+    side: str,
+    side_rules: list[dict[str, Any]],
+) -> bool:
+    """Return True when the snapshot row of one GT trade satisfies any rule."""
+    sub = df[(df["dt"] == dt) & (df["action"] == side)]
+    if sub.empty:
+        return False
+    # Only the first snapshot row sharing this dt/action is evaluated.
+    frame = pd.DataFrame([sub.iloc[0]])
+    return any(bool(eval_rule_mask(frame, r).iloc[0]) for r in side_rules)
+
+
+def _matched_leg_ids(
+    df: pd.DataFrame,
+    gt_trades: list[dict[str, Any]],
+    rules: list[dict[str, Any]],
+) -> set[int]:
+    """
+    Collect the GT legs whose every buy and sell trade is matched by a rule.
+
+    Args:
+        df: Snapshot rows (03b CSV) with ``dt`` and ``action`` columns.
+        gt_trades: GT trade dicts carrying ``leg_id``, ``action`` and ``dt``.
+        rules: Rules to test; each trade is checked against same-side rules.
+
+    Returns:
+        Leg ids (as int) that are fully matched; empty when there are no GT
+        trades.
+    """
+    if not gt_trades:
+        # An empty trade list has no "leg_id" column to look up.
+        return set()
+    gt_df = pd.DataFrame(gt_trades)
+    rules_by_side = {
+        side: [r for r in rules if r.get("side") == side]
+        for side in ("buy", "sell")
+    }
+    matched: set[int] = set()
+    # NOTE(review): a leg with no buy/sell trade at all counts as matched
+    # vacuously, as it did in the original final tally - confirm intended.
+    for lid in gt_df["leg_id"].unique():
+        leg = gt_df[gt_df["leg_id"] == lid]
+        leg_ok = True
+        for side, side_rules in rules_by_side.items():
+            for _, row in leg[leg["action"] == side].iterrows():
+                if not _gt_trade_matched(df, row["dt"], side, side_rules):
+                    leg_ok = False
+                    break
+            if not leg_ok:
+                break
+        if leg_ok:
+            matched.add(int(lid))
+    return matched
+
+
+def run_profile_calibration_loop(
+    trades_csv: Path | None = None,
+    *,
+    target_recall: float = 0.90,
+    target_asset_ratio: float = 0.90,
+    max_iterations: int = 5,
+) -> dict[str, Any]:
+    """
+    Iteratively grow and then prune the rule set against the 03b CSV and GT.
+
+    Each iteration measures snapshot recall and the asset ratio of the fully
+    matched legs, then adds rules for sides still below the recall target.
+    Afterwards the best rule set seen is de-duplicated by ``rule_id`` and
+    reduced by ``_greedy_recall_cover``; the reported final metrics describe
+    that reduced set.
+
+    Side effects: writes the MTF profile analysis and the calibration report
+    as JSON files and prints one progress line per iteration.
+
+    Args:
+        trades_csv: 03b trades CSV; defaults to ``ANALYSIS_TRADES_CSV``.
+        target_recall: Snapshot recall target applied to buy and sell alike.
+        target_asset_ratio: Target ratio of matched-leg assets to GT assets.
+        max_iterations: Maximum number of evaluate-and-grow rounds.
+
+    Returns:
+        Calibration report dict (targets, per-iteration history, final
+        metrics and the calibrated rules).
+    """
+    path = trades_csv or ANALYSIS_TRADES_CSV
+    df = pd.read_csv(path)
+    buy = df[df["action"] == "buy"]
+    sell = df[df["action"] == "sell"]
+
+    analysis = analyze_gt_mtf_profile(df)
+    ANALYSIS_GT_MTF_PROFILE_JSON.parent.mkdir(parents=True, exist_ok=True)
+    ANALYSIS_GT_MTF_PROFILE_JSON.write_text(
+        json.dumps(analysis, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+
+    # First entry per column wins, mirroring a first-match lookup.
+    top_entries: dict[str, dict[str, Any]] = {}
+    for entry in analysis["global_top_separation"]:
+        top_entries.setdefault(entry["col"], entry)
+
+    def _rank_key(col: str) -> Any:
+        # Compute the fallback separation only for columns that are absent
+        # from the precomputed top list.
+        if col in top_entries:
+            return top_entries[col]["separation"]
+        return _feature_separation_df(buy, sell, col)
+
+    numeric_ranked = sorted(
+        (f["col"] for f in analysis["features"] if f["dtype"] == "numeric"),
+        key=_rank_key,
+        reverse=True,
+    )
+
+    base = build_rule_candidates(path)
+    rules: list[dict[str, Any]] = list(base.get("rules", []))
+    for r in rules:
+        r.setdefault("logic", "and")
+
+    rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:80]))
+
+    history: list[dict[str, Any]] = []
+    best_rules: list[dict[str, Any]] = list(rules)
+    best_asset_ratio = -1.0
+    gt_data = load_ground_truth(resolve_ground_truth_file()) or {}
+    gt_trades = gt_data.get("trades") or []
+    mark = (gt_data.get("summary") or {}).get("mark_price")
+
+    for it in range(max_iterations):
+        recall = evaluate_gt_snapshot_recall(df, rules)
+        buy_rec = recall["buy"]["recall"]
+        sell_rec = recall["sell"]["recall"]
+
+        included_legs = _matched_leg_ids(df, gt_trades, rules)
+        asset = portfolio_asset_ratio(gt_trades, included_legs, mark)
+        history.append(
+            {
+                "iteration": it,
+                "rule_count": len(rules),
+                "buy_recall": buy_rec,
+                "sell_recall": sell_rec,
+                **asset,
+            }
+        )
+        print(
+            f"[cal {it}] rules={len(rules)} "
+            f"buy_rec={buy_rec:.2%} sell_rec={sell_rec:.2%} "
+            f"asset_ratio={asset['asset_ratio']:.2%} legs={asset['legs_covered']}/{asset['legs_total']}"
+        )
+
+        targets_met = (
+            buy_rec >= target_recall
+            and sell_rec >= target_recall
+            and asset["asset_ratio"] >= target_asset_ratio
+        )
+        # Rules only accumulate, so on an asset-ratio tie the later, larger
+        # set is preferred (presumably its recall is no worse - the recall
+        # evaluator is defined elsewhere). A set meeting every target always
+        # wins; otherwise the pruning step below could start from an older
+        # set that misses the recall target.
+        if targets_met or asset["asset_ratio"] >= best_asset_ratio:
+            best_asset_ratio = asset["asset_ratio"]
+            best_rules = list(rules)
+        if targets_met:
+            break
+
+        added = 0
+        for side in ("buy", "sell"):
+            if recall[side]["recall"] >= target_recall:
+                continue
+            new_rules = build_unmatched_atomic_rules(df, rules, side, max_new=15)
+            rules.extend(new_rules)
+            added += len(new_rules)
+        if added == 0:
+            # Nothing new from the normal path: widen the OR rules to more
+            # columns and retry atomic rules on both sides.
+            rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:120]))
+            for side in ("buy", "sell"):
+                rules.extend(
+                    build_unmatched_atomic_rules(df, rules, side, max_new=20)
+                )
+            if len(rules) > 200:
+                break
+
+    # Keep the first occurrence of every rule_id from the best set.
+    deduped: list[dict[str, Any]] = []
+    seen_rid: set[str] = set()
+    for r in best_rules:
+        rid = r.get("rule_id", "")
+        if rid in seen_rid:
+            continue
+        seen_rid.add(rid)
+        deduped.append(r)
+
+    final_rules = _greedy_recall_cover(df, deduped, target_recall=target_recall)
+    final_recall = evaluate_gt_snapshot_recall(df, final_rules)
+    final_asset = portfolio_asset_ratio(
+        gt_trades, _matched_leg_ids(df, gt_trades, final_rules), mark
+    )
+
+    out = {
+        "target_recall": target_recall,
+        "target_asset_ratio": target_asset_ratio,
+        "iterations": history,
+        "final": {
+            # rule_count is the size of the full accumulated rule list; the
+            # remaining fields describe the pruned set.
+            "rule_count": len(rules),
+            "snapshot_recall": final_recall,
+            "portfolio": final_asset,
+            "targets_met": bool(
+                final_recall["buy"]["recall"] >= target_recall
+                and final_recall["sell"]["recall"] >= target_recall
+                and final_asset["asset_ratio"] >= target_asset_ratio
+            ),
+            "rule_count_after_greedy": len(final_rules),
+        },
+        "calibrated_rules": final_rules,
+    }
+    ANALYSIS_GT_CALIBRATION_JSON.parent.mkdir(parents=True, exist_ok=True)
+    ANALYSIS_GT_CALIBRATION_JSON.write_text(
+        json.dumps(out, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    return out
+
+
+def _rows_hit_by_rule(rows: pd.DataFrame, rule: dict[str, Any]) -> set[Any]:
+    """Return the index labels of the rows for which ``rule`` holds."""
+    return {
+        idx
+        for idx, row in rows.iterrows()
+        if bool(eval_rule_mask(pd.DataFrame([row]), rule).iloc[0])
+    }
+
+
+def _greedy_recall_cover(
+    trades_df: pd.DataFrame,
+    rules: list[dict[str, Any]],
+    *,
+    target_recall: float = 0.90,
+    max_per_side: int = 40,
+) -> list[dict[str, Any]]:
+    """
+    Shrink the rule set greedily until each side reaches the recall target.
+
+    Rules of the composite kinds listed in ``keep_kinds`` are always kept.
+    For each side, the remaining rules are added one at a time, each time
+    choosing the rule that matches the most still-unmatched GT rows, until
+    the target is met, no rule adds a match, or ``max_per_side`` is reached.
+
+    Args:
+        trades_df: 03b trades CSV content; must contain an ``action`` column.
+        rules: All candidate rules.
+        target_recall: Recall target per side.
+        max_per_side: Maximum number of greedily picked rules per side.
+
+    Returns:
+        The always-kept rules followed by the picked buy and sell rules.
+    """
+    keep_kinds = {
+        "compound_tight",
+        "compound",
+        "contrast",
+        "mtf_cross",
+        "or_tf",
+    }
+    kept = [r for r in rules if r.get("kind") in keep_kinds]
+    pool = [r for r in rules if r.get("kind") not in keep_kinds]
+
+    for side in ("buy", "sell"):
+        gt = trades_df[trades_df["action"] == side]
+        if gt.empty:
+            continue
+        total = len(gt)
+
+        # Start from the rows the always-kept rules leave unmatched, so no
+        # pool rule is picked for rows that are covered already.
+        uncovered = set(gt.index)
+        for rule in [r for r in kept if r.get("side") == side]:
+            if not uncovered:
+                break
+            uncovered -= _rows_hit_by_rule(gt.loc[gt.index.isin(uncovered)], rule)
+        if 1.0 - len(uncovered) / total >= target_recall:
+            continue
+
+        # Evaluate every pool rule once on the open rows; the greedy rounds
+        # below then only intersect sets.
+        open_rows = gt.loc[gt.index.isin(uncovered)]
+        candidates = [
+            (rule, _rows_hit_by_rule(open_rows, rule))
+            for rule in pool
+            if rule.get("side") == side
+        ]
+
+        picked: list[dict[str, Any]] = []
+        while uncovered and len(picked) < max_per_side:
+            best_rule = None
+            best_gain: set[Any] = set()
+            for rule, hits in candidates:
+                gain = hits & uncovered
+                # Strict comparison keeps the earliest rule on ties.
+                if len(gain) > len(best_gain):
+                    best_rule = rule
+                    best_gain = gain
+            if best_rule is None:
+                break
+            picked.append(best_rule)
+            uncovered -= best_gain
+            if 1.0 - len(uncovered) / total >= target_recall:
+                break
+        kept.extend(picked)
+    return kept