GT MTF 프로필·캘리브레이션과 04 매칭/시뮬/실거래 파이프라인을 추가한다.

3분~일봉 GT 타점 분석(03c), leg 체결 순서 수정, 총자산 90% 검증 루프,
walk-forward Go/No-Go 시뮬, monitor·live_trader 및 reference 문서를 포함한다.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-05-31 11:27:50 +09:00
parent b52d61b777
commit 2cb67c42b3
47 changed files with 5956 additions and 209 deletions

View File

@@ -0,0 +1,539 @@
"""
GT 타점 MTF 프로필 반복 보강 — 스냅샷 recall·총자산 비율 90% 목표.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
from config import (
GENERAL_ANALYSIS_INTERVALS,
MATCH_PROFILE_MIN_SAMPLES,
MATCH_PROFILE_MIN_SEPARATION,
)
from deepcoin.analysis.general_analysis_core import interval_tf_prefix
from deepcoin.matching.config import ANALYSIS_TRADES_CSV
from deepcoin.matching.gt_asset_calibration import (
evaluate_gt_snapshot_recall,
portfolio_asset_ratio,
)
from deepcoin.matching.gt_mtf_profile import (
analyze_gt_mtf_profile,
discover_profile_columns,
)
from deepcoin.matching.profile_rules import (
_condition_from_series,
_feature_separation,
build_rule_candidates,
)
from deepcoin.matching.rule_eval import eval_rule_mask
from deepcoin.paths import (
ANALYSIS_GT_CALIBRATION_JSON,
ANALYSIS_GT_MTF_PROFILE_JSON,
resolve_ground_truth_file,
)
from deepcoin.ground_truth.ground_truth import load_ground_truth
def _condition_or_group(
    series: pd.Series,
    side: str,
    quantile_lo: float = 0.15,
    quantile_hi: float = 0.85,
) -> dict[str, Any] | None:
    """
    Build a single condition describing the GT distribution of one column.

    Non-numeric columns yield an ``eq`` condition on the most frequent value
    (compared as a string); numeric columns yield a ``between`` condition
    spanning the requested quantile band.

    Args:
        series: GT values of one column for a single side.
        side: ``"buy"`` or ``"sell"`` (accepted for call-site symmetry; not
            read by this function).
        quantile_lo: Lower quantile of the band.
        quantile_hi: Upper quantile of the band.

    Returns:
        A condition dict, or ``None`` when no usable condition can be derived
        (no non-null values, too few numeric samples, or a degenerate band).
    """
    name = series.name
    treat_as_category = (
        series.dtype == object or not pd.api.types.is_numeric_dtype(series)
    )
    if treat_as_category:
        modes = series.dropna().astype(str).mode()
        if modes.empty:
            return None
        return {"col": name, "op": "eq", "value": str(modes.iloc[0])}

    values = pd.to_numeric(series, errors="coerce").dropna()
    if len(values) < MATCH_PROFILE_MIN_SAMPLES:
        return None

    lower = float(values.quantile(quantile_lo))
    upper = float(values.quantile(quantile_hi))
    # A collapsed or inverted band cannot discriminate anything.
    if lower >= upper:
        return None
    return {"col": name, "op": "between", "lo": lower, "hi": upper}
def build_or_tf_rules(
    buy: pd.DataFrame,
    sell: pd.DataFrame,
    ranked_cols: list[str],
    *,
    per_tf: int = 4,
) -> list[dict[str, Any]]:
    """
    Build per-timeframe OR rules (any one of the top columns may match).

    For every side and every interval in ``GENERAL_ANALYSIS_INTERVALS`` the
    columns carrying that interval's prefix are ranked by buy/sell separation,
    the best ``per_tf`` are turned into 20%-80% quantile conditions, and an OR
    rule is emitted when at least two conditions could be built.

    Args:
        buy: Buy-side GT rows.
        sell: Sell-side GT rows.
        ranked_cols: Candidate columns, ordered by separation.
        per_tf: Maximum number of OR conditions per timeframe.

    Returns:
        List of rule dicts with ``kind == "or_tf"``.
    """
    skipped_prefixes = ("m240",)
    out: list[dict[str, Any]] = []

    for side, side_df in (("buy", buy), ("sell", sell)):
        for interval in GENERAL_ANALYSIS_INTERVALS:
            prefix = interval_tf_prefix(interval)
            marker = f"{prefix}_"

            candidates = [
                name
                for name in ranked_cols
                if name.startswith(marker) and name in side_df.columns
            ]
            top_cols = sorted(
                candidates,
                key=lambda name: _feature_separation(buy, sell, name),
                reverse=True,
            )[:per_tf]

            built = (
                _condition_or_group(side_df[name], side, 0.20, 0.80)
                for name in top_cols
            )
            conditions: list[dict[str, Any]] = [cond for cond in built if cond]

            # An OR rule needs at least two alternatives; the m240 timeframe
            # is excluded from OR rules altogether.
            if len(conditions) < 2 or prefix in skipped_prefixes:
                continue

            out.append(
                {
                    "rule_id": f"{side}_or_{prefix}",
                    "side": side,
                    "kind": "or_tf",
                    "logic": "or",
                    "conditions": conditions,
                }
            )
    return out
def build_unmatched_atomic_rules(
    trades_df: pd.DataFrame,
    rules: list[dict[str, Any]],
    side: str,
    *,
    max_new: int = 12,
) -> list[dict[str, Any]]:
    """
    Propose atomic rules for GT rows that no existing rule of ``side`` matches.

    Columns are scored by the standardized mean difference between the
    unmatched rows of ``side`` and all rows of the opposite side; the
    best-separating columns not already used by a rule of ``side`` are turned
    into single-condition rules.

    Args:
        trades_df: Frame loaded from the 03b trades CSV (needs an ``action``
            column).
        rules: Existing rules; only those whose ``side`` equals ``side`` are
            evaluated.
        side: ``"buy"`` or ``"sell"``.
        max_new: Maximum number of rules to return.

    Returns:
        New rule dicts with ``kind == "calibration_atomic"``; empty when every
        GT row of ``side`` is already matched (or there are none).
    """
    gt = trades_df[trades_df["action"] == side]
    side_rules = [r for r in rules if r.get("side") == side]

    # Rules are evaluated on one-row frames, one GT row at a time.
    unmatched_idx: list[int] = []
    for idx, row in gt.iterrows():
        frame = pd.DataFrame([row])
        if not any(bool(eval_rule_mask(frame, r).iloc[0]) for r in side_rules):
            unmatched_idx.append(idx)
    if not unmatched_idx:
        return []

    unmatched = gt.loc[unmatched_idx]
    opposite = "sell" if side == "buy" else "buy"
    other = trades_df[trades_df["action"] == opposite]

    scores: list[tuple[float, str]] = []
    for col in discover_profile_columns(trades_df):
        if col not in unmatched.columns:
            continue
        if not pd.api.types.is_numeric_dtype(unmatched[col]):
            continue
        u = pd.to_numeric(unmatched[col], errors="coerce").dropna()
        o = pd.to_numeric(other[col], errors="coerce").dropna()
        if len(u) < 3 or len(o) < 5:
            continue
        # Standardized mean difference; the epsilon guards a zero pooled std.
        sep = abs(float(u.mean() - o.mean())) / (
            np.sqrt((u.var() + o.var()) / 2) + 1e-9
        )
        scores.append((sep, col))
    scores.sort(reverse=True)

    existing_cols = {
        c["col"]
        for r in rules
        if r.get("side") == side
        for c in r.get("conditions", [])
    }

    new_rules: list[dict[str, Any]] = []
    # Scan up to three times the quota because many candidates are filtered.
    for sep, col in scores[: max_new * 3]:
        if col in existing_cols:
            continue
        if sep < MATCH_PROFILE_MIN_SEPARATION * 0.5:
            continue
        cond = _condition_from_series(unmatched[col], side)
        if cond is None:
            # Fall back to a wide 10%-90% quantile band.
            cond = _condition_or_group(unmatched[col], side, 0.10, 0.90)
        if cond is None:
            continue
        new_rules.append(
            {
                "rule_id": f"{side}_cal_{col}",
                "side": side,
                "kind": "calibration_atomic",
                "logic": "and",
                "conditions": [cond],
                "profile_col": col,
                "calibration_sep": round(sep, 4),
            }
        )
        existing_cols.add(col)
        if len(new_rules) >= max_new:
            break
    return new_rules
def _feature_separation_df(
    buy: pd.DataFrame,
    sell: pd.DataFrame,
    col: str,
) -> float:
    """
    Return the buy/sell separation of one column.

    The score is the absolute difference of the two means divided by the
    pooled standard deviation. When the pooled deviation is effectively zero
    the raw mean difference is returned instead.

    Args:
        buy: Buy-side rows.
        sell: Sell-side rows.
        col: Column name to score.

    Returns:
        Separation score as a plain ``float``; ``0.0`` when the column is
        missing from either frame or either side has fewer than five numeric
        values.
    """
    # Guard both frames: a column present only on the buy side would
    # otherwise raise KeyError on the sell lookup.
    if col not in buy.columns or col not in sell.columns:
        return 0.0

    a = pd.to_numeric(buy[col], errors="coerce").dropna()
    b = pd.to_numeric(sell[col], errors="coerce").dropna()
    if len(a) < 5 or len(b) < 5:
        return 0.0

    mean_gap = abs(float(a.mean() - b.mean()))
    pooled = float(np.sqrt((a.var() + b.var()) / 2))
    if pooled < 1e-9:
        return mean_gap
    return mean_gap / pooled
def _gt_row_matched_by_rules(
    df: pd.DataFrame,
    dt: Any,
    side: str,
    side_rules: list[dict[str, Any]],
) -> bool:
    """Return True when the snapshot row at (dt, side) satisfies any rule."""
    sub = df[(df["dt"] == dt) & (df["action"] == side)]
    if sub.empty:
        return False
    # Only the first snapshot row for this timestamp/side is evaluated.
    frame = pd.DataFrame([sub.iloc[0]])
    return any(bool(eval_rule_mask(frame, r).iloc[0]) for r in side_rules)


def _fully_matched_leg_ids(
    df: pd.DataFrame,
    gt_df: pd.DataFrame,
    leg_ids: Any,
    rules: list[dict[str, Any]],
) -> set[int]:
    """Return the leg ids whose every buy and sell GT trade matches a rule."""
    rules_by_side = {
        side: [r for r in rules if r.get("side") == side]
        for side in ("buy", "sell")
    }
    matched: set[int] = set()
    for lid in leg_ids:
        leg = gt_df[gt_df["leg_id"] == lid]
        if all(
            _gt_row_matched_by_rules(df, dt, side, side_rules)
            for side, side_rules in rules_by_side.items()
            for dt in leg.loc[leg["action"] == side, "dt"]
        ):
            matched.add(int(lid))
    return matched


def run_profile_calibration_loop(
    trades_csv: Path | None = None,
    *,
    target_recall: float = 0.90,
    target_asset_ratio: float = 0.90,
    max_iterations: int = 5,
) -> dict[str, Any]:
    """
    Iteratively extend the matching rules and validate them against GT.

    Each iteration measures snapshot recall per side and the asset ratio of
    the legs whose trades are all matched, then adds rules for the side(s)
    still below target. The rule set with the best asset ratio is finally
    de-duplicated by ``rule_id`` and reduced by ``_greedy_recall_cover``.

    Side effects: writes the MTF profile analysis to
    ``ANALYSIS_GT_MTF_PROFILE_JSON``, writes the calibration report to
    ``ANALYSIS_GT_CALIBRATION_JSON`` and prints one progress line per
    iteration.

    Args:
        trades_csv: Path of the 03b trades CSV; defaults to
            ``ANALYSIS_TRADES_CSV``.
        target_recall: Snapshot recall target for both buy and sell.
        target_asset_ratio: Target ratio of the matched-leg subset versus the
            GT total assets.
        max_iterations: Maximum number of calibration iterations.

    Returns:
        Calibration report dict (targets, per-iteration history, final metrics
        and the calibrated rules).
    """
    path = trades_csv or ANALYSIS_TRADES_CSV
    df = pd.read_csv(path)
    buy = df[df["action"] == "buy"]
    sell = df[df["action"] == "sell"]

    analysis = analyze_gt_mtf_profile(df)
    ANALYSIS_GT_MTF_PROFILE_JSON.parent.mkdir(parents=True, exist_ok=True)
    ANALYSIS_GT_MTF_PROFILE_JSON.write_text(
        json.dumps(analysis, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Prefer the separation reported by the analysis (first entry per column)
    # and compute it locally only for columns the analysis did not rank.
    reported_sep: dict[str, Any] = {}
    for entry in analysis["global_top_separation"]:
        reported_sep.setdefault(entry["col"], entry["separation"])

    def _rank_key(col: str) -> Any:
        if col in reported_sep:
            return reported_sep[col]
        return _feature_separation_df(buy, sell, col)

    numeric_ranked = sorted(
        [f["col"] for f in analysis["features"] if f["dtype"] == "numeric"],
        key=_rank_key,
        reverse=True,
    )

    base = build_rule_candidates(path)
    rules: list[dict[str, Any]] = list(base.get("rules", []))
    for r in rules:
        if "logic" not in r:
            r["logic"] = "and"
    rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:80]))

    history: list[dict[str, Any]] = []
    best_rules: list[dict[str, Any]] = list(rules)
    best_asset_ratio = -1.0

    gt_data = load_ground_truth(resolve_ground_truth_file()) or {}
    gt_trades = gt_data.get("trades") or []
    mark = (gt_data.get("summary") or {}).get("mark_price")
    gt_df = pd.DataFrame(gt_trades)

    # Legs checked inside the loop: those with at least one buy or sell trade.
    loop_leg_ids = {
        int(t["leg_id"]) for t in gt_trades if t["action"] in ("buy", "sell")
    }
    # Legs checked for the final report. An empty ground truth yields a frame
    # without a "leg_id" column, so guard the lookup instead of raising.
    final_leg_ids = (
        gt_df["leg_id"].unique() if "leg_id" in gt_df.columns else []
    )

    for it in range(max_iterations):
        recall = evaluate_gt_snapshot_recall(df, rules)
        buy_rec = recall["buy"]["recall"]
        sell_rec = recall["sell"]["recall"]

        included_legs = _fully_matched_leg_ids(df, gt_df, loop_leg_ids, rules)
        asset = portfolio_asset_ratio(gt_trades, included_legs, mark)
        history.append(
            {
                "iteration": it,
                "rule_count": len(rules),
                "buy_recall": buy_rec,
                "sell_recall": sell_rec,
                **asset,
            }
        )
        print(
            f"[cal {it}] rules={len(rules)} "
            f"buy_rec={buy_rec:.2%} sell_rec={sell_rec:.2%} "
            f"asset_ratio={asset['asset_ratio']:.2%} legs={asset['legs_covered']}/{asset['legs_total']}"
        )

        if asset["asset_ratio"] > best_asset_ratio:
            best_asset_ratio = asset["asset_ratio"]
            best_rules = list(rules)
        if (
            buy_rec >= target_recall
            and sell_rec >= target_recall
            and asset["asset_ratio"] >= target_asset_ratio
        ):
            break

        added = 0
        for side in ("buy", "sell"):
            if recall[side]["recall"] >= target_recall:
                continue
            new_rules = build_unmatched_atomic_rules(df, rules, side, max_new=15)
            rules.extend(new_rules)
            added += len(new_rules)
        if added == 0:
            # Nothing new was found: widen the OR-rule column pool and retry
            # the atomic search with a larger quota.
            rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:120]))
            for side in ("buy", "sell"):
                rules.extend(
                    build_unmatched_atomic_rules(df, rules, side, max_new=20)
                )
        if len(rules) > 200:
            break

    # Rule count before de-duplication and greedy reduction.
    rule_count = len(rules)

    # Keep the first rule for each rule_id from the best-scoring rule set.
    deduped: list[dict[str, Any]] = []
    seen_rid: set[str] = set()
    for r in best_rules:
        rid = r.get("rule_id", "")
        if rid in seen_rid:
            continue
        seen_rid.add(rid)
        deduped.append(r)

    calibrated = _greedy_recall_cover(df, deduped, target_recall=target_recall)

    # The reported final metrics describe the reduced rule set.
    final_recall = evaluate_gt_snapshot_recall(df, calibrated)
    final_portfolio = portfolio_asset_ratio(
        gt_trades,
        _fully_matched_leg_ids(df, gt_df, final_leg_ids, calibrated),
        mark,
    )
    targets_met = bool(
        final_recall["buy"]["recall"] >= target_recall
        and final_recall["sell"]["recall"] >= target_recall
        and final_portfolio["asset_ratio"] >= target_asset_ratio
    )

    out = {
        "target_recall": target_recall,
        "target_asset_ratio": target_asset_ratio,
        "iterations": history,
        "final": {
            "rule_count": rule_count,
            "snapshot_recall": final_recall,
            "portfolio": final_portfolio,
            "targets_met": targets_met,
            "rule_count_after_greedy": len(calibrated),
        },
        "calibrated_rules": calibrated,
    }

    ANALYSIS_GT_CALIBRATION_JSON.parent.mkdir(parents=True, exist_ok=True)
    ANALYSIS_GT_CALIBRATION_JSON.write_text(
        json.dumps(out, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return out
def _rule_hit_indices(
    rule: dict[str, Any],
    row_frames: dict[Any, pd.DataFrame],
    candidates: Any,
) -> set[Any]:
    """Return the candidate index labels whose one-row frame satisfies rule."""
    return {
        idx
        for idx in candidates
        if bool(eval_rule_mask(row_frames[idx], rule).iloc[0])
    }


def _greedy_recall_cover(
    trades_df: pd.DataFrame,
    rules: list[dict[str, Any]],
    *,
    target_recall: float = 0.90,
    max_per_side: int = 40,
) -> list[dict[str, Any]]:
    """
    Shrink the rule set greedily until each side reaches the recall target.

    Rules whose ``kind`` is compound_tight / compound / contrast / mtf_cross /
    or_tf are always kept. From the remaining pool, rules are picked one at a
    time per side, always taking the rule that covers the most GT rows not yet
    covered by the kept rules or by earlier picks (first rule wins ties).

    Args:
        trades_df: Frame loaded from the 03b trades CSV.
        rules: All candidate rules.
        target_recall: Recall at which picking stops for a side.
        max_per_side: Maximum number of pool rules picked per side.

    Returns:
        The always-kept rules followed by the picked rules (buy, then sell).
    """
    keep_kinds = {
        "compound_tight",
        "compound",
        "contrast",
        "mtf_cross",
        "or_tf",
    }
    kept = [r for r in rules if r.get("kind") in keep_kinds]
    pool = [r for r in rules if r.get("kind") not in keep_kinds]
    always_kept = list(kept)

    for side in ("buy", "sell"):
        gt = trades_df[trades_df["action"] == side]
        if gt.empty:
            continue
        side_pool = [r for r in pool if r.get("side") == side]
        if not side_pool:
            continue

        # Build each one-row frame once; rules are evaluated row by row.
        row_frames = {idx: pd.DataFrame([row]) for idx, row in gt.iterrows()}
        total = len(gt)

        # Start from the rows the always-kept rules leave uncovered, so the
        # first pick is scored by marginal gain like every later pick.
        uncovered = set(row_frames)
        for rule in always_kept:
            if not uncovered:
                break
            if rule.get("side") == side:
                uncovered -= _rule_hit_indices(rule, row_frames, uncovered)

        # Evaluate every pool rule once; later rounds are set intersections.
        pool_hits = [
            _rule_hit_indices(rule, row_frames, uncovered)
            for rule in side_pool
        ]

        picked: list[dict[str, Any]] = []
        while uncovered and len(picked) < max_per_side:
            if 1.0 - len(uncovered) / total >= target_recall:
                break
            best_pos = -1
            best_gain = 0
            for pos, hits in enumerate(pool_hits):
                gain = len(hits & uncovered)
                if gain > best_gain:
                    best_gain = gain
                    best_pos = pos
            if best_gain == 0:
                break
            picked.append(side_pool[best_pos])
            uncovered -= pool_hits[best_pos]
        kept.extend(picked)
    return kept