GT MTF 프로필·캘리브레이션과 04 매칭/시뮬/실거래 파이프라인을 추가한다.
3분~일봉 GT 타점 분석(03c), leg 체결 순서 수정, 총자산 90% 검증 루프, walk-forward Go/No-Go 시뮬, monitor·live_trader 및 reference 문서를 포함한다. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
539
deepcoin/matching/gt_profile_iterate.py
Normal file
539
deepcoin/matching/gt_profile_iterate.py
Normal file
@@ -0,0 +1,539 @@
|
||||
"""
|
||||
GT 타점 MTF 프로필 반복 보강 — 스냅샷 recall·총자산 비율 90% 목표.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from config import (
|
||||
GENERAL_ANALYSIS_INTERVALS,
|
||||
MATCH_PROFILE_MIN_SAMPLES,
|
||||
MATCH_PROFILE_MIN_SEPARATION,
|
||||
)
|
||||
from deepcoin.analysis.general_analysis_core import interval_tf_prefix
|
||||
from deepcoin.matching.config import ANALYSIS_TRADES_CSV
|
||||
from deepcoin.matching.gt_asset_calibration import (
|
||||
evaluate_gt_snapshot_recall,
|
||||
portfolio_asset_ratio,
|
||||
)
|
||||
from deepcoin.matching.gt_mtf_profile import (
|
||||
analyze_gt_mtf_profile,
|
||||
discover_profile_columns,
|
||||
)
|
||||
from deepcoin.matching.profile_rules import (
|
||||
_condition_from_series,
|
||||
_feature_separation,
|
||||
build_rule_candidates,
|
||||
)
|
||||
from deepcoin.matching.rule_eval import eval_rule_mask
|
||||
from deepcoin.paths import (
|
||||
ANALYSIS_GT_CALIBRATION_JSON,
|
||||
ANALYSIS_GT_MTF_PROFILE_JSON,
|
||||
resolve_ground_truth_file,
|
||||
)
|
||||
from deepcoin.ground_truth.ground_truth import load_ground_truth
|
||||
|
||||
|
||||
def _condition_or_group(
    series: pd.Series,
    side: str,
    quantile_lo: float = 0.15,
    quantile_hi: float = 0.85,
) -> dict[str, Any] | None:
    """
    Derive a single rule condition from one column of GT values.

    Non-numeric columns produce an ``eq`` condition on the most frequent
    (stringified) value; numeric columns produce a ``between`` condition
    spanning the requested quantile band.

    Args:
        series: Column values for one side's GT rows; ``series.name`` becomes
            the condition's ``col``.
        side: ``"buy"`` or ``"sell"``. Not read by this function.
        quantile_lo: Lower quantile used as the ``lo`` bound.
        quantile_hi: Upper quantile used as the ``hi`` bound.

    Returns:
        A condition dict, or ``None`` when no condition can be built (no mode,
        fewer than ``MATCH_PROFILE_MIN_SAMPLES`` numeric values, or an empty
        band where ``lo >= hi``).
    """
    column = series.name
    treat_as_numeric = series.dtype != object and pd.api.types.is_numeric_dtype(
        series
    )

    if not treat_as_numeric:
        # Categorical path: match the dominant label.
        modes = series.dropna().astype(str).mode()
        if modes.empty:
            return None
        return {"col": column, "op": "eq", "value": str(modes.iloc[0])}

    values = pd.to_numeric(series, errors="coerce").dropna()
    if len(values) < MATCH_PROFILE_MIN_SAMPLES:
        return None

    lower = float(values.quantile(quantile_lo))
    upper = float(values.quantile(quantile_hi))
    if lower >= upper:
        # Degenerate band (e.g. near-constant column) carries no information.
        return None
    return {"col": column, "op": "between", "lo": lower, "hi": upper}
|
||||
|
||||
|
||||
def build_or_tf_rules(
    buy: pd.DataFrame,
    sell: pd.DataFrame,
    ranked_cols: list[str],
    *,
    per_tf: int = 4,
) -> list[dict[str, Any]]:
    """
    Build per-timeframe OR rules (any one of the timeframe's top columns may match).

    For each side and each interval in ``GENERAL_ANALYSIS_INTERVALS``, the
    columns carrying that interval's prefix are ordered by
    ``_feature_separation`` (descending), the first ``per_tf`` are turned into
    conditions, and a rule is emitted when at least two conditions exist.
    The ``m240`` prefix never yields a rule.

    Args:
        buy: Buy-side GT rows.
        sell: Sell-side GT rows.
        ranked_cols: Candidate columns, ordered by separation.
        per_tf: Maximum number of OR conditions per timeframe.

    Returns:
        List of rule dicts with ``kind == "or_tf"`` and ``logic == "or"``.
    """
    out: list[dict[str, Any]] = []
    side_frames = (("buy", buy), ("sell", sell))

    for side, frame in side_frames:
        for interval in GENERAL_ANALYSIS_INTERVALS:
            prefix = interval_tf_prefix(interval)
            col_prefix = f"{prefix}_"

            candidates = [
                name
                for name in ranked_cols
                if name.startswith(col_prefix) and name in frame.columns
            ]
            candidates.sort(
                key=lambda name: _feature_separation(buy, sell, name),
                reverse=True,
            )

            conditions: list[dict[str, Any]] = []
            for name in candidates[:per_tf]:
                condition = _condition_or_group(frame[name], side, 0.20, 0.80)
                if condition:
                    conditions.append(condition)

            # An OR over fewer than two conditions is just an atomic rule.
            if len(conditions) < 2 or prefix in ("m240",):
                continue

            out.append(
                {
                    "rule_id": f"{side}_or_{prefix}",
                    "side": side,
                    "kind": "or_tf",
                    "logic": "or",
                    "conditions": conditions,
                }
            )
    return out
|
||||
|
||||
|
||||
def build_unmatched_atomic_rules(
    trades_df: pd.DataFrame,
    rules: list[dict[str, Any]],
    side: str,
    *,
    max_new: int = 12,
) -> list[dict[str, Any]]:
    """
    Propose atomic rules for GT rows of ``side`` that no existing rule matches.

    Numeric profile columns are scored by how strongly the unmatched rows
    differ from the opposite side's rows (absolute mean gap over pooled
    standard deviation); the best-scoring columns not already used by a rule
    of this side each become a single-condition rule.

    Args:
        trades_df: Trades frame with an ``action`` column (the 03b CSV).
        rules: Existing rules; only those whose ``side`` equals ``side`` are
            used for matching and for the "column already used" check.
        side: ``"buy"`` or ``"sell"``.
        max_new: Maximum number of rules to return. At most ``max_new * 3``
            top-scoring columns are examined.

    Returns:
        New rule dicts with ``kind == "calibration_atomic"``; empty when every
        GT row of ``side`` is already matched (or there are none).
    """
    gt = trades_df[trades_df["action"] == side]
    side_rules = [r for r in rules if r.get("side") == side]

    unmatched_idx: list[Any] = []
    for idx, row in gt.iterrows():
        frame = pd.DataFrame([row])
        if not any(bool(eval_rule_mask(frame, r).iloc[0]) for r in side_rules):
            unmatched_idx.append(idx)

    if not unmatched_idx:
        return []

    unmatched = gt.loc[unmatched_idx]
    # Contrast group: the opposite side (anything other than "buy" contrasts with buy).
    other = trades_df[trades_df["action"] == ("sell" if side == "buy" else "buy")]

    scores: list[tuple[float, str]] = []
    for col in discover_profile_columns(trades_df):
        if col not in unmatched.columns:
            continue
        if not pd.api.types.is_numeric_dtype(unmatched[col]):
            continue
        u = pd.to_numeric(unmatched[col], errors="coerce").dropna()
        o = pd.to_numeric(other[col], errors="coerce").dropna()
        if len(u) < 3 or len(o) < 5:
            continue
        # The epsilon keeps the score finite for constant columns.
        sep = abs(float(u.mean() - o.mean())) / (
            np.sqrt((u.var() + o.var()) / 2) + 1e-9
        )
        scores.append((sep, col))

    scores.sort(reverse=True)

    existing_cols = {
        c["col"]
        for r in rules
        if r.get("side") == side
        for c in r.get("conditions", [])
    }

    new_rules: list[dict[str, Any]] = []
    for sep, col in scores[: max_new * 3]:
        if col in existing_cols:
            continue
        # Calibration rules accept half the usual separation threshold.
        if sep < MATCH_PROFILE_MIN_SEPARATION * 0.5:
            continue
        cond = _condition_from_series(unmatched[col], side)
        if cond is None:
            # Fall back to a wide quantile band.
            cond = _condition_or_group(unmatched[col], side, 0.10, 0.90)
        if cond is None:
            continue
        new_rules.append(
            {
                "rule_id": f"{side}_cal_{col}",
                "side": side,
                "kind": "calibration_atomic",
                "logic": "and",
                "conditions": [cond],
                "profile_col": col,
                "calibration_sep": round(sep, 4),
            }
        )
        existing_cols.add(col)
        if len(new_rules) >= max_new:
            break
    return new_rules
|
||||
|
||||
|
||||
def _feature_separation_df(
    buy: pd.DataFrame,
    sell: pd.DataFrame,
    col: str,
) -> float:
    """
    Separation of one column between the buy and sell frames.

    The score is ``|mean(buy) - mean(sell)|`` divided by the pooled standard
    deviation ``sqrt((var(buy) + var(sell)) / 2)``. Values are coerced to
    numeric and NaNs are dropped first.

    Args:
        buy: Buy-side rows.
        sell: Sell-side rows.
        col: Column to score.

    Returns:
        The separation as a builtin ``float``. ``0.0`` when the column is
        missing from either frame or either side has fewer than 5 usable
        values. When the pooled deviation is effectively zero, the raw
        absolute mean gap is returned instead of dividing.
    """
    # Check both frames: indexing `sell[col]` on a missing column would raise.
    if col not in buy.columns or col not in sell.columns:
        return 0.0

    a = pd.to_numeric(buy[col], errors="coerce").dropna()
    b = pd.to_numeric(sell[col], errors="coerce").dropna()
    if len(a) < 5 or len(b) < 5:
        return 0.0

    gap = abs(float(a.mean() - b.mean()))
    pooled = float(np.sqrt((a.var() + b.var()) / 2))
    if pooled < 1e-9:
        return gap
    return gap / pooled
|
||||
|
||||
|
||||
def run_profile_calibration_loop(
    trades_csv: Path | None = None,
    *,
    target_recall: float = 0.90,
    target_asset_ratio: float = 0.90,
    max_iterations: int = 5,
) -> dict[str, Any]:
    """
    Iteratively extend the rule set until recall and asset-ratio targets are met.

    Steps:
      1. Load the trades CSV, run ``analyze_gt_mtf_profile`` and write the
         result to ``ANALYSIS_GT_MTF_PROFILE_JSON``.
      2. Start from ``build_rule_candidates`` plus per-timeframe OR rules.
      3. Up to ``max_iterations`` times: measure snapshot recall and the asset
         ratio of fully matched legs, remember the rule set with the best
         asset ratio, stop if all targets are met, otherwise add atomic rules
         for unmatched rows (stopping once more than 200 rules exist).
      4. De-duplicate the best rule set by ``rule_id``, shrink it with
         ``_greedy_recall_cover`` and re-evaluate recall and asset ratio.
      5. Write the report to ``ANALYSIS_GT_CALIBRATION_JSON``.

    One progress line is printed per iteration.

    Args:
        trades_csv: Trades CSV path; defaults to ``ANALYSIS_TRADES_CSV``.
        target_recall: Required snapshot recall for both buy and sell.
        target_asset_ratio: Required asset ratio of the covered legs.
        max_iterations: Maximum number of calibration iterations.

    Returns:
        Report dict with ``target_recall``, ``target_asset_ratio``,
        ``iterations`` (per-iteration history), ``final`` and
        ``calibrated_rules`` (the greedy-reduced rules).
    """
    path = trades_csv or ANALYSIS_TRADES_CSV
    df = pd.read_csv(path)
    buy = df[df["action"] == "buy"]
    sell = df[df["action"] == "sell"]

    analysis = analyze_gt_mtf_profile(df)
    ANALYSIS_GT_MTF_PROFILE_JSON.parent.mkdir(parents=True, exist_ok=True)
    ANALYSIS_GT_MTF_PROFILE_JSON.write_text(
        json.dumps(analysis, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # Precomputed separations, first occurrence wins. The fallback is only
    # computed for columns that have no precomputed value.
    top_separation: dict[str, Any] = {}
    for entry in analysis.get("global_top_separation", []):
        top_separation.setdefault(entry["col"], entry["separation"])

    def _separation_of(col: str) -> Any:
        if col in top_separation:
            return top_separation[col]
        return _feature_separation_df(buy, sell, col)

    numeric_ranked = sorted(
        [f["col"] for f in analysis["features"] if f["dtype"] == "numeric"],
        key=_separation_of,
        reverse=True,
    )

    base = build_rule_candidates(path)
    rules: list[dict[str, Any]] = list(base.get("rules", []))
    for r in rules:
        r.setdefault("logic", "and")

    rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:80]))

    history: list[dict[str, Any]] = []
    best_rules: list[dict[str, Any]] = list(rules)
    best_asset_ratio = -1.0
    gt_data = load_ground_truth(resolve_ground_truth_file()) or {}
    gt_trades = gt_data.get("trades") or []
    mark = (gt_data.get("summary") or {}).get("mark_price")

    # Loop-invariant views of the ground truth.
    gt_df = pd.DataFrame(gt_trades)
    traded_legs = {
        int(t["leg_id"]) for t in gt_trades if t["action"] in ("buy", "sell")
    }

    for it in range(max_iterations):
        recall = evaluate_gt_snapshot_recall(df, rules)
        buy_rec = recall["buy"]["recall"]
        sell_rec = recall["sell"]["recall"]

        included_legs = _legs_fully_matched(df, gt_df, rules, traded_legs)
        asset = portfolio_asset_ratio(gt_trades, included_legs, mark)
        history.append(
            {
                "iteration": it,
                "rule_count": len(rules),
                "buy_recall": buy_rec,
                "sell_recall": sell_rec,
                **asset,
            }
        )
        print(
            f"[cal {it}] rules={len(rules)} "
            f"buy_rec={buy_rec:.2%} sell_rec={sell_rec:.2%} "
            f"asset_ratio={asset['asset_ratio']:.2%} legs={asset['legs_covered']}/{asset['legs_total']}"
        )
        if asset["asset_ratio"] > best_asset_ratio:
            best_asset_ratio = asset["asset_ratio"]
            best_rules = list(rules)

        if (
            buy_rec >= target_recall
            and sell_rec >= target_recall
            and asset["asset_ratio"] >= target_asset_ratio
        ):
            break

        added = 0
        for side in ("buy", "sell"):
            if recall[side]["recall"] >= target_recall:
                continue
            new_rules = build_unmatched_atomic_rules(df, rules, side, max_new=15)
            rules.extend(new_rules)
            added += len(new_rules)
        if added == 0:
            # Nothing new from the recall-driven pass: widen the search.
            rules.extend(build_or_tf_rules(buy, sell, numeric_ranked[:120]))
            for side in ("buy", "sell"):
                rules.extend(
                    build_unmatched_atomic_rules(df, rules, side, max_new=20)
                )
        if len(rules) > 200:
            break

    # "rule_count" reports the size of the last working rule set, while the
    # greedy reduction below starts from the best-by-asset-ratio snapshot.
    rule_count_before_greedy = len(rules)

    deduped: list[dict[str, Any]] = []
    seen_rid: set[str] = set()
    for r in best_rules:
        rid = r.get("rule_id", "")
        if rid in seen_rid:
            continue
        seen_rid.add(rid)
        deduped.append(r)

    rules = _greedy_recall_cover(df, deduped, target_recall=target_recall)

    snapshot_recall = evaluate_gt_snapshot_recall(df, rules)
    # With no GT trades the frame has no "leg_id" column; treat as no legs.
    all_leg_ids = gt_df["leg_id"].unique() if "leg_id" in gt_df.columns else []
    portfolio = portfolio_asset_ratio(
        gt_trades,
        _legs_fully_matched(df, gt_df, rules, all_leg_ids),
        mark,
    )

    out = {
        "target_recall": target_recall,
        "target_asset_ratio": target_asset_ratio,
        "iterations": history,
        "final": {
            "rule_count": rule_count_before_greedy,
            "snapshot_recall": snapshot_recall,
            "portfolio": portfolio,
            "targets_met": (
                snapshot_recall["buy"]["recall"] >= target_recall
                and snapshot_recall["sell"]["recall"] >= target_recall
                and portfolio["asset_ratio"] >= target_asset_ratio
            ),
            "rule_count_after_greedy": len(rules),
        },
        "calibrated_rules": rules,
    }

    ANALYSIS_GT_CALIBRATION_JSON.parent.mkdir(parents=True, exist_ok=True)
    ANALYSIS_GT_CALIBRATION_JSON.write_text(
        json.dumps(out, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return out


def _gt_trade_matches(
    df: pd.DataFrame,
    dt: Any,
    side: str,
    side_rules: list[dict[str, Any]],
) -> bool:
    """Return True when the first snapshot row for (``dt``, ``side``) matches any rule."""
    sub = df[(df["dt"] == dt) & (df["action"] == side)]
    if sub.empty:
        return False
    frame = pd.DataFrame([sub.iloc[0]])
    return any(bool(eval_rule_mask(frame, r).iloc[0]) for r in side_rules)


def _legs_fully_matched(
    df: pd.DataFrame,
    gt_df: pd.DataFrame,
    rules: list[dict[str, Any]],
    leg_ids: Any,
) -> set[int]:
    """
    Return the leg ids whose every buy and every sell GT trade is rule-matched.

    Args:
        df: Snapshot trades frame with ``dt`` and ``action`` columns.
        gt_df: Ground-truth trades frame with ``leg_id``, ``action`` and ``dt``.
        rules: Rules to test; each is applied only to rows of its own ``side``.
        leg_ids: Leg ids to examine. ``gt_df`` is not touched when this is empty.

    Returns:
        Set of covered leg ids as ``int``.
    """
    rules_by_side = {
        side: [r for r in rules if r.get("side") == side]
        for side in ("buy", "sell")
    }
    covered: set[int] = set()
    for lid in leg_ids:
        leg = gt_df[gt_df["leg_id"] == lid]
        fully_matched = all(
            _gt_trade_matches(df, row["dt"], side, rules_by_side[side])
            for side in ("buy", "sell")
            for _, row in leg[leg["action"] == side].iterrows()
        )
        if fully_matched:
            covered.add(int(lid))
    return covered
|
||||
|
||||
|
||||
def _greedy_recall_cover(
    trades_df: pd.DataFrame,
    rules: list[dict[str, Any]],
    *,
    target_recall: float = 0.90,
    max_per_side: int = 40,
) -> list[dict[str, Any]]:
    """
    Shrink the rule set with a greedy cover that reaches the recall target per side.

    Rules whose ``kind`` is ``compound_tight``, ``compound``, ``contrast``,
    ``mtf_cross`` or ``or_tf`` are always kept. For each side, rows already
    matched by the kept rules count as covered; remaining rules are then
    added one at a time, each time choosing the rule that matches the most
    still-uncovered rows (first such rule on ties), until the recall target
    is reached, no rule adds coverage, or ``max_per_side`` rules were picked.

    Args:
        trades_df: Trades frame with an ``action`` column (the 03b CSV).
        rules: All candidate rules.
        target_recall: Fraction of a side's rows that must be covered.
        max_per_side: Maximum number of non-kept rules picked per side.

    Returns:
        The always-kept rules followed by the picked rules (buy picks, then
        sell picks).
    """
    keep_kinds = {
        "compound_tight",
        "compound",
        "contrast",
        "mtf_cross",
        "or_tf",
    }
    kept = [r for r in rules if r.get("kind") in keep_kinds]
    pool = [r for r in rules if r.get("kind") not in keep_kinds]

    for side in ("buy", "sell"):
        gt = trades_df[trades_df["action"] == side]
        if gt.empty:
            continue

        # One single-row frame per GT row, built once and reused for every rule.
        row_frames = {idx: pd.DataFrame([gt.loc[idx]]) for idx in gt.index}
        total = len(gt)

        # Seed coverage with the always-kept rules so that picks are scored
        # only against rows that still need a rule.
        uncovered = set(row_frames)
        for rule in kept:
            if rule.get("side") == side and uncovered:
                uncovered -= _rule_hit_ids(row_frames, rule)

        side_pool = [r for r in pool if r.get("side") == side]
        pool_hits: list[set[Any]] | None = None  # evaluated lazily, once
        chosen: list[int] = []

        while uncovered and len(chosen) < max_per_side:
            if 1.0 - len(uncovered) / total >= target_recall:
                break
            if pool_hits is None:
                pool_hits = [_rule_hit_ids(row_frames, r) for r in side_pool]

            best_i = None
            best_gain = 0
            for i, hits in enumerate(pool_hits):
                if i in chosen:
                    continue
                gain = len(hits & uncovered)
                if gain > best_gain:
                    best_gain = gain
                    best_i = i
            if best_i is None:
                # No remaining rule covers any uncovered row.
                break

            chosen.append(best_i)
            uncovered -= pool_hits[best_i]

        kept.extend(side_pool[i] for i in chosen)
    return kept


def _rule_hit_ids(
    row_frames: dict[Any, pd.DataFrame],
    rule: dict[str, Any],
) -> set[Any]:
    """Return the keys of the single-row frames that ``rule`` matches."""
    return {
        idx
        for idx, frame in row_frames.items()
        if bool(eval_rule_mask(frame, rule).iloc[0])
    }
|
||||
Reference in New Issue
Block a user