Files
Bithumb/deepcoin/matching/gt_mtf_profile.py
dsyoon 2cb67c42b3 GT MTF 프로필·캘리브레이션과 04 매칭/시뮬/실거래 파이프라인을 추가한다.
3분~일봉 GT 타점 분석(03c), leg 체결 순서 수정, 총자산 90% 검증 루프,
walk-forward Go/No-Go 시뮬, monitor·live_trader 및 reference 문서를 포함한다.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-31 11:27:50 +09:00

515 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
GT 매수/매도 타점 MTF 프로필 분석 (3분~일봉 전 TF).
03b wide CSV에서 간격별·기법별 분포를 비교하고,
04 규칙 후보 생성용 피처 목록을 산출합니다.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
from config import (
GENERAL_ANALYSIS_INTERVALS,
MATCH_PROFILE_MIN_SAMPLES,
MATCH_PROFILE_MIN_SEPARATION,
MATCH_PROFILE_TOP_GLOBAL,
MATCH_PROFILE_TOP_PER_TF,
)
from deepcoin.analysis.general_analysis_config import INTERVAL_PREFIX
from deepcoin.analysis.general_analysis_core import interval_tf_prefix
from deepcoin.matching.config import ANALYSIS_TRADES_CSV, META_COLS
from deepcoin.paths import ANALYSIS_GT_MTF_PROFILE_HTML, ANALYSIS_GT_MTF_PROFILE_JSON
def _feature_separation(
    buy: pd.Series,
    sell: pd.Series,
) -> float:
    """
    Separation between the buy and sell GT distributions (Cohen's-d-like).

    Both inputs are coerced to numeric and NaNs are dropped. The score is
    ``|mean(buy) - mean(sell)| / sqrt((var(buy) + var(sell)) / 2)``.

    Args:
        buy: Values observed at buy points.
        sell: Values observed at sell points.

    Returns:
        The separation as a plain ``float``. Returns 0.0 when either side has
        fewer than ``MATCH_PROFILE_MIN_SAMPLES`` usable values, or when the
        mean gap or pooled deviation is not finite (e.g. a single sample per
        side makes the variance undefined, or the column contains +/-inf).
        When the pooled deviation is effectively zero (< 1e-9) the raw
        absolute mean gap is returned instead of dividing by it.
    """
    a = pd.to_numeric(buy, errors="coerce").dropna()
    b = pd.to_numeric(sell, errors="coerce").dropna()
    if len(a) < MATCH_PROFILE_MIN_SAMPLES or len(b) < MATCH_PROFILE_MIN_SAMPLES:
        return 0.0

    mean_gap = abs(float(a.mean() - b.mean()))
    pooled = float(np.sqrt((a.var() + b.var()) / 2))

    # A NaN/inf score would poison the descending sort used for ranking and
    # would be emitted as the non-standard token ``NaN`` by json.dumps, so
    # treat it the same way as "not enough data".
    if not (np.isfinite(mean_gap) and np.isfinite(pooled)):
        return 0.0

    # Both sides are (near-)constant: avoid dividing by ~0.
    if pooled < 1e-9:
        return mean_gap
    return mean_gap / pooled
def _numeric_stats(series: pd.Series) -> dict[str, float | int]:
"""
숫자 컬럼 요약 통계.
Args:
series: 한 side GT 값.
Returns:
count, mean, median, q25, q75, std.
"""
s = pd.to_numeric(series, errors="coerce").dropna()
if s.empty:
return {"count": 0}
return {
"count": int(len(s)),
"mean": round(float(s.mean()), 4),
"median": round(float(s.median()), 4),
"q25": round(float(s.quantile(0.25)), 4),
"q75": round(float(s.quantile(0.75)), 4),
"std": round(float(s.std()), 4) if len(s) > 1 else 0.0,
}
def _categorical_stats(series: pd.Series) -> dict[str, Any]:
"""
범주형 컬럼 최빈값·비율.
Args:
series: GT 값.
Returns:
mode, mode_frac, value_counts 상위 5.
"""
s = series.dropna().astype(str)
if s.empty:
return {"count": 0}
vc = s.value_counts()
mode = str(vc.index[0])
return {
"count": int(len(s)),
"mode": mode,
"mode_frac": round(float(vc.iloc[0] / len(s)), 3),
"top": {str(k): int(v) for k, v in vc.head(5).items()},
}
def _parse_tf_column(col: str) -> tuple[str, int | None, str]:
    """
    Split a column name into timeframe label, interval and base name.

    Args:
        col: Column name, e.g. ``m3_ga_rsi`` or ``ga_align_timing_buy_score``.

    Returns:
        ``(tf_label, interval, base_name)``:
        - columns starting with ``ga_align_`` give ``("mtf_align", None, col)``;
        - columns starting with an ``INTERVAL_PREFIX`` value followed by ``_``
          give that prefix, the ``INTERVAL_PREFIX`` key mapped to it, and the
          remainder of the name after the underscore;
        - anything else gives ``("other", None, col)``.
    """
    if col.startswith("ga_align_"):
        return ("mtf_align", None, col)

    # Reverse lookup: prefix -> INTERVAL_PREFIX key.
    interval_by_prefix = {prefix: key for key, prefix in INTERVAL_PREFIX.items()}

    # Try longer prefixes first so a prefix that extends another one wins.
    for prefix in sorted(interval_by_prefix, key=len, reverse=True):
        marker = f"{prefix}_"
        if col.startswith(marker):
            return (prefix, interval_by_prefix[prefix], col[len(prefix) + 1 :])

    return ("other", None, col)
def _feature_family(base: str) -> str:
"""기법군 라벨."""
if base in ("bb_pos", "RSI", "macd_hist", "stoch_k", "stoch_d", "BB_Width"):
return "legacy"
if base.startswith("ga_align_"):
return "mtf_align"
if "pattern" in base:
return "pattern"
if "struct" in base or "elliott" in base or "wyckoff" in base or "fib_" in base:
return "wave_structure"
if "chart" in base:
return "chart"
if "volume" in base or "vp_" in base:
return "volume"
if "harmonic" in base:
return "harmonic"
if base.startswith("ga_"):
return "indicator"
return "other"
def discover_profile_columns(df: pd.DataFrame) -> list[str]:
    """
    List the columns eligible for rule/profile analysis.

    A column qualifies when it is not in ``META_COLS``, has at least
    ``MATCH_PROFILE_MIN_SAMPLES`` non-null values, and is either numeric or
    has between 2 and 20 distinct non-null values (compared as strings).

    Args:
        df: DataFrame loaded from the 03b wide CSV.

    Returns:
        Eligible column names, in the DataFrame's column order.
    """
    excluded = set(META_COLS)

    def _eligible(name) -> bool:
        if name in excluded:
            return False
        values = df[name]
        if values.notna().sum() < MATCH_PROFILE_MIN_SAMPLES:
            return False
        if pd.api.types.is_numeric_dtype(values):
            return True
        # Non-numeric: keep only low-cardinality columns that actually vary.
        distinct = values.dropna().astype(str).nunique()
        return 1 < distinct <= 20

    return [name for name in df.columns if _eligible(name)]
def _analyze_one_column(
    buy: pd.DataFrame,
    sell: pd.DataFrame,
    col: str,
) -> dict[str, Any]:
    """
    Compare buy-side and sell-side GT values for a single column.

    Args:
        buy: Buy rows.
        sell: Sell rows.
        col: Column to analyse.

    Returns:
        A dict with ``col``, ``tf``, ``interval``, ``base``, ``family``,
        ``dtype``, per-side ``buy``/``sell`` statistics and ``separation``.
        Numeric columns also carry ``buy_lower_than_sell`` (``None`` when a
        median is unavailable); categorical columns get ``separation`` 0.0 and,
        when both sides have a mode, ``modes_differ``.
    """
    tf_label, interval, base = _parse_tf_column(col)
    family = _feature_family(base)
    buy_values = buy[col]
    # The dtype is decided from the buy side only.
    is_numeric = pd.api.types.is_numeric_dtype(buy_values)

    result: dict[str, Any] = {
        "col": col,
        "tf": tf_label,
        "interval": interval,
        "base": base,
        "family": family,
        "dtype": "numeric" if is_numeric else "categorical",
    }

    if not is_numeric:
        buy_summary = _categorical_stats(buy_values)
        sell_summary = _categorical_stats(sell[col])
        result["buy"] = buy_summary
        result["sell"] = sell_summary
        result["separation"] = 0.0
        buy_mode = buy_summary.get("mode")
        sell_mode = sell_summary.get("mode")
        if buy_mode and sell_mode:
            result["modes_differ"] = buy_mode != sell_mode
        return result

    sell_values = sell[col]
    buy_summary = _numeric_stats(buy_values)
    sell_summary = _numeric_stats(sell_values)
    result["buy"] = buy_summary
    result["sell"] = sell_summary
    result["separation"] = round(_feature_separation(buy_values, sell_values), 4)

    # Direction hint: is the typical buy value below the typical sell value?
    buy_median = buy_summary.get("median")
    sell_median = sell_summary.get("median")
    if buy_median is None or sell_median is None:
        result["buy_lower_than_sell"] = None
    else:
        result["buy_lower_than_sell"] = buy_median < sell_median
    return result
def analyze_gt_mtf_profile(df: pd.DataFrame) -> dict[str, Any]:
    """
    Profile GT buy vs. sell points over every timeframe and eligible column.

    Args:
        df: Contents of general_analysis_trades.csv; must have an ``action``
            column whose values include ``"buy"`` / ``"sell"``.

    Returns:
        A JSON-serialisable dict with row counts, the config thresholds used,
        the 40 best-separated numeric features, per-interval summaries,
        the ``mtf_align`` family features, the selected buy/sell feature lists
        and the full per-column feature records.
    """
    actions = df["action"]
    buy_rows = df[actions == "buy"].copy()
    sell_rows = df[actions == "sell"].copy()

    columns = discover_profile_columns(df)
    features: list[dict[str, Any]] = [
        _analyze_one_column(buy_rows, sell_rows, name) for name in columns
    ]
    numeric = [feat for feat in features if feat["dtype"] == "numeric"]

    def _by_separation(items) -> list[dict[str, Any]]:
        # Highest separation first; ties keep their incoming order.
        return sorted(items, key=lambda feat: feat["separation"], reverse=True)

    def _brief(items) -> list[dict[str, Any]]:
        return [
            {"col": feat["col"], "separation": feat["separation"]}
            for feat in items
        ]

    ranked = _by_separation(numeric)

    by_interval: dict[str, dict[str, Any]] = {}
    for minutes in GENERAL_ANALYSIS_INTERVALS:
        prefix = interval_tf_prefix(minutes)
        tf_ranked = _by_separation(
            [feat for feat in numeric if feat["tf"] == prefix]
        )
        lower_on_buy = [
            feat for feat in tf_ranked if feat.get("buy_lower_than_sell") is True
        ]
        higher_on_buy = [
            feat for feat in tf_ranked if feat.get("buy_lower_than_sell") is False
        ]
        by_interval[prefix] = {
            "interval_minutes": minutes,
            "feature_count": len(tf_ranked),
            "top_separation": _brief(tf_ranked[:15]),
            "buy_favor_lower_median": _brief(lower_on_buy[:8]),
            "sell_favor_higher_median": _brief(higher_on_buy[:8]),
        }

    align_feats = [feat for feat in features if feat["family"] == "mtf_align"]
    selected_buy = _select_side_features(ranked, "buy")
    selected_sell = _select_side_features(ranked, "sell")

    global_top = [
        {
            "col": feat["col"],
            "tf": feat["tf"],
            "family": feat["family"],
            "separation": feat["separation"],
            "buy_median": feat["buy"].get("median"),
            "sell_median": feat["sell"].get("median"),
        }
        for feat in ranked[:40]
    ]

    return {
        "source_rows": int(len(df)),
        "buy_gt_count": int(len(buy_rows)),
        "sell_gt_count": int(len(sell_rows)),
        "columns_analyzed": len(columns),
        "intervals": list(GENERAL_ANALYSIS_INTERVALS),
        "config": {
            "top_per_tf": MATCH_PROFILE_TOP_PER_TF,
            "top_global": MATCH_PROFILE_TOP_GLOBAL,
            "min_separation": MATCH_PROFILE_MIN_SEPARATION,
            "min_samples": MATCH_PROFILE_MIN_SAMPLES,
        },
        "global_top_separation": global_top,
        "by_interval": by_interval,
        "mtf_align": align_feats,
        "selected_features": {
            "buy": selected_buy,
            "sell": selected_sell,
        },
        "features": features,
    }
def _select_side_features(
    ranked: list[dict[str, Any]],
    side: str,
) -> list[str]:
    """
    Build the feature list for 04 rules: per-timeframe top picks plus global top picks.

    Only features with ``separation >= MATCH_PROFILE_MIN_SEPARATION`` are
    considered. Within each pool, features are ordered by separation and, on
    ties, by whether their direction hint matches ``side``. Six fixed
    ``ga_align_*`` names are always appended at the end.

    Args:
        ranked: Numeric feature dicts, sorted by descending separation.
        side: ``"buy"`` selects the buy ordering; any other value selects the
            sell ordering.

    Returns:
        Column names, de-duplicated with first-seen order preserved.
    """
    wants_buy = side == "buy"

    def _priority(feat: dict[str, Any]) -> tuple[Any, int]:
        hint = feat.get("buy_lower_than_sell")
        # Buy side prefers a truthy hint; sell side requires an explicit False.
        matches = bool(hint) if wants_buy else hint is False
        return (feat["separation"], 1 if matches else 0)

    eligible = [
        feat for feat in ranked
        if feat["separation"] >= MATCH_PROFILE_MIN_SEPARATION
    ]

    candidates: list[str] = []

    # 1) Best features of each timeframe.
    for minutes in GENERAL_ANALYSIS_INTERVALS:
        prefix = interval_tf_prefix(minutes)
        tf_pool = sorted(
            (feat for feat in eligible if feat["tf"] == prefix),
            key=_priority,
            reverse=True,
        )
        candidates.extend(feat["col"] for feat in tf_pool[:MATCH_PROFILE_TOP_PER_TF])

    # 2) Best features regardless of timeframe.
    overall = sorted(eligible, key=_priority, reverse=True)
    candidates.extend(feat["col"] for feat in overall[:MATCH_PROFILE_TOP_GLOBAL])

    # 3) MTF alignment columns that are always included.
    candidates.extend(
        (
            "ga_align_timing_buy_score",
            "ga_align_timing_sell_score",
            "ga_align_trend_score",
            "ga_align_rsi_oversold_tf",
            "ga_align_rsi_overbought_tf",
            "ga_align_mtf_conflict",
        )
    )

    # dict preserves insertion order, so this drops repeats but keeps ordering.
    return list(dict.fromkeys(candidates))
def load_selected_features(
profile_path: Path | None = None,
) -> tuple[list[str], list[str]]:
"""
저장된 프로필 JSON에서 buy/sell 피처 목록 로드.
Args:
profile_path: gt_mtf_profile.json.
Returns:
(buy_features, sell_features). 없으면 빈 리스트.
"""
path = profile_path or ANALYSIS_GT_MTF_PROFILE_JSON
if not path.is_file():
return [], []
data = json.loads(path.read_text(encoding="utf-8"))
sel = data.get("selected_features") or {}
return list(sel.get("buy") or []), list(sel.get("sell") or [])
def run_gt_mtf_profile(
    trades_csv: Path | None = None,
    *,
    write_json: bool = True,
    write_html: bool = True,
) -> dict[str, Any]:
    """
    Analyse the 03b CSV and optionally persist the result as JSON and HTML.

    Args:
        trades_csv: Input CSV; falls back to ``ANALYSIS_TRADES_CSV``.
        write_json: Write the result to ``ANALYSIS_GT_MTF_PROFILE_JSON``.
        write_html: Write the summary page to ``ANALYSIS_GT_MTF_PROFILE_HTML``.

    Returns:
        The dict produced by ``analyze_gt_mtf_profile``.

    Raises:
        FileNotFoundError: If the input CSV does not exist.
    """
    source = trades_csv or ANALYSIS_TRADES_CSV
    if not source.is_file():
        raise FileNotFoundError(f"03b CSV 없음: {source}")

    result = analyze_gt_mtf_profile(pd.read_csv(source))

    selected = result["selected_features"]
    buy_n = len(selected["buy"])
    sell_n = len(selected["sell"])
    summary = (
        f"[03c] GT MTF 프로필: 분석 {result['columns_analyzed']}"
        f"→ 매수 피처 {buy_n}, 매도 피처 {sell_n}"
    )
    print(summary)

    if write_json:
        json_target = ANALYSIS_GT_MTF_PROFILE_JSON
        json_target.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(result, ensure_ascii=False, indent=2)
        json_target.write_text(serialized, encoding="utf-8")
        print(f"[03c] 저장: {ANALYSIS_GT_MTF_PROFILE_JSON}")

    if write_html:
        write_gt_mtf_profile_html(result, ANALYSIS_GT_MTF_PROFILE_HTML)
        print(f"[03c] 저장: {ANALYSIS_GT_MTF_PROFILE_HTML}")

    return result
def write_gt_mtf_profile_html(
    analysis: dict[str, Any],
    html_path: Path,
) -> Path:
    """
    Write an HTML summary of per-timeframe and global separation scores.

    All values taken from ``analysis`` are HTML-escaped before being placed
    in the page, and missing (``None``) medians are rendered as empty cells.

    Args:
        analysis: Result of ``analyze_gt_mtf_profile``. Must contain
            ``buy_gt_count``, ``sell_gt_count``, ``columns_analyzed`` and
            ``selected_features`` (with ``buy`` and ``sell`` lists);
            ``by_interval`` and ``global_top_separation`` are optional.
        html_path: Output file; parent directories are created as needed.

    Returns:
        ``html_path``.
    """
    # Function-scope import: the module's import block does not provide it.
    from html import escape

    html_path.parent.mkdir(parents=True, exist_ok=True)

    def _cell(value: object) -> str:
        # None means "no data"; show an empty cell rather than the text "None".
        return "" if value is None else escape(str(value))

    def _rows_interval() -> str:
        rows: list[str] = []
        for pfx, block in (analysis.get("by_interval") or {}).items():
            top = block.get("top_separation") or []
            # Drop the timeframe prefix and cap the name at 20 characters.
            top_s = ", ".join(
                f"{_cell(t['col'].split('_', 1)[-1][:20])}({t['separation']:.2f})"
                for t in top[:5]
            ) or "-"
            rows.append(
                f"<tr><td>{_cell(pfx)}</td><td>{_cell(block.get('feature_count', 0))}</td>"
                f"<td>{top_s}</td></tr>"
            )
        return "".join(rows)

    def _rows_global() -> str:
        rows: list[str] = []
        for item in analysis.get("global_top_separation") or []:
            rows.append(
                f"<tr><td>{_cell(item['col'])}</td><td>{_cell(item['tf'])}</td>"
                f"<td>{_cell(item['family'])}</td><td>{item['separation']:.3f}</td>"
                f"<td>{_cell(item.get('buy_median'))}</td>"
                f"<td>{_cell(item.get('sell_median'))}</td></tr>"
            )
        return "".join(rows)

    selected = analysis["selected_features"]
    buy_feats = ", ".join(_cell(name) for name in selected["buy"][:25])
    sell_feats = ", ".join(_cell(name) for name in selected["sell"][:25])
    buy_count = _cell(analysis["buy_gt_count"])
    sell_count = _cell(analysis["sell_gt_count"])
    column_count = _cell(analysis["columns_analyzed"])

    document = f"""<!DOCTYPE html>
<html lang="ko"><head><meta charset="utf-8"/>
<title>GT MTF 프로필 (3분~일봉)</title>
<style>
body {{ font-family: "Malgun Gothic", Arial, sans-serif; margin: 24px; background: #f5f5f5; color: #1e293b; }}
h1, h2 {{ color: #0f172a; }}
table {{ border-collapse: collapse; width: 100%; background: #fff; margin-bottom: 20px; font-size: 0.85rem; }}
th, td {{ border: 1px solid #e2e8f0; padding: 8px; text-align: left; }}
th {{ background: #e2e8f0; }}
p.note {{ font-size: 0.9rem; color: #475569; }}
code {{ font-size: 0.8rem; word-break: break-all; }}
</style></head><body>
<h1>Ground Truth MTF 타점 프로필</h1>
<p>매수 GT {buy_count}건 · 매도 GT {sell_count}건 ·
분석 컬럼 {column_count}개 (3,5,10,15,30,60,240,1440분 + MTF 합성)</p>
<p class="note">분리도 = |mean_buy - mean_sell| / pooled_std. TF별·글로벌 상위 피처로 04 규칙 후보를 생성합니다.</p>
<h2>간격별 분리도 상위 (요약)</h2>
<table><thead><tr><th>TF</th><th>숫자 피처 수</th><th>상위 5 (분리도)</th></tr></thead>
<tbody>{_rows_interval()}</tbody></table>
<h2>글로벌 분리도 Top 40</h2>
<table><thead><tr><th>컬럼</th><th>TF</th><th>기법군</th><th>분리도</th><th>매수 median</th><th>매도 median</th></tr></thead>
<tbody>{_rows_global()}</tbody></table>
<h2>04 규칙 선별용 피처 (발췌)</h2>
<p><strong>매수</strong><br/><code>{buy_feats}</code></p>
<p><strong>매도</strong><br/><code>{sell_feats}</code></p>
</body></html>"""
    html_path.write_text(document, encoding="utf-8")
    return html_path