Files
stt/app/diarize.py
dosangyoon 2e503d1a56 Web STT: speaker diarization via pyannote; whisper_stt snapshot validation
- Add app/diarize.py: merge faster-whisper segments with pyannote (A/B/C)
- Wire /api/jobs and /api/transcribe; job API returns speaker_diarization, diarize_skip_reason
- UI: meta line shows diarization applied/skipped; hint for models path
- requirements.txt: pyannote.audio; README APP_DIARIZE / APP_PYANNOTE_MODEL_DIR
- whisper_stt.py: validate config.yaml before loading pipeline
- requirements-whisper-stt.txt: minor doc updates if any

Made-with: Cursor
2026-03-23 13:09:31 +09:00

187 lines
6.0 KiB
Python

"""
업로드 STT 결과에 pyannote 화자 구분을 합칩니다 (whisper_stt.py 와 동일한 규칙).
환경변수 APP_DIARIZE=0 이면 비활성화. 모델: APP_PYANNOTE_MODEL_DIR 또는 프로젝트 models/pyannote-diarization-3.1
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Any
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Directory that holds this file, and its parent directory. The parent is the
# base against which relative model paths are resolved.
_APP_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _APP_DIR.parent
# Snapshot directory used when APP_PYANNOTE_MODEL_DIR is not set.
_DEFAULT_SNAPSHOT = _PROJECT_ROOT / "models" / "pyannote-diarization-3.1"
# Korean notice placed in front of the diarized transcript text. It says that
# the speaker letters A, B, C... are not real names but markers assigned in the
# order speech was first detected, and that labels may change or mix when one
# person is split across several stretches, so important meetings need review.
_DISCLAIMER = (
"※ 화자 A, B, C… 는 실제 이름이 아니라, 이 녹음에서 말이 처음 잡힌 순서로 붙인 구분자입니다.\n"
"※ 같은 사람이 여러 구간으로 나뉘면 라벨이 바뀌거나 섞일 수 있으니, 중요한 회의는 검수가 필요합니다.\n\n"
)
def _env_disabled() -> bool:
    """Report whether APP_DIARIZE switches diarization off.

    The variable defaults to "1" (enabled). After trimming whitespace and
    lower-casing, the values "0", "false", "no" and "off" count as disabled.
    """
    flag = os.getenv("APP_DIARIZE", "1")
    normalized = flag.strip().lower()
    return normalized in {"0", "false", "no", "off"}
def resolve_snapshot_dir() -> Path | None:
    """Locate the pyannote snapshot directory, or return None if unusable.

    APP_PYANNOTE_MODEL_DIR takes precedence when it is set to a non-blank
    value: "~" is expanded and a relative path is resolved against the
    project root. Otherwise the default snapshot location is used. The
    directory is accepted only if it contains a ``config.yaml`` file.
    """
    override = os.getenv("APP_PYANNOTE_MODEL_DIR", "").strip()
    if not override:
        candidate = _DEFAULT_SNAPSHOT.resolve()
    else:
        candidate = Path(override).expanduser()
        if not candidate.is_absolute():
            candidate = (_PROJECT_ROOT / candidate).resolve()
    has_config = (candidate / "config.yaml").is_file()
    return candidate if has_config else None
def _overlap_sec(a0: float, a1: float, b0: float, b1: float) -> float:
    """Return how long the intervals [a0, a1] and [b0, b1] overlap.

    Disjoint or merely touching intervals yield 0.0.
    """
    latest_start = max(a0, b0)
    earliest_end = min(a1, b1)
    shared = earliest_end - latest_start
    return shared if shared > 0.0 else 0.0
def _assign_speaker(
    seg_start: float, seg_end: float, turns: list[tuple[float, float, str]]
) -> str | None:
    """Pick the speaker whose turn overlaps the segment the most.

    Args:
        seg_start: Segment start time.
        seg_end: Segment end time.
        turns: ``(start, end, speaker)`` tuples.

    Returns:
        The speaker with the largest overlap (the earliest listed turn wins
        a tie), or None when there are no turns or the best overlap is
        below 0.05.
    """
    scored = [
        (_overlap_sec(seg_start, seg_end, turn_start, turn_end), speaker)
        for turn_start, turn_end, speaker in turns
    ]
    if not scored:
        return None
    # max() keeps the first of equally large overlaps.
    widest, winner = max(scored, key=lambda pair: pair[0])
    # Overlaps shorter than 0.05 are treated as no match.
    return winner if widest >= 0.05 else None
def _speaker_label_order(turns: list[tuple[float, float, str]]) -> dict[str, str]:
    """Map each speaker id to a display label in order of first appearance.

    Turns are ordered by start time. The first 26 distinct speakers get
    "A".."Z"; any further speaker gets "SP<n>", where n is its 1-based
    position (so the 27th speaker is "SP27").
    """
    chronological = sorted(turns, key=lambda turn: turn[0])
    # dict.fromkeys de-duplicates while keeping first-seen order.
    first_seen = list(dict.fromkeys(speaker for _, _, speaker in chronological))
    labels: dict[str, str] = {}
    for index, speaker in enumerate(first_seen):
        if index < 26:
            labels[speaker] = chr(ord("A") + index)
        else:
            labels[speaker] = f"SP{index + 1}"
    return labels
def _merge_segments(
    whisper_segments: list[dict[str, Any]],
    turns: list[tuple[float, float, str]],
) -> tuple[str, list[dict[str, Any]]]:
    """Label transcript segments by speaker and fuse same-speaker runs.

    Segments whose text is empty after stripping are skipped. Each remaining
    segment gets the label of its best-overlapping turn, or "?" when none
    qualifies. Consecutive segments with the same label are combined into one
    entry spanning from the first start to the last end, with their texts
    joined by single spaces.

    Returns:
        ``(body, segments)``: ``body`` has one "<label>: <text>" line per
        combined entry; ``segments`` is a list of dicts with the keys
        ``start``, ``end``, ``speaker`` and ``text``.
    """
    label_of = _speaker_label_order(turns)
    groups: list[dict[str, Any]] = []
    pieces: list[list[str]] = []  # text fragments, parallel to ``groups``

    for seg in whisper_segments:
        text = (seg.get("text") or "").strip()
        if not text:
            continue
        start = float(seg["start"])
        end = float(seg["end"])
        raw_speaker = _assign_speaker(start, end, turns)
        tag = "?" if raw_speaker is None else label_of.get(raw_speaker, "?")

        if groups and groups[-1]["speaker"] == tag:
            # Same speaker keeps talking: extend the open group.
            pieces[-1].append(text)
            groups[-1]["end"] = end
        else:
            groups.append({"start": start, "end": end, "speaker": tag, "text": ""})
            pieces.append([text])

    for group, fragments in zip(groups, pieces):
        group["text"] = " ".join(fragments).strip()

    body = "\n".join(f"{group['speaker']}: {group['text']}" for group in groups).strip()
    return body, groups
def _run_pyannote(audio_path: str, model_dir: Path) -> list[tuple[float, float, str]]:
    """Run the pyannote diarization pipeline on an audio file.

    Args:
        audio_path: Path of the audio file handed to the pipeline.
        model_dir: Local pipeline snapshot passed to ``Pipeline.from_pretrained``.

    Returns:
        ``(start, end, speaker_label)`` tuples sorted by start time.

    Raises:
        RuntimeError: If the pipeline could not be loaded.
        Exception: Anything raised by torch or pyannote while loading or
            running the pipeline propagates to the caller.
    """
    # Imported lazily so this module can be imported without the heavy
    # optional dependencies installed.
    import torch
    from pyannote.audio import Pipeline

    pipeline = Pipeline.from_pretrained(str(model_dir))
    if pipeline is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(f"pyannote pipeline could not be loaded from {model_dir}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)
    result = pipeline(audio_path)

    # NOTE(review): pyannote.audio 4.x appears to return an output object that
    # carries the annotation in `speaker_diarization`, while 3.x returns the
    # annotation itself. Accept both; confirm against the installed version.
    annotation = getattr(result, "speaker_diarization", result)

    turns = [
        (float(segment.start), float(segment.end), str(label))
        for segment, _, label in annotation.itertracks(yield_label=True)
    ]
    turns.sort(key=lambda turn: turn[0])
    return turns
def apply_speaker_diarization(result: dict[str, Any], audio_path: str) -> dict[str, Any]:
    """Add speaker labels to a transcription result.

    Works on a shallow copy of ``result``; the input dict is not modified.
    On success the copy's ``text`` becomes the disclaimer followed by
    "A: ..." / "B: ..." lines, ``segments`` is replaced by the merged
    per-speaker segments, and ``speaker_diarization`` is set to True.

    When diarization is disabled, unavailable, or fails, the transcription is
    left as it was and ``diarize_skip_reason`` explains why.

    Args:
        result: Transcription result; its ``segments`` entries are expected
            to carry ``start``, ``end`` and ``text``.
        audio_path: Path of the audio file to diarize.

    Returns:
        The updated copy of ``result``.
    """
    out = dict(result)
    out.setdefault("speaker_diarization", False)
    # Drop any stale reason carried over from an earlier run.
    out.pop("diarize_skip_reason", None)

    if _env_disabled():
        out["diarize_skip_reason"] = "APP_DIARIZE=0"
        return out

    snap = resolve_snapshot_dir()
    if snap is None:
        # Report the location that was actually probed: the configured
        # override when one is set, otherwise the default snapshot path.
        configured = os.getenv("APP_PYANNOTE_MODEL_DIR", "").strip()
        probed = configured or _DEFAULT_SNAPSHOT
        out["diarize_skip_reason"] = f"pyannote 스냅샷 없음(config.yaml): {probed}"
        log.warning("Speaker diarization skipped: %s", out["diarize_skip_reason"])
        return out

    try:
        import pyannote.audio  # noqa: F401
    except ImportError:
        out["diarize_skip_reason"] = "pyannote.audio 미설치"
        log.warning("Speaker diarization skipped: pyannote not installed")
        return out

    segs = list(out.get("segments") or [])
    if not segs:
        out["diarize_skip_reason"] = "세그먼트 없음"
        return out

    try:
        turns = _run_pyannote(audio_path, snap)
        body, new_segs = _merge_segments(segs, turns)
        # Nothing in ``out`` is touched until both steps have succeeded, so a
        # failure leaves the original transcription intact.
        out["text"] = (_DISCLAIMER + body) if body else out.get("text", "")
        out["segments"] = new_segs
        out["speaker_diarization"] = True
        out.pop("diarize_skip_reason", None)
    except Exception as e:
        # Best-effort boundary: diarization must never break transcription.
        # Some exceptions have an empty message; fall back to the type name
        # so the reason is never blank.
        out["diarize_skip_reason"] = str(e) or type(e).__name__
        log.exception("Speaker diarization failed")
    return out