""" 업로드 STT 결과에 pyannote 화자 구분을 합칩니다 (whisper_stt.py 와 동일한 규칙). 환경변수 APP_DIARIZE=0 이면 비활성화. 모델: APP_PYANNOTE_MODEL_DIR 또는 프로젝트 models/pyannote-diarization-3.1 """ from __future__ import annotations import logging import os from pathlib import Path from typing import Any log = logging.getLogger(__name__) _APP_DIR = Path(__file__).resolve().parent _PROJECT_ROOT = _APP_DIR.parent _DEFAULT_SNAPSHOT = _PROJECT_ROOT / "models" / "pyannote-diarization-3.1" _DISCLAIMER = ( "※ 화자 A, B, C… 는 실제 이름이 아니라, 이 녹음에서 말이 처음 잡힌 순서로 붙인 구분자입니다.\n" "※ 같은 사람이 여러 구간으로 나뉘면 라벨이 바뀌거나 섞일 수 있으니, 중요한 회의는 검수가 필요합니다.\n\n" ) def _env_disabled() -> bool: v = os.getenv("APP_DIARIZE", "1").strip().lower() return v in ("0", "false", "no", "off") def resolve_snapshot_dir() -> Path | None: raw = os.getenv("APP_PYANNOTE_MODEL_DIR", "").strip() if raw: p = Path(raw).expanduser() if not p.is_absolute(): p = (_PROJECT_ROOT / p).resolve() else: p = _DEFAULT_SNAPSHOT.resolve() if (p / "config.yaml").is_file(): return p return None def _overlap_sec(a0: float, a1: float, b0: float, b1: float) -> float: return max(0.0, min(a1, b1) - max(a0, b0)) def _assign_speaker( seg_start: float, seg_end: float, turns: list[tuple[float, float, str]] ) -> str | None: best: str | None = None best_ov = 0.0 for t0, t1, sp in turns: ov = _overlap_sec(seg_start, seg_end, t0, t1) if ov > best_ov: best_ov = ov best = sp if best is None or best_ov < 0.05: return None return best def _speaker_label_order(turns: list[tuple[float, float, str]]) -> dict[str, str]: order: list[str] = [] for t0, _, sp in sorted(turns, key=lambda x: x[0]): if sp not in order: order.append(sp) def letter(i: int) -> str: if i < 26: return chr(ord("A") + i) return f"SP{i + 1}" return {sp: letter(i) for i, sp in enumerate(order)} def _merge_segments( whisper_segments: list[dict[str, Any]], turns: list[tuple[float, float, str]], ) -> tuple[str, list[dict[str, Any]]]: labels = _speaker_label_order(turns) merged_lines: list[str] = [] out_segments: list[dict[str, Any]] = [] current_letter: str | None = None current_parts: list[str] = [] current_start: float | None = None current_end: float | None = None def flush() -> None: nonlocal current_letter, current_parts, current_start, current_end if current_letter is not None and current_parts and current_start is not None and current_end is not None: line = " ".join(current_parts).strip() merged_lines.append(f"{current_letter}: {line}") out_segments.append( { "start": current_start, "end": current_end, "speaker": current_letter, "text": line, } ) current_letter = None current_parts = [] current_start = None current_end = None for seg in whisper_segments: text = (seg.get("text") or "").strip() if not text: continue start = float(seg["start"]) end = float(seg["end"]) sp = _assign_speaker(start, end, turns) letter = labels.get(sp, "?") if sp is not None else "?" if letter == current_letter: current_parts.append(text) current_end = end else: flush() current_letter = letter current_parts = [text] current_start = start current_end = end flush() body = "\n".join(merged_lines).strip() return body, out_segments def _run_pyannote(audio_path: str, model_dir: Path) -> list[tuple[float, float, str]]: import torch from pyannote.audio import Pipeline pipeline = Pipeline.from_pretrained(str(model_dir)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") pipeline.to(device) diarization = pipeline(audio_path) turns: list[tuple[float, float, str]] = [] for segment, _, label in diarization.itertracks(yield_label=True): turns.append((float(segment.start), float(segment.end), str(label))) turns.sort(key=lambda x: x[0]) return turns def apply_speaker_diarization(result: dict[str, Any], audio_path: str) -> dict[str, Any]: """ transcribe_file 결과에 speaker 필드·A:/B: 본문을 반영. 실패·비활성 시 원본 유지 및 메타만 추가. """ out = dict(result) out.setdefault("speaker_diarization", False) out.pop("diarize_skip_reason", None) if _env_disabled(): out["diarize_skip_reason"] = "APP_DIARIZE=0" return out snap = resolve_snapshot_dir() if snap is None: out["diarize_skip_reason"] = f"pyannote 스냅샷 없음(config.yaml): {_DEFAULT_SNAPSHOT}" log.warning("Speaker diarization skipped: %s", out["diarize_skip_reason"]) return out try: import pyannote.audio # noqa: F401 except ImportError: out["diarize_skip_reason"] = "pyannote.audio 미설치" log.warning("Speaker diarization skipped: pyannote not installed") return out segs = list(out.get("segments") or []) if not segs: out["diarize_skip_reason"] = "세그먼트 없음" return out try: turns = _run_pyannote(audio_path, snap) body, new_segs = _merge_segments(segs, turns) out["text"] = _DISCLAIMER + body if body else out.get("text", "") out["segments"] = new_segs out["speaker_diarization"] = True out.pop("diarize_skip_reason", None) except Exception as e: out["diarize_skip_reason"] = str(e) log.exception("Speaker diarization failed") return out