Rollback web speaker diarization; HF token helper for whisper_stt
- Remove app/diarize.py and pyannote from requirements.txt; web uses faster-whisper only
- Revert main.py job/transcribe flow and index.html meta/hints
- Add app/pyannote_auth.py for Pipeline.from_pretrained(..., token=...) used by whisper_stt
- Expand whisper_stt / README / requirements-whisper-stt for gated repos (community-1, 403)

Made-with: Cursor
This commit is contained in:
186
app/diarize.py
186
app/diarize.py
@@ -1,186 +0,0 @@
|
||||
"""
|
||||
업로드 STT 결과에 pyannote 화자 구분을 합칩니다 (whisper_stt.py 와 동일한 규칙).
|
||||
환경변수 APP_DIARIZE=0 이면 비활성화. 모델: APP_PYANNOTE_MODEL_DIR 또는 프로젝트 models/pyannote-diarization-3.1
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Filesystem anchors: the directory holding this file and its parent.
_APP_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _APP_DIR.parent
# Snapshot location used when APP_PYANNOTE_MODEL_DIR is not set.
_DEFAULT_SNAPSHOT = _PROJECT_ROOT.joinpath("models", "pyannote-diarization-3.1")

# User-facing notice (Korean) prepended to diarized transcripts. It explains
# that the A, B, C… labels are not real names but markers assigned in order of
# first appearance, and that labels may drift or mix when one person is split
# across several spans, so important meetings need manual review.
_DISCLAIMER = "".join(
    (
        "※ 화자 A, B, C… 는 실제 이름이 아니라, 이 녹음에서 말이 처음 잡힌 순서로 붙인 구분자입니다.\n",
        "※ 같은 사람이 여러 구간으로 나뉘면 라벨이 바뀌거나 섞일 수 있으니, 중요한 회의는 검수가 필요합니다.\n\n",
    )
)
|
||||
|
||||
|
||||
def _env_disabled() -> bool:
    """Return True when the APP_DIARIZE environment variable switches diarization off.

    The variable defaults to "1". After trimming whitespace and lower-casing,
    the values "0", "false", "no" and "off" count as disabled; anything else
    leaves diarization enabled.
    """
    flag = os.getenv("APP_DIARIZE", "1")
    return flag.strip().lower() in {"0", "false", "no", "off"}
|
||||
|
||||
|
||||
def resolve_snapshot_dir() -> Path | None:
    """Locate the pyannote snapshot directory, or return None if it is unusable.

    APP_PYANNOTE_MODEL_DIR (whitespace-trimmed) takes precedence; ``~`` is
    expanded and a relative value is resolved against the project root.
    Without it, the default snapshot directory is used. A directory only
    qualifies when it contains a ``config.yaml`` file.
    """
    configured = os.getenv("APP_PYANNOTE_MODEL_DIR", "").strip()
    if not configured:
        candidate = _DEFAULT_SNAPSHOT.resolve()
    else:
        candidate = Path(configured).expanduser()
        if not candidate.is_absolute():
            candidate = (_PROJECT_ROOT / candidate).resolve()

    has_config = (candidate / "config.yaml").is_file()
    return candidate if has_config else None
|
||||
|
||||
|
||||
def _overlap_sec(a0: float, a1: float, b0: float, b1: float) -> float:
    """Return how long the intervals [a0, a1] and [b0, b1] overlap.

    The result is clamped at 0.0, so disjoint or merely touching intervals
    yield 0.0.
    """
    latest_start = max(a0, b0)
    earliest_end = min(a1, b1)
    return max(0.0, earliest_end - latest_start)
|
||||
|
||||
|
||||
def _assign_speaker(
    seg_start: float, seg_end: float, turns: list[tuple[float, float, str]]
) -> str | None:
    """Pick the speaker whose turn overlaps the segment the most.

    Args:
        seg_start: Segment start time.
        seg_end: Segment end time.
        turns: ``(start, end, speaker)`` tuples to match against.

    Returns:
        The speaker of the turn with the largest overlap (the earliest such
        turn wins ties), or None when no turn overlaps by at least 0.05.
    """
    winner: str | None = None
    widest = 0.0
    for turn_start, turn_end, speaker in turns:
        shared = _overlap_sec(seg_start, seg_end, turn_start, turn_end)
        # Strictly greater: on a tie the first turn seen keeps the segment.
        if shared > widest:
            widest, winner = shared, speaker

    # Overlaps shorter than 0.05 are treated as no match at all.
    if winner is not None and not widest < 0.05:
        return winner
    return None
|
||||
|
||||
|
||||
def _speaker_label_order(turns: list[tuple[float, float, str]]) -> dict[str, str]:
    """Map each raw speaker id to a display label in order of first appearance.

    Turns are ordered by start time; the first 26 distinct speakers become
    "A".."Z", and any further speaker gets "SP<n>" where n is its 1-based
    position (so the 27th is "SP27").
    """
    chronological = sorted(turns, key=lambda turn: turn[0])
    # dict.fromkeys keeps first-seen order and drops repeated speakers.
    first_seen = dict.fromkeys(raw for _, _, raw in chronological)

    mapping: dict[str, str] = {}
    for idx, raw in enumerate(first_seen):
        mapping[raw] = chr(ord("A") + idx) if idx < 26 else f"SP{idx + 1}"
    return mapping
|
||||
|
||||
|
||||
def _merge_segments(
    whisper_segments: list[dict[str, Any]],
    turns: list[tuple[float, float, str]],
) -> tuple[str, list[dict[str, Any]]]:
    """Attach speaker labels to transcript segments and merge same-speaker runs.

    Segments whose text is empty after stripping are ignored. Every other
    segment is labelled with the best-overlapping speaker ("?" when none
    matches); consecutive segments with the same label are joined with spaces
    into one entry.

    Returns:
        A ``(body, segments)`` pair: ``body`` holds one "<label>: <text>" line
        per merged run, and ``segments`` holds the matching dicts with
        ``start``, ``end``, ``speaker`` and ``text`` keys.
    """
    labels = _speaker_label_order(turns)

    # Each run is [label, start, end, texts]; a new run starts on a label change.
    runs: list[list[Any]] = []
    for seg in whisper_segments:
        text = (seg.get("text") or "").strip()
        if not text:
            continue
        start = float(seg["start"])
        end = float(seg["end"])
        raw_speaker = _assign_speaker(start, end, turns)
        label = "?" if raw_speaker is None else labels.get(raw_speaker, "?")

        if runs and runs[-1][0] == label:
            # Same speaker keeps talking: extend the open run.
            runs[-1][2] = end
            runs[-1][3].append(text)
        else:
            runs.append([label, start, end, [text]])

    merged_lines: list[str] = []
    out_segments: list[dict[str, Any]] = []
    for label, run_start, run_end, texts in runs:
        line = " ".join(texts).strip()
        merged_lines.append(f"{label}: {line}")
        out_segments.append(
            {
                "start": run_start,
                "end": run_end,
                "speaker": label,
                "text": line,
            }
        )

    return "\n".join(merged_lines).strip(), out_segments
|
||||
|
||||
|
||||
def _run_pyannote(audio_path: str, model_dir: Path) -> list[tuple[float, float, str]]:
    """Run the pyannote pipeline on an audio file and return its speaker turns.

    The pipeline is loaded from ``model_dir`` and moved to CUDA when torch
    reports it as available, otherwise to the CPU.

    Returns:
        ``(start, end, label)`` tuples sorted by start time.
    """
    # Imported lazily so the module can be imported without these packages.
    import torch
    from pyannote.audio import Pipeline

    pipeline = Pipeline.from_pretrained(str(model_dir))
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    pipeline.to(torch.device(device_name))

    annotation = pipeline(audio_path)
    spans = [
        (float(span.start), float(span.end), str(speaker))
        for span, _, speaker in annotation.itertracks(yield_label=True)
    ]
    return sorted(spans, key=lambda item: item[0])
|
||||
|
||||
|
||||
def apply_speaker_diarization(result: dict[str, Any], audio_path: str) -> dict[str, Any]:
    """Merge speaker labels and "A:/B:" style body text into a transcription result.

    Works on a shallow copy of ``result``. On success the copy gets
    ``speaker_diarization=True``, speaker-merged ``segments`` and, when the
    merged body is non-empty, a ``text`` made of the disclaimer plus that body.
    When diarization is disabled, unavailable or fails, the transcript is left
    as it was and only ``diarize_skip_reason`` metadata is added.

    Args:
        result: Transcription result; its ``segments`` and ``text`` keys are read.
        audio_path: Path of the audio file handed to pyannote.

    Returns:
        The updated copy of ``result``. Failures inside diarization are logged
        and reported via ``diarize_skip_reason`` rather than raised.
    """
    out = dict(result)
    out.setdefault("speaker_diarization", False)
    # Drop any stale reason carried over from the input.
    out.pop("diarize_skip_reason", None)

    if _env_disabled():
        out["diarize_skip_reason"] = "APP_DIARIZE=0"
        return out

    snap = resolve_snapshot_dir()
    if snap is None:
        # Name the location that was actually checked: the configured
        # APP_PYANNOTE_MODEL_DIR when set, otherwise the default snapshot.
        checked = os.getenv("APP_PYANNOTE_MODEL_DIR", "").strip() or _DEFAULT_SNAPSHOT
        out["diarize_skip_reason"] = f"pyannote 스냅샷 없음(config.yaml): {checked}"
        log.warning("Speaker diarization skipped: %s", out["diarize_skip_reason"])
        return out

    try:
        import pyannote.audio  # noqa: F401
    except ImportError:
        out["diarize_skip_reason"] = "pyannote.audio 미설치"
        log.warning("Speaker diarization skipped: pyannote not installed")
        return out

    segs = list(out.get("segments") or [])
    if not segs:
        out["diarize_skip_reason"] = "세그먼트 없음"
        return out

    # Best-effort boundary: any diarization failure is logged and recorded as
    # the skip reason instead of propagating to the caller.
    try:
        turns = _run_pyannote(audio_path, snap)
        body, new_segs = _merge_segments(segs, turns)
        out["text"] = _DISCLAIMER + body if body else out.get("text", "")
        out["segments"] = new_segs
        out["speaker_diarization"] = True
        out.pop("diarize_skip_reason", None)
    except Exception as e:
        out["diarize_skip_reason"] = str(e)
        log.exception("Speaker diarization failed")
    return out
|
||||
20
app/main.py
20
app/main.py
@@ -18,7 +18,6 @@ from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
from . import db
|
||||
from .diarize import apply_speaker_diarization
|
||||
from .stt import transcribe_file, transcribe_iter
|
||||
|
||||
|
||||
@@ -75,8 +74,6 @@ class _Job:
|
||||
created_at: float = dataclasses.field(default_factory=time.time)
|
||||
updated_at: float = dataclasses.field(default_factory=time.time)
|
||||
cancel_event: threading.Event = dataclasses.field(default_factory=threading.Event, repr=False)
|
||||
speaker_diarization: bool = False
|
||||
diarize_skip_reason: str | None = None
|
||||
|
||||
|
||||
_JOBS: dict[str, _Job] = {}
|
||||
@@ -114,8 +111,6 @@ def _job_public(job: _Job) -> dict[str, Any]:
|
||||
"language_probability": job.language_probability,
|
||||
"duration_sec": job.duration_sec,
|
||||
"error": job.error,
|
||||
"speaker_diarization": job.speaker_diarization,
|
||||
"diarize_skip_reason": job.diarize_skip_reason,
|
||||
"created_at": job.created_at,
|
||||
"updated_at": job.updated_at,
|
||||
}
|
||||
@@ -209,7 +204,6 @@ async def api_transcribe(
|
||||
vad_filter=bool(vad_filter),
|
||||
beam_size=int(beam_size),
|
||||
)
|
||||
result = apply_speaker_diarization(result, saved_path)
|
||||
# 단발성 API도 DB 저장
|
||||
try:
|
||||
db.insert_record(
|
||||
@@ -434,20 +428,6 @@ def _run_job(job_id: str) -> None:
|
||||
if cancelled or job.cancel_event.is_set():
|
||||
job.status = "cancelled"
|
||||
else:
|
||||
merged = apply_speaker_diarization(
|
||||
{
|
||||
"text": job.text,
|
||||
"segments": list(job.segments),
|
||||
"detected_language": job.detected_language,
|
||||
"language_probability": job.language_probability,
|
||||
"duration_sec": job.duration_sec,
|
||||
},
|
||||
tmp_path,
|
||||
)
|
||||
job.text = merged.get("text", job.text)
|
||||
job.segments = merged.get("segments", job.segments)
|
||||
job.speaker_diarization = bool(merged.get("speaker_diarization"))
|
||||
job.diarize_skip_reason = merged.get("diarize_skip_reason")
|
||||
job.status = "completed"
|
||||
job.progress = 1.0
|
||||
job.updated_at = time.time()
|
||||
|
||||
26
app/pyannote_auth.py
Normal file
26
app/pyannote_auth.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""pyannote 파이프라인은 config가 허브의 gated 하위 모델(segmentation·embedding 등)을 가리킬 수 있어 토큰이 필요하다."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def hf_token_for_pyannote() -> str | bool:
    """Return an explicit Hugging Face token, or True when none is configured.

    HF_TOKEN, HUGGING_FACE_HUB_TOKEN and PYANNOTE_AUTH_TOKEN are checked in
    that order; the first value that is non-empty after stripping wins. The
    True fallback is, per the original author's note, meant to select the
    huggingface-cli login cache.
    """
    env_names = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "PYANNOTE_AUTH_TOKEN")
    stripped = (os.environ.get(name, "").strip() for name in env_names)
    return next((value for value in stripped if value), True)
|
||||
|
||||
|
||||
def load_pyannote_pipeline(model_dir: str | Path) -> Any:
    """Load a pyannote ``Pipeline`` from ``model_dir`` with Hugging Face credentials.

    The credential comes from ``hf_token_for_pyannote()`` and is first passed
    as ``token=``; if that call raises TypeError, the load is retried with
    ``use_auth_token=`` instead.
    """
    # Imported lazily so importing this module does not require pyannote.
    from pyannote.audio import Pipeline

    credential = hf_token_for_pyannote()
    location = str(model_dir)
    try:
        pipeline = Pipeline.from_pretrained(location, token=credential)
    except TypeError:
        # Presumably for pyannote releases that only know the older keyword
        # name — TODO confirm against the supported versions.
        pipeline = Pipeline.from_pretrained(location, use_auth_token=credential)
    return pipeline
|
||||
@@ -314,8 +314,7 @@
|
||||
|
||||
<div class="hint">
|
||||
- 허용: mp3, m4a, wav, mp4, aac, ogg, flac, webm<br />
|
||||
- 첫 실행 시 Whisper 모델 다운로드로 시간이 걸릴 수 있습니다.<br />
|
||||
- 완료 후 pyannote로 화자 구분을 시도합니다 (<code>models/pyannote-diarization-3.1</code> 필요).
|
||||
- 첫 실행 시 Whisper 모델 다운로드로 시간이 걸릴 수 있습니다.
|
||||
</div>
|
||||
|
||||
<div class="progress">
|
||||
@@ -614,10 +613,7 @@
|
||||
const lang = body.detected_language ? `${body.detected_language}` : "-";
|
||||
const prob = typeof body.language_probability === "number" ? body.language_probability.toFixed(3) : "-";
|
||||
const dur = typeof body.duration_sec === "number" ? `${body.duration_sec.toFixed(1)}s` : "-";
|
||||
let diarizeMeta = "";
|
||||
if (body.speaker_diarization === true) diarizeMeta = " · 화자 구분: 적용";
|
||||
else if (body.diarize_skip_reason) diarizeMeta = " · 화자 구분: 생략";
|
||||
metaEl.textContent = `감지 언어: ${lang} (p=${prob}), 오디오 길이: ${dur}${diarizeMeta}`;
|
||||
metaEl.textContent = `감지 언어: ${lang} (p=${prob}), 오디오 길이: ${dur}`;
|
||||
|
||||
if (startedAt) {
|
||||
timingEl.textContent = `${((performance.now() - startedAt) / 1000).toFixed(2)}s`;
|
||||
|
||||
Reference in New Issue
Block a user