Files
stt/app/stt.py
dosangyoon 26ff9b59c2 feat(web): speaker diarization via pyannote (parity with whisper_stt)
- Add app/diarize.py: local snapshot, A/B labels, disclaimer text
- transcribe_file and async jobs support diarize flag; Form diarize on API
- UI checkbox (default on); requirements: pyannote.audio, huggingface_hub
- README: env vars and model notes

Made-with: Cursor
2026-03-23 15:23:49 +09:00

102 lines
2.5 KiB
Python

from __future__ import annotations
import os
from dataclasses import dataclass
from typing import Any, Iterable, Tuple
from faster_whisper import WhisperModel
@dataclass(frozen=True)
class SegmentOut:
    """One transcript segment: a time span and its recognized text.

    Instances are immutable (``frozen=True``); assigning to a field raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Segment start time as reported by the transcription backend
    # (presumably seconds from the start of the audio -- confirm against faster-whisper).
    start: float
    # Segment end time, in the same unit as ``start``.
    end: float
    # Recognized text for the segment.
    text: str
# Module-level cache for the lazily created model; ``None`` until the first
# successful ``_get_model()`` call.
_MODEL: WhisperModel | None = None


def _get_model() -> WhisperModel:
    """Return the shared ``WhisperModel``, constructing it on first use.

    Configuration is read from the environment only when the model is built:

    * ``APP_WHISPER_MODEL`` -- model name handed to ``WhisperModel``
      (default ``"small"``).
    * ``APP_WHISPER_DEVICE`` -- ``device`` argument (default ``"cpu"``).
    * ``APP_WHISPER_COMPUTE_TYPE`` -- ``compute_type`` argument
      (default ``"int8"``).

    Subsequent calls return the cached instance, so changing these variables
    after the first call has no effect.

    NOTE(review): first-time construction is not guarded by a lock; if callers
    can reach this concurrently, more than one model may be built -- confirm
    against the calling code.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    model_name = os.getenv("APP_WHISPER_MODEL", "small")
    device = os.getenv("APP_WHISPER_DEVICE", "cpu")
    compute_type = os.getenv("APP_WHISPER_COMPUTE_TYPE", "int8")

    # WhisperModel download/cache handled by faster-whisper internally.
    _MODEL = WhisperModel(model_name, device=device, compute_type=compute_type)
    return _MODEL
def transcribe_iter(
    audio_path: str,
    *,
    language: str | None = None,
    vad_filter: bool = True,
    beam_size: int = 5,
) -> Tuple[Iterable[Any], Any]:
    """Run the shared Whisper model on ``audio_path`` and return its raw result.

    Args:
        audio_path: Path of the audio file, passed to ``model.transcribe``.
        language: Forwarded as ``language``; ``None`` leaves the choice to the
            backend.
        vad_filter: Forwarded as ``vad_filter``.
        beam_size: Forwarded as ``beam_size``.

    Returns:
        The ``(segments, info)`` pair produced by ``model.transcribe``, passed
        through unchanged. (In faster-whisper the segments object is presumably
        a lazy generator, so decoding happens while it is iterated -- confirm
        against the library documentation.)
    """
    model = _get_model()
    segments_iter, info = model.transcribe(
        audio_path,
        language=language,
        vad_filter=vad_filter,
        beam_size=beam_size,
    )
    return segments_iter, info
def transcribe_file(
    audio_path: str,
    *,
    language: str | None = None,
    vad_filter: bool = True,
    beam_size: int = 5,
    diarize: bool = True,
    diarize_model_dir: str | None = None,
) -> dict[str, Any]:
    """Transcribe ``audio_path`` and return the text, segments and metadata.

    Args:
        audio_path: Path of the audio file to transcribe.
        language: Forwarded to ``transcribe_iter``.
        vad_filter: Forwarded to ``transcribe_iter``.
        beam_size: Forwarded to ``transcribe_iter``.
        diarize: When true, the plain transcript is post-processed by
            ``diarize.build_diarized_output`` (called with
            ``with_disclaimer=True``), which supplies the final text and
            segment list.
        diarize_model_dir: Model directory for diarization. Falls back to the
            ``APP_PYANNOTE_MODEL_DIR`` environment variable; empty values are
            treated as unset and ``None`` is passed on.

    Returns:
        A dict with the keys ``"text"``, ``"segments"``,
        ``"detected_language"``, ``"language_probability"`` and
        ``"duration_sec"``. The last three are read from the transcription
        info object and are ``None`` when the attribute is missing. Without
        diarization, ``"text"`` is the non-empty segment texts joined by
        newlines and ``"segments"`` is a list of ``start``/``end``/``text``
        dicts.
    """
    # Local import keeps the module's top-level import block untouched.
    from dataclasses import asdict

    segments_iter, info = transcribe_iter(
        audio_path,
        language=language,
        vad_filter=vad_filter,
        beam_size=beam_size,
    )

    segments: list[SegmentOut] = []
    for s in _iter_segments(segments_iter):
        text = (s.text or "").strip()
        if not text:
            # Segments whose text is empty after stripping are dropped.
            continue
        segments.append(SegmentOut(start=float(s.start), end=float(s.end), text=text))

    # asdict() yields independent dicts, so downstream code that edits them
    # cannot reach into the frozen SegmentOut instances (unlike ``__dict__``).
    seg_dicts = [asdict(seg) for seg in segments]
    full_text = "\n".join(seg.text for seg in segments).strip()

    if diarize:
        # Imported lazily so the diarization module is only loaded when
        # diarization is actually requested.
        from . import diarize as dz

        mdir = diarize_model_dir or os.getenv("APP_PYANNOTE_MODEL_DIR") or None
        full_text, seg_dicts = dz.build_diarized_output(
            seg_dicts,
            audio_path,
            model_dir=mdir,
            with_disclaimer=True,
        )

    return {
        "text": full_text,
        "segments": seg_dicts,
        "detected_language": getattr(info, "language", None),
        "language_probability": getattr(info, "language_probability", None),
        "duration_sec": getattr(info, "duration", None),
    }
def _iter_segments(segments_iter: Iterable[Any]) -> Iterable[Any]:
    """Lazily yield every item of ``segments_iter`` in order.

    This is a generator: nothing is pulled from ``segments_iter`` until the
    result is iterated.
    """
    # ``yield from`` delegates iteration directly to the wrapped iterable
    # instead of re-yielding each item by hand.
    yield from segments_iter