From 2c738b2f09779eaadc0ecc931f5d15137faa84e2 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 08:50:48 +0200 Subject: [PATCH 1/5] feat: Speaker diarization via sherpa-onnx C API Two-pass pipeline: whisper.cpp transcribes, then sherpa-onnx diarizes the same audio and assigns speaker labels by timestamp overlap. - Raw FFI bindings to sherpa-onnx offline speaker diarization C API (not yet exposed by the sherpa-onnx Rust crate) - Dedicated worker thread for diarization (C types are !Send/!Sync) - CLI: --speakers N --diarize-segmentation-model --diarize-embedding-model - Env vars: DIARIZE_SEGMENTATION_MODEL, DIARIZE_EMBEDDING_MODEL - Speaker labels in VTT (), SRT ([Speaker 0]), and manifest JSON - Segment struct gains optional speaker field - Gated behind sherpa-onnx feature flag --- src/diarize/ffi.rs | 92 +++++++++++++++++ src/diarize/mod.rs | 191 +++++++++++++++++++++++++++++++++++ src/engines/openai_api.rs | 3 + src/engines/sherpa_onnx.rs | 3 + src/engines/whisper_local.rs | 1 + src/main.rs | 20 ++++ src/output/manifest.rs | 2 + src/output/srt.rs | 7 +- src/output/vtt.rs | 8 ++ src/pipeline.rs | 64 +++++++++++- src/transcriber.rs | 2 + 11 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 src/diarize/ffi.rs create mode 100644 src/diarize/mod.rs diff --git a/src/diarize/ffi.rs b/src/diarize/ffi.rs new file mode 100644 index 0000000..ca63d85 --- /dev/null +++ b/src/diarize/ffi.rs @@ -0,0 +1,92 @@ +//! Raw FFI bindings for sherpa-onnx speaker diarization C API. +#![allow(dead_code)] +//! These are not exposed by sherpa-onnx-sys 0.1.10 so we bind them directly. 
+ +use std::os::raw::{c_char, c_float, c_int}; + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig { + pub model: *const c_char, +} + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerSegmentationModelConfig { + pub pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig, + pub num_threads: c_int, + pub debug: c_int, + pub provider: *const c_char, +} + +#[repr(C)] +pub struct SherpaOnnxSpeakerEmbeddingExtractorConfig { + pub model: *const c_char, + pub num_threads: c_int, + pub debug: c_int, + pub provider: *const c_char, +} + +#[repr(C)] +pub struct SherpaOnnxFastClusteringConfig { + pub num_clusters: c_int, + pub threshold: c_float, +} + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerDiarizationConfig { + pub segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig, + pub embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig, + pub clustering: SherpaOnnxFastClusteringConfig, + pub min_duration_on: c_float, + pub min_duration_off: c_float, +} + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerDiarizationSegment { + pub start: c_float, + pub end: c_float, + pub speaker: c_int, +} + +// Opaque types +pub enum SherpaOnnxOfflineSpeakerDiarization {} +pub enum SherpaOnnxOfflineSpeakerDiarizationResult {} + +unsafe extern "C" { + pub fn SherpaOnnxCreateOfflineSpeakerDiarization( + config: *const SherpaOnnxOfflineSpeakerDiarizationConfig, + ) -> *const SherpaOnnxOfflineSpeakerDiarization; + + pub fn SherpaOnnxDestroyOfflineSpeakerDiarization( + sd: *const SherpaOnnxOfflineSpeakerDiarization, + ); + + pub fn SherpaOnnxOfflineSpeakerDiarizationGetSampleRate( + sd: *const SherpaOnnxOfflineSpeakerDiarization, + ) -> c_int; + + pub fn SherpaOnnxOfflineSpeakerDiarizationProcess( + sd: *const SherpaOnnxOfflineSpeakerDiarization, + samples: *const c_float, + n: c_int, + ) -> *const SherpaOnnxOfflineSpeakerDiarizationResult; + + pub fn SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers( + r: *const 
SherpaOnnxOfflineSpeakerDiarizationResult, + ) -> c_int; + + pub fn SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments( + r: *const SherpaOnnxOfflineSpeakerDiarizationResult, + ) -> c_int; + + pub fn SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime( + r: *const SherpaOnnxOfflineSpeakerDiarizationResult, + ) -> *const SherpaOnnxOfflineSpeakerDiarizationSegment; + + pub fn SherpaOnnxOfflineSpeakerDiarizationDestroySegment( + s: *const SherpaOnnxOfflineSpeakerDiarizationSegment, + ); + + pub fn SherpaOnnxOfflineSpeakerDiarizationDestroyResult( + r: *const SherpaOnnxOfflineSpeakerDiarizationResult, + ); +} diff --git a/src/diarize/mod.rs b/src/diarize/mod.rs new file mode 100644 index 0000000..4296baa --- /dev/null +++ b/src/diarize/mod.rs @@ -0,0 +1,191 @@ +mod ffi; + +use std::ffi::CString; +use std::path::Path; +use std::sync::mpsc; +use std::thread::JoinHandle; + +use anyhow::{Context, Result}; +use tokio::sync::oneshot; + +use crate::transcriber::Transcript; + +/// A speaker-labeled time span from diarization. +#[derive(Debug, Clone)] +pub struct DiarizedSegment { + pub start_secs: f32, + pub end_secs: f32, + pub speaker: i32, +} + +/// Request sent to the diarization worker thread. +struct DiarizeRequest { + samples: Vec, + response_tx: oneshot::Sender>>, +} + +/// Speaker diarization engine using sherpa-onnx's C API directly. +/// Runs on a dedicated thread (the C types are !Send/!Sync). +pub struct Diarizer { + request_tx: mpsc::Sender, + _thread: JoinHandle<()>, +} + +impl Diarizer { + /// Create a new diarizer. 
+ /// + /// - `segmentation_model`: path to pyannote segmentation ONNX model + /// - `embedding_model`: path to speaker embedding ONNX model + /// - `num_speakers`: number of speakers (must be > 0) + pub fn new( + segmentation_model: &Path, + embedding_model: &Path, + num_speakers: i32, + ) -> Result { + let seg_model = segmentation_model.to_path_buf(); + let emb_model = embedding_model.to_path_buf(); + + let (init_tx, init_rx) = std::sync::mpsc::channel::>(); + let (request_tx, request_rx) = mpsc::channel::(); + + let thread = std::thread::spawn(move || { + let seg_model_c = + CString::new(seg_model.to_string_lossy().as_bytes()).unwrap_or_default(); + let emb_model_c = + CString::new(emb_model.to_string_lossy().as_bytes()).unwrap_or_default(); + let provider_c = CString::new("cpu").unwrap(); + + let config = ffi::SherpaOnnxOfflineSpeakerDiarizationConfig { + segmentation: ffi::SherpaOnnxOfflineSpeakerSegmentationModelConfig { + pyannote: ffi::SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig { + model: seg_model_c.as_ptr(), + }, + num_threads: std::thread::available_parallelism() + .map(|n| n.get() as i32) + .unwrap_or(4), + debug: 0, + provider: provider_c.as_ptr(), + }, + embedding: ffi::SherpaOnnxSpeakerEmbeddingExtractorConfig { + model: emb_model_c.as_ptr(), + num_threads: std::thread::available_parallelism() + .map(|n| n.get() as i32) + .unwrap_or(4), + debug: 0, + provider: provider_c.as_ptr(), + }, + clustering: ffi::SherpaOnnxFastClusteringConfig { + num_clusters: num_speakers, + threshold: 0.5, + }, + min_duration_on: 0.3, + min_duration_off: 0.5, + }; + + let sd = unsafe { ffi::SherpaOnnxCreateOfflineSpeakerDiarization(&config) }; + if sd.is_null() { + init_tx + .send(Err(anyhow::anyhow!( + "Failed to create speaker diarization engine" + ))) + .ok(); + return; + } + + init_tx.send(Ok(())).ok(); + + while let Ok(req) = request_rx.recv() { + let result = unsafe { process_diarization(sd, &req.samples) }; + req.response_tx.send(result).ok(); + } + + 
unsafe { + ffi::SherpaOnnxDestroyOfflineSpeakerDiarization(sd); + } + }); + + init_rx + .recv() + .context("Diarization worker thread exited during init")??; + + Ok(Self { + request_tx, + _thread: thread, + }) + } + + /// Run diarization on audio samples (16kHz mono f32). + pub async fn diarize(&self, samples: Vec) -> Result> { + let (response_tx, response_rx) = oneshot::channel(); + self.request_tx + .send(DiarizeRequest { + samples, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("Diarization worker thread has stopped"))?; + response_rx + .await + .context("Diarization worker dropped without responding")? + } +} + +unsafe fn process_diarization( + sd: *const ffi::SherpaOnnxOfflineSpeakerDiarization, + samples: &[f32], +) -> Result> { + let result = unsafe { + ffi::SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.as_ptr(), samples.len() as i32) + }; + + if result.is_null() { + anyhow::bail!("Diarization returned null result"); + } + + let num_segments = + unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result) }; + let sorted = unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result) }; + + let mut segments = Vec::with_capacity(num_segments as usize); + if !sorted.is_null() { + for i in 0..num_segments as isize { + let seg = unsafe { &*sorted.offset(i) }; + segments.push(DiarizedSegment { + start_secs: seg.start, + end_secs: seg.end, + speaker: seg.speaker, + }); + } + unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationDestroySegment(sorted) }; + } + + unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }; + + Ok(segments) +} + +/// Assign speaker labels to transcript segments by timestamp overlap. 
+pub fn assign_speakers(transcript: &mut Transcript, diarized: &[DiarizedSegment]) { + for seg in &mut transcript.segments { + let seg_start = seg.start_ms as f32 / 1000.0; + let seg_end = seg.end_ms as f32 / 1000.0; + + // Find the diarization segment with maximum overlap + let mut best_speaker = None; + let mut best_overlap = 0.0f32; + + for d in diarized { + let overlap_start = seg_start.max(d.start_secs); + let overlap_end = seg_end.min(d.end_secs); + let overlap = (overlap_end - overlap_start).max(0.0); + + if overlap > best_overlap { + best_overlap = overlap; + best_speaker = Some(d.speaker); + } + } + + if let Some(speaker) = best_speaker { + seg.speaker = Some(format!("Speaker {}", speaker)); + } + } +} diff --git a/src/engines/openai_api.rs b/src/engines/openai_api.rs index 91609fc..8fa8310 100644 --- a/src/engines/openai_api.rs +++ b/src/engines/openai_api.rs @@ -194,6 +194,7 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { start_ms: (s.start * 1000.0) as i64, end_ms: (s.end * 1000.0) as i64, text: s.text, + speaker: None, }) .collect(), }; @@ -206,6 +207,7 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { start_ms: 0, end_ms: 0, text: resp.text, + speaker: None, }], }; } @@ -216,6 +218,7 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { start_ms: 0, end_ms: 0, text: String::from_utf8_lossy(body).into_owned(), + speaker: None, }], } } diff --git a/src/engines/sherpa_onnx.rs b/src/engines/sherpa_onnx.rs index 4b16d62..68fb6ac 100644 --- a/src/engines/sherpa_onnx.rs +++ b/src/engines/sherpa_onnx.rs @@ -214,6 +214,7 @@ fn recognize(recognizer: &OfflineRecognizer, samples: &[f32]) -> Result Vec { start_ms: (timestamps[seg_start_idx] * 1000.0) as i64, end_ms: (timestamps[i] * 1000.0) as i64, text: trimmed.to_string(), + speaker: None, }); } seg_start_idx = i + 1; @@ -264,6 +266,7 @@ fn tokens_to_segments(tokens: &[String], timestamps: &[f32]) -> Vec { start_ms: (timestamps[0] * 1000.0) as i64, end_ms: 
(timestamps[len.saturating_sub(1)] * 1000.0) as i64, text: trimmed.to_string(), + speaker: None, }]; } } diff --git a/src/engines/whisper_local.rs b/src/engines/whisper_local.rs index 377f30d..894f1bf 100644 --- a/src/engines/whisper_local.rs +++ b/src/engines/whisper_local.rs @@ -72,6 +72,7 @@ impl Transcriber for WhisperLocal { start_ms: start * 10, end_ms: end * 10, text, + speaker: None, }); } diff --git a/src/main.rs b/src/main.rs index 4128ae6..aab885e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,6 @@ mod audio; +#[cfg(feature = "sherpa-onnx")] +mod diarize; mod engines; mod output; mod pipeline; @@ -297,6 +299,18 @@ enum Command { /// Normalize audio with ffmpeg loudnorm before transcription #[arg(long)] normalize: bool, + + /// Number of speakers for diarization (requires sherpa-onnx feature and models) + #[arg(long)] + speakers: Option, + + /// Path to speaker segmentation model (pyannote ONNX) + #[arg(long, env = "DIARIZE_SEGMENTATION_MODEL")] + diarize_segmentation_model: Option, + + /// Path to speaker embedding model (ONNX) + #[arg(long, env = "DIARIZE_EMBEDDING_MODEL")] + diarize_embedding_model: Option, }, } @@ -353,6 +367,9 @@ async fn main() -> Result<()> { request_timeout_secs, retry_wait_base_secs, retry_wait_max_secs, + speakers, + diarize_segmentation_model, + diarize_embedding_model, } => { check_ffmpeg()?; @@ -488,6 +505,9 @@ async fn main() -> Result<()> { upload_as_mp3, segment_concurrency, normalize_audio: normalize, + speakers, + diarize_segmentation_model: diarize_segmentation_model.clone(), + diarize_embedding_model: diarize_embedding_model.clone(), }; run_pipeline(engine.as_ref(), config).await?; diff --git a/src/output/manifest.rs b/src/output/manifest.rs index 5ed79b6..77d84b1 100644 --- a/src/output/manifest.rs +++ b/src/output/manifest.rs @@ -34,6 +34,8 @@ pub struct SegmentInfo { pub start_secs: f64, pub end_secs: f64, pub text: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub speaker: Option, } 
#[derive(Serialize)] diff --git a/src/output/srt.rs b/src/output/srt.rs index 537c9ca..6b653b1 100644 --- a/src/output/srt.rs +++ b/src/output/srt.rs @@ -23,7 +23,11 @@ pub fn write_srt(transcript: &Transcript, writer: &mut impl Write) -> Result<()> format_timestamp(segment.start_ms), format_timestamp(segment.end_ms) )?; - writeln!(writer, "{}", segment.text.trim())?; + if let Some(ref spk) = segment.speaker { + writeln!(writer, "[{}] {}", spk, segment.text.trim())?; + } else { + writeln!(writer, "{}", segment.text.trim())?; + } writeln!(writer)?; } @@ -43,6 +47,7 @@ mod tests { start_ms: 0, end_ms: 1234, text: " Hello ".to_string(), + speaker: None, }], }; diff --git a/src/output/vtt.rs b/src/output/vtt.rs index 363f3a9..044b3d7 100644 --- a/src/output/vtt.rs +++ b/src/output/vtt.rs @@ -26,6 +26,9 @@ pub fn write_vtt(transcript: &Transcript, writer: &mut impl Write) -> Result<()> format_timestamp(segment.start_ms), format_timestamp(segment.end_ms) )?; + if let Some(ref spk) = segment.speaker { + write!(writer, "", spk)?; + } writeln!(writer, "{}", segment.text.trim())?; writeln!(writer)?; } @@ -47,11 +50,13 @@ mod tests { start_ms: 0, end_ms: 1234, text: " Hello ".to_string(), + speaker: None, }, Segment { start_ms: 5_000, end_ms: 6_100, text: "world".to_string(), + speaker: None, }, ], }; @@ -75,16 +80,19 @@ mod tests { start_ms: 0, end_ms: 10, text: "A".to_string(), + speaker: None, }, Segment { start_ms: 10, end_ms: 20, text: "B".to_string(), + speaker: None, }, Segment { start_ms: 20, end_ms: 30, text: "C".to_string(), + speaker: None, }, ], }; diff --git a/src/pipeline.rs b/src/pipeline.rs index e112b8d..5ea7963 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -47,6 +47,12 @@ pub struct PipelineConfig { pub upload_as_mp3: bool, pub segment_concurrency: usize, pub normalize_audio: bool, + #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))] + pub speakers: Option, + #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))] + pub 
diarize_segmentation_model: Option, + #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))] + pub diarize_embedding_model: Option, } pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> Result<()> { @@ -79,12 +85,53 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R ); } - let transcript = if should_segment { + #[allow(unused_mut)] + let mut transcript = if should_segment { transcribe_segmented(engine, input_path, total_duration, &config).await? } else { transcribe_with_spinner("Transcribing...", engine.transcribe_path(input_path)).await? }; + // Speaker diarization (if requested) + #[cfg(feature = "sherpa-onnx")] + if let Some(num_speakers) = config.speakers { + let seg_model = config + .diarize_segmentation_model + .as_deref() + .context("--diarize-segmentation-model is required when --speakers is set")?; + let emb_model = config + .diarize_embedding_model + .as_deref() + .context("--diarize-embedding-model is required when --speakers is set")?; + + eprintln!("Running speaker diarization ({num_speakers} speakers)..."); + + let diarizer = crate::diarize::Diarizer::new( + std::path::Path::new(seg_model), + std::path::Path::new(emb_model), + num_speakers, + )?; + + // Read the audio samples for diarization + let wav_bytes = std::fs::read(input_path).with_context(|| { + format!( + "Failed to read audio for diarization: {}", + input_path.display() + ) + })?; + let diarize_samples = crate::audio::wav::read_wav_bytes(&wav_bytes)?; + let diarized = + transcribe_with_spinner("Diarizing...", diarizer.diarize(diarize_samples)).await?; + + eprintln!( + "Found {} speaker segments across {} speakers.", + diarized.len(), + num_speakers + ); + + crate::diarize::assign_speakers(&mut transcript, &diarized); + } + let processing_time = started.elapsed().as_secs_f64(); // Output @@ -180,6 +227,7 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R start_secs: s.start_ms as f64 / 1000.0, 
end_secs: s.end_ms as f64 / 1000.0, text: s.text.trim().to_string(), + speaker: s.speaker.clone(), }) .collect(), stats: Stats { @@ -357,6 +405,9 @@ mod tests { upload_as_mp3: false, segment_concurrency: 1, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -417,6 +468,9 @@ mod tests { upload_as_mp3: false, segment_concurrency: 1, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -463,6 +517,9 @@ mod tests { upload_as_mp3: false, segment_concurrency: 1, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -511,6 +568,9 @@ mod tests { upload_as_mp3: true, segment_concurrency: 2, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -567,6 +627,7 @@ mod tests { start_ms: 0, end_ms: 1000, text: "integration".to_string(), + speaker: None, }], }) } @@ -582,6 +643,7 @@ mod tests { start_ms: 0, end_ms: 1000, text: "integration".to_string(), + speaker: None, }], }) } diff --git a/src/transcriber.rs b/src/transcriber.rs index d668997..b1b8e6d 100644 --- a/src/transcriber.rs +++ b/src/transcriber.rs @@ -5,10 +5,12 @@ use std::path::Path; use crate::audio::wav::read_wav_bytes; /// A segment of transcribed text with timing info. +#[derive(Default)] pub struct Segment { pub start_ms: i64, pub end_ms: i64, pub text: String, + pub speaker: Option, } /// Full transcript result. 
From 329d6fc6e20e5b2184eec9166f2e37a77cb1fca9 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 09:30:08 +0200 Subject: [PATCH 2/5] feat: VAD-based speech segmentation, dependency upgrades, docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VAD segmentation via Silero VAD (sherpa-onnx): - Detects speech boundaries instead of silence dB thresholds - 250ms padding protects word boundaries from clipping - Merges chunks separated by <200ms gaps - Splits long chunks at lowest-energy points (not arbitrary positions) - Use --vad-model path/to/silero_vad.onnx to enable - Falls back to FFmpeg silencedetect when no VAD model Dependency upgrades: - whisper-rs 0.12 → 0.16 (iterator API, updated log callback) - reqwest 0.12 → 0.13 - indicatif 0.17 → 0.18 - bzip2 0.5 → 0.6 (pure Rust) Comprehensive docs update for VAD, diarization, and env vars. --- Cargo.lock | 689 ++++++++++++++++++++++----------- Cargo.toml | 8 +- README.md | 14 +- docs/architecture.md | 65 +++- docs/cli-reference.md | 41 +- docs/performance-benchmarks.md | 20 +- docs/provider-behavior.md | 16 +- docs/troubleshooting.md | 43 ++ src/audio/mod.rs | 2 + src/audio/vad.rs | 273 +++++++++++++ src/main.rs | 6 + src/pipeline.rs | 81 ++++ 12 files changed, 1015 insertions(+), 243 deletions(-) create mode 100644 src/audio/vad.rs diff --git a/Cargo.lock b/Cargo.lock index 61da3df..a12cfa7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,6 +84,28 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "aws-lc-rs" +version = "1.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.38.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "base64" version = "0.22.1" @@ -92,16 +114,14 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bindgen" -version = "0.69.5" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ "bitflags", "cexpr", "clang-sys", "itertools", - "lazy_static", - "lazycell", "log", "prettyplease", "proc-macro2", @@ -110,7 +130,6 @@ dependencies = [ "rustc-hash", "shlex", "syn", - "which", ] [[package]] @@ -133,21 +152,11 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ - "cc", - "pkg-config", + "libbz2-rs-sys", ] [[package]] @@ -157,9 +166,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + 
[[package]] name = "cexpr" version = "0.6.0" @@ -175,6 +192,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "clang-sys" version = "1.8.1" @@ -241,17 +264,26 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "console" -version = "0.15.11" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" dependencies = [ "encode_unicode", "libc", - "once_cell", "unicode-width", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -297,6 +329,12 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -369,21 +407,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "foreign-types" -version = "0.3.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.2.2" @@ -466,8 +489,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", ] [[package]] @@ -478,7 +517,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", "wasip3", ] @@ -529,15 +568,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "home" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "hound" version = "3.5.1" @@ -621,22 +651,6 @@ dependencies = [ "tower-service", ] -[[package]] -name = "hyper-tls" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" -dependencies = [ - "bytes", - "http-body-util", - "hyper", - "hyper-util", - 
"native-tls", - "tokio", - "tokio-native-tls", - "tower-service", -] - [[package]] name = "hyper-util" version = "0.1.20" @@ -784,14 +798,14 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.17.11" +version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ "console", - "number_prefix", "portable-atomic", "unicode-width", + "unit-prefix", "web-time", ] @@ -833,26 +847,46 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] -name = "js-sys" -version = "0.3.91" +name = "jni" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" dependencies = [ - "once_cell", - "wasm-bindgen", + "cesu8", + "cfg-if", + "combine", + "jni-sys", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", ] [[package]] -name = "lazy_static" -version = "1.5.0" +name = "jni-sys" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] -name = "lazycell" -version = "1.3.0" +name = "jobserver" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.91" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +dependencies = [ + "once_cell", + "wasm-bindgen", +] [[package]] name = "leb128fmt" @@ -860,6 +894,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.183" @@ -888,12 +928,6 @@ dependencies = [ "redox_syscall 0.7.3", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -921,6 +955,12 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + [[package]] name = "memchr" version = "2.8.0" @@ -960,23 +1000,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "native-tls" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" -dependencies = [ - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nom" version = "7.1.3" @@ -987,12 +1010,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "number_prefix" -version = "0.4.0" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - [[package]] name = "once_cell" version = "1.21.4" @@ -1005,50 +1022,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" -[[package]] -name = "openssl" -version = "0.10.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "openssl-probe" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" -[[package]] -name = "openssl-sys" -version = "0.9.112" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "parking_lot" version = "0.12.5" @@ -1090,12 +1069,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" - [[package]] name = "plain" version = "0.2.3" @@ -1117,6 +1090,15 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ppv-lite86" 
+version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1136,6 +1118,62 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "aws-lc-rs", + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.59.0", +] + [[package]] name = "quote" version = "1.0.45" @@ -1145,12 +1183,47 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "r-efi" version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.9.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1200,9 +1273,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "reqwest" -version = "0.12.28" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" dependencies = [ "base64", "bytes", @@ -1215,22 +1288,22 @@ dependencies = [ "http-body-util", "hyper", "hyper-rustls", - "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", - "native-tls", "percent-encoding", "pin-project-lite", + "quinn", + "rustls", "rustls-pki-types", + "rustls-platform-verifier", "serde", "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-native-tls", + "tokio-rustls", "tokio-util", "tower", "tower-http", @@ -1258,22 +1331,9 @@ dependencies = [ [[package]] name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - -[[package]] -name = "rustix" -version = "0.38.44" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustix" @@ -1284,7 +1344,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.12.1", + "linux-raw-sys", "windows-sys 0.61.2", ] @@ -1294,6 +1354,7 @@ version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ + "aws-lc-rs", "once_cell", "rustls-pki-types", "rustls-webpki", @@ -1301,21 +1362,62 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pki-types" version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ + "web-time", "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" +dependencies = [ + "core-foundation 0.10.1", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + [[package]] name = "rustls-webpki" version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -1328,10 +1430,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] -name = "ryu" -version = "1.0.23" +name = "same-file" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] [[package]] name = "schannel" @@ -1420,18 +1525,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - [[package]] name = "sherpa-onnx" version = "0.1.10" @@ -1577,10 +1670,50 @@ dependencies = [ "fastrand", "getrandom 0.4.2", "once_cell", - "rustix 1.1.4", + "rustix", "windows-sys 0.61.2", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -1591,6 +1724,21 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.50.0" @@ -1619,16 +1767,6 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.4" @@ -1772,6 +1910,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "untrusted" version = "0.9.0" @@ -1803,10 +1947,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] -name = "vcpkg" -version = "0.2.15" +name = "walkdir" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] [[package]] name = "want" @@ -1924,9 +2072,9 @@ dependencies = [ [[package]] name = "wasm-streams" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb" dependencies = [ "futures-util", "js-sys", @@ -1968,36 +2116,43 @@ dependencies = [ ] [[package]] -name = "which" -version = "4.4.2" +name = "webpki-root-certs" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", + "rustls-pki-types", ] [[package]] name = "whisper-rs" -version = "0.12.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c597ac8a9d5c4719fee232abc871da184ea50a4fea38d2d00348fd95072b2b0" +checksum = "2088172d00f936c348d6a72f488dc2660ab3f507263a195df308a3c2383229f6" dependencies = [ "whisper-rs-sys", ] [[package]] name = "whisper-rs-sys" -version = "0.10.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d22f00ed0995463eecc34ef89905845f6bf6fd37ea70789fed180520050da8f8" +checksum = "6986c0fe081241d391f09b9a071fbcbb59720c3563628c3c829057cf69f2a56f" dependencies = [ "bindgen", "cfg-if", "cmake", "fs_extra", + "semver", +] + 
+[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", ] [[package]] @@ -2035,13 +2190,22 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2050,7 +2214,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2062,34 +2226,67 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 
0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2102,24 +2299,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" 
+version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -2227,7 +2448,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.1.4", + "rustix", ] [[package]] @@ -2253,6 +2474,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zerofrom" version = "0.1.6" diff --git a/Cargo.toml b/Cargo.toml index 32676c6..a8acee7 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -25,8 +25,8 @@ dotenvy = "0.15" futures-util = "0.3" hound = "3.5" glob = "0.3" -indicatif = "0.17" -reqwest = { version = "0.12", features = ["json", "multipart", "stream"] } +indicatif = "0.18" +reqwest = { version = "0.13", features = ["json", "multipart", "stream"] } serde = { version = "1", features = ["derive"] } serde_json = "1" tempfile = "3" @@ -34,7 +34,7 @@ regex = "1" tokio = { version = "1", features = ["full"] } sherpa-onnx = { version = "0.1", optional = true } tar = "0.4" -bzip2 = "0.5" +bzip2 = "0.6" libc = "0.2" -whisper-rs = "0.12" +whisper-rs = "0.16" bytes = "1.11.1" diff --git a/README.md b/README.md index 714df9a..5cb0c7f 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,14 @@ transcribeit run -p azure -i recording.mp3 \ # Force language and normalize before transcription transcribeit run -i recording.wav -m base --language en --normalize + +# VAD-based segmentation (speech-aware, avoids mid-word cuts) +transcribeit run -p sherpa-onnx -m base -i recording.mp3 --vad-model .cache/silero_vad.onnx + +# Speaker diarization (2 speakers) +transcribeit run -i interview.mp3 -m base --speakers 2 \ + --diarize-segmentation-model .cache/sherpa-onnx-pyannote-segmentation-3-0/model.onnx \ + --diarize-embedding-model .cache/wespeaker_en_voxceleb_CAM++.onnx ``` ## Features @@ -72,7 +80,8 @@ transcribeit run -i recording.wav -m base --language en --normalize - **Model aliases** — `-m base`, `-m tiny`, etc. resolve from `MODEL_CACHE_DIR` for both `local` and `sherpa-onnx` providers. The sherpa-onnx resolver also supports glob matching (e.g., `-m moonshine-base`, `-m sense-voice`). - **Language hinting** — Pass `--language` to force local and API transcription language. - **FFmpeg audio normalization** — Optional `--normalize` to apply loudnorm before transcription. -- **Silence-based segmentation** — Splits long audio at silence boundaries for better accuracy and API compatibility. 
+**VAD-based segmentation** — Speech-aware segmentation via Silero VAD (sherpa-onnx). Detects speech boundaries with padding and gap merging to avoid mid-word cuts. Use `--vad-model .cache/silero_vad.onnx`. +- **Silence-based segmentation** — Fallback segmentation via FFmpeg `silencedetect` for API providers or when VAD model is not available. - **sherpa-onnx auto-segmentation** — Whisper ONNX models only support ≤30s per call; segmentation is enabled automatically. - **sherpa-onnx is optional** — Enabled by default as a Cargo feature. Build without it: `cargo build --no-default-features`. - **Auto-split for API limits** — Files exceeding 25MB are automatically segmented when using remote providers. @@ -102,6 +111,9 @@ TRANSCRIBEIT_MAX_RETRIES=5 TRANSCRIBEIT_REQUEST_TIMEOUT_SECS=120 TRANSCRIBEIT_RETRY_WAIT_BASE_SECS=10 TRANSCRIBEIT_RETRY_WAIT_MAX_SECS=120 +VAD_MODEL=.cache/silero_vad.onnx +DIARIZE_SEGMENTATION_MODEL=.cache/sherpa-onnx-pyannote-segmentation-3-0/model.onnx +DIARIZE_EMBEDDING_MODEL=.cache/wespeaker_en_voxceleb_CAM++.onnx ``` ## Documentation diff --git a/docs/architecture.md b/docs/architecture.md index bbdaed2..ea7ab34 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -12,11 +12,15 @@ src/ ├── audio/ │ ├── extract.rs # FFmpeg audio conversion │ ├── segment.rs # Silence detection and audio splitting +│ ├── vad.rs # VAD-based speech segmentation (Silero VAD via sherpa-onnx) │ └── wav.rs # WAV reading and encoding (shared) +├── diarize/ +│ ├── mod.rs # Speaker diarization engine and speaker assignment +│ └── ffi.rs # Raw C FFI bindings for sherpa-onnx speaker diarization ├── output/ -│ ├── vtt.rs # WebVTT subtitle writer -│ ├── srt.rs # SRT subtitle writer -│ └── manifest.rs # JSON manifest writer +│ ├── vtt.rs # WebVTT subtitle writer (supports `<v Speaker N>` tags) +│ ├── srt.rs # SRT subtitle writer (supports [Speaker N] labels) +│ └── manifest.rs # JSON manifest writer (includes speaker labels) └── engines/ ├── whisper_local.rs # Local whisper.cpp 
via whisper-rs ├── sherpa_onnx.rs # Local sherpa-onnx engine (auto-detects Whisper, Moonshine, SenseVoice) @@ -67,21 +71,32 @@ Input file (any format) │ └─ Auto: sherpa-onnx provider (always segments; max 30s per chunk) │ ├─ If segmenting: - │ ├─ detect_silence() via FFmpeg silencedetect filter - │ ├─ compute_segments() at silence midpoints - │ ├─ split_audio() into temp WAV files - │ └─ Transcribe each segment, offset timestamps (concurrently for API providers) + │ ├─ VAD path (when --vad-model is set and sherpa-onnx feature is enabled): + │ │ ├─ read_wav_bytes() → f32 PCM samples + │ │ ├─ vad_segment(): detect speech → pad 250ms → merge gaps <200ms → split long chunks at low-energy points + │ │ ├─ Extract chunk samples directly from memory + │ │ └─ Transcribe each chunk via transcribe(), offset timestamps + │ ├─ FFmpeg fallback (no VAD model, or sherpa-onnx feature disabled): + │ │ ├─ detect_silence() via FFmpeg silencedetect filter + │ │ ├─ compute_segments() at silence midpoints + │ │ ├─ split_audio() into temp WAV files + │ │ └─ Transcribe each segment, offset timestamps (concurrently for API providers) │ ├─ If not segmenting: │ ├─ Local: read_wav() → transcribe() directly │ └─ API: transcribe_path() with prepared file │ ├─ normalize_audio? ──→ optional loudnorm filter in ffmpeg conversion pipeline + ├─ Speaker diarization? (when --speakers N is set) + │ ├─ read audio samples for diarization + │ ├─ Diarizer.diarize() → speaker-labeled time spans + │ └─ assign_speakers() overlays speaker labels onto transcript segments + │ └─ Output: ├─ Text to stdout or `.txt` - ├─ VTT to file or stdout - ├─ SRT to file or stdout - └─ JSON manifest to output directory + ├─ VTT to file or stdout (with `<v Speaker N>` tags when diarized) + ├─ SRT to file or stdout (with `[Speaker N]` labels when diarized) + └─ JSON manifest to output directory (includes speaker field per segment) ``` Temporary files use the `tempfile` crate and are cleaned up automatically on drop. 
@@ -184,6 +199,36 @@ cargo build --release --no-default-features This removes the sherpa-onnx provider and eliminates the need for `SHERPA_ONNX_LIB_DIR`. +## VAD-based segmentation (`audio/vad.rs`) + +When `--vad-model` is set and the `sherpa-onnx` feature is enabled, the pipeline uses Silero VAD (via sherpa-onnx) for speech-aware segmentation instead of FFmpeg's `silencedetect` filter. This avoids the main problem with silence-based splitting: mid-word cuts. + +The VAD pipeline (`vad_segment()`) has four stages: + +1. **Detect speech** -- Silero VAD processes 512-sample frames (~32ms at 16kHz) to find speech boundaries with sample-level precision. +2. **Pad 250ms** -- Each speech chunk is extended by 250ms on both sides to protect word boundaries at the edges. +3. **Merge gaps <200ms** -- Adjacent chunks separated by less than 200ms are merged to avoid splitting within short pauses. +4. **Split long chunks** -- Chunks exceeding `--max-segment-secs` are split at the lowest-energy point within a 1-second search window around the target cut point. + +The VAD approach works directly on in-memory PCM samples, so there is no need for intermediate temp files during segmentation. Each chunk is transcribed via `engine.transcribe()` with sample slices, and timestamps are offset by the chunk start time. + +When `--vad-model` is not set, segmentation falls back to FFmpeg `silencedetect` (the original behavior). + +## Speaker diarization (`diarize/`) + +Speaker diarization identifies which speaker is talking at each point in the audio. It requires the `sherpa-onnx` feature and two ONNX models: + +- **Segmentation model** (`--diarize-segmentation-model`): a pyannote segmentation ONNX model that detects speaker change points. +- **Embedding model** (`--diarize-embedding-model`): a speaker embedding ONNX model that clusters voice characteristics. 
+ +The `Diarizer` follows the same dedicated worker thread pattern as `SherpaOnnxEngine`: the C FFI types are not `Send`/`Sync`, so they live on a plain `std::thread` and communicate via channels. Diarization requests are sent through `mpsc` and results come back through `tokio::sync::oneshot`. + +After transcription completes, `assign_speakers()` overlays speaker labels onto transcript segments by finding the diarization segment with the maximum time overlap for each transcript segment. Speaker labels appear as: + +- **VTT**: `<v Speaker 0>text` +- **SRT**: `[Speaker 0] text` +- **Manifest JSON**: `"speaker": "Speaker 0"` field on each segment + +## Adding a new engine + +1. Create `src/engines/your_engine.rs` diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 992a474..907e5b1 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -119,9 +119,22 @@ These options apply to OpenAI/Azure providers: | `--min-silence-duration` | Minimum silence duration in seconds | `0.8` | | `--max-segment-secs` | Maximum segment length in seconds | `600` | | `--segment-concurrency` | Max parallel segment requests (API providers only) | `2` | +| `--vad-model` | Path to Silero VAD ONNX model (`silero_vad.onnx`) for speech-aware segmentation | `VAD_MODEL` env var | When using `openai` or `azure` providers, files exceeding 25MB are automatically segmented even without `--segment`. When using `sherpa-onnx`, segmentation is always enabled with a maximum segment length of 30 seconds. +When `--vad-model` is set and segmentation is needed, VAD-based segmentation is used instead of FFmpeg `silencedetect`. VAD detects actual speech boundaries using Silero VAD, avoiding mid-word cuts. It pads chunks by 250ms, merges gaps shorter than 200ms, and splits long chunks at low-energy points. This requires the `sherpa-onnx` feature to be enabled. When `--vad-model` is not set, the original FFmpeg silence-based segmentation is used as a fallback. 
+ +#### Speaker diarization options + +| Option | Description | Default | +|--------|-------------|---------| +| `--speakers` | Number of speakers for diarization | disabled | +| `--diarize-segmentation-model` | Path to pyannote segmentation ONNX model | `DIARIZE_SEGMENTATION_MODEL` env var | +| `--diarize-embedding-model` | Path to speaker embedding ONNX model | `DIARIZE_EMBEDDING_MODEL` env var | + +When `--speakers N` is set, speaker diarization runs after transcription to label each segment with a speaker identity. Both `--diarize-segmentation-model` and `--diarize-embedding-model` are required. Speaker labels appear in VTT output as `<v Speaker 0>`, in SRT output as `[Speaker 0]`, and in manifest JSON as a `"speaker"` field on each segment. Requires the `sherpa-onnx` feature. + ## Output behavior During transcription, the CLI shows an animated spinner in the terminal so you can see progress while waiting for Whisper/API calls to complete. @@ -155,6 +168,9 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | none | | `AZURE_DEPLOYMENT_NAME` | Azure deployment name | `whisper` | | `AZURE_API_VERSION` | Azure API version | `2024-06-01` | +| `VAD_MODEL` | Path to Silero VAD ONNX model for speech-aware segmentation | none | +| `DIARIZE_SEGMENTATION_MODEL` | Path to pyannote segmentation ONNX model for speaker diarization | none | +| `DIARIZE_EMBEDDING_MODEL` | Path to speaker embedding ONNX model for speaker diarization | none | | `TRANSCRIBEIT_MAX_RETRIES` | Maximum 429 retries | `5` | | `TRANSCRIBEIT_REQUEST_TIMEOUT_SECS` | API request timeout in seconds | `120` | | `TRANSCRIBEIT_RETRY_WAIT_BASE_SECS` | Base retry wait time in seconds | `10` | @@ -211,6 +227,28 @@ transcribeit run -i lecture.mp4 -m base -f srt -o ./output transcribeit run -i noisy.wav -m .cache/ggml-base.bin \ --segment --silence-threshold -30 --min-silence-duration 0.5 +# VAD-based segmentation (avoids mid-word cuts) 
+transcribeit run -p sherpa-onnx -i lecture.mp4 -m base.en \ + --vad-model /path/to/silero_vad.onnx -f vtt -o ./output + +# VAD with env var (set VAD_MODEL in .env) +VAD_MODEL=/path/to/silero_vad.onnx transcribeit run -p sherpa-onnx -i recording.mp3 -m base.en + +# Speaker diarization (2 speakers) +transcribeit run -p sherpa-onnx -i meeting.mp4 -m base.en \ + --speakers 2 \ + --diarize-segmentation-model /path/to/segmentation.onnx \ + --diarize-embedding-model /path/to/embedding.onnx \ + -f vtt -o ./output + +# VAD + speaker diarization combined +transcribeit run -p sherpa-onnx -i interview.wav -m base.en \ + --vad-model /path/to/silero_vad.onnx \ + --speakers 2 \ + --diarize-segmentation-model /path/to/segmentation.onnx \ + --diarize-embedding-model /path/to/embedding.onnx \ + -f srt -o ./output + # OpenAI API OPENAI_API_KEY=sk-... transcribeit run -p openai -i recording.mp3 @@ -267,7 +305,8 @@ When `--output-dir` is specified, the following files are created: "index": 0, "start_secs": 0.0, "end_secs": 5.25, - "text": "Hello, welcome to the meeting." + "text": "Hello, welcome to the meeting.", + "speaker": "Speaker 0" } ], "stats": { diff --git a/docs/performance-benchmarks.md b/docs/performance-benchmarks.md index 20d88f4..6ae4337 100644 --- a/docs/performance-benchmarks.md +++ b/docs/performance-benchmarks.md @@ -61,16 +61,23 @@ Record: ### 3. 
Segmentation impact ```bash +# FFmpeg silencedetect segmentation time transcribeit run -p openai -i --segment --segment-concurrency 2 -f text -o ./output time transcribeit run -p openai -i --segment --segment-concurrency 1 --max-segment-secs 300 -f text -o ./output -# sherpa-onnx always segments at 30s max + +# sherpa-onnx with FFmpeg silencedetect (default, always segments at 30s max) time transcribeit run -p sherpa-onnx -i -m base -f text -o ./output + +# sherpa-onnx with VAD-based segmentation +time transcribeit run -p sherpa-onnx -i -m base --vad-model /path/to/silero_vad.onnx -f text -o ./output ``` Record: - total segment count - max queue wait - request-level retry counts +- segmentation method used (VAD vs silencedetect) +- transcript quality at segment boundaries (check for mid-word cuts) ### 4. I/O + conversion overhead @@ -117,6 +124,17 @@ These results were measured on a 5-minute medical interview recording. - Moonshine provides a compact alternative but is slower than Whisper at the same size tier. - For highest quality where speed is not critical, use `large-v3-turbo` with local whisper.cpp. +### VAD vs FFmpeg silencedetect segmentation + +VAD-based segmentation (Silero VAD via `--vad-model`) and FFmpeg `silencedetect` produce different segment boundaries. Key differences to observe when benchmarking: + +- **Segment boundary quality:** VAD detects speech regions directly, so segment boundaries align with actual speech. FFmpeg `silencedetect` splits at silence midpoints, which can cut mid-word if silence gaps are short or thresholds are mistuned. +- **Segment count:** VAD typically produces more segments (one per speech region after merging) while `silencedetect` produces fewer, longer segments based on silence gaps. +- **Processing overhead:** VAD runs on the audio samples in-memory (fast, no subprocess). FFmpeg `silencedetect` runs as a subprocess and requires parsing its stderr output. 
+- **Transcript quality:** VAD-segmented transcripts tend to have fewer artifacts at segment boundaries because chunks start and end at speech boundaries with 250ms padding, rather than at arbitrary silence midpoints.
+
+When comparing, use the same audio file and model to isolate the effect of the segmentation method on overall transcript quality and timing.
+
 ## CI/automatable baseline
 
 For now, treat these as manual benchmarks in a fixed environment.
diff --git a/docs/provider-behavior.md b/docs/provider-behavior.md
index 69fa566..c0c0a53 100644
--- a/docs/provider-behavior.md
+++ b/docs/provider-behavior.md
@@ -26,6 +26,8 @@ This project supports four providers. They share the same input/output surface,
 - Transcription runs in-process on a dedicated worker thread using the sherpa-onnx C library via FFI.
 - C++ stderr warnings from the sherpa-onnx library are suppressed during inference to keep terminal output clean.
 - Whisper ONNX models only support audio of 30 seconds or less per call. The pipeline automatically enables segmentation and caps `--max-segment-secs` at 30, regardless of user-supplied values.
+- **VAD-based segmentation:** When `--vad-model` is set (or `VAD_MODEL` env var), Silero VAD is used for speech-aware segmentation instead of FFmpeg `silencedetect`. This detects actual speech boundaries and avoids mid-word cuts. The VAD pipeline pads chunks by 250ms, merges gaps shorter than 200ms, and splits long chunks at low-energy points. This is the recommended segmentation method for sherpa-onnx. When no VAD model is provided, the pipeline falls back to FFmpeg `silencedetect`.
+- **Speaker diarization:** When `--speakers N` is set along with `--diarize-segmentation-model` and `--diarize-embedding-model`, speaker labels are assigned to each transcript segment after transcription. Labels appear in VTT (`<v Speaker 0>`), SRT (`[Speaker 0]`), and manifest JSON output.
- **SenseVoice limitation:** emotion and audio event detection tags are stripped by the sherpa-onnx C API and are not available in the output. - Segment concurrency is always 1 (sequential processing). - No external API key is required. @@ -65,8 +67,18 @@ This project supports four providers. They share the same input/output surface, Both are local engines that run without network access. They differ in the model format and inference backend: -- **Local** uses GGML models via `whisper.cpp` (`whisper-rs` binding). Supports all Whisper model sizes. -- **Sherpa-ONNX** uses ONNX models via the `sherpa-onnx` C library. Supports three model architectures (Whisper, Moonshine, SenseVoice) with automatic detection. Whisper ONNX supports all sizes except `large-v3`. Requires auto-segmentation at 30s due to Whisper ONNX limitations. The `sherpa-onnx` feature is optional (enabled by default); build without it using `cargo build --no-default-features`. +- **Local** uses GGML models via `whisper.cpp` (`whisper-rs` binding). Supports all Whisper model sizes. Uses FFmpeg `silencedetect` for segmentation. +- **Sherpa-ONNX** uses ONNX models via the `sherpa-onnx` C library. Supports three model architectures (Whisper, Moonshine, SenseVoice) with automatic detection. Whisper ONNX supports all sizes except `large-v3`. Requires auto-segmentation at 30s due to Whisper ONNX limitations. Supports VAD-based segmentation via `--vad-model` for cleaner speech boundaries (recommended). Also supports speaker diarization via `--speakers`. The `sherpa-onnx` feature is optional (enabled by default); build without it using `cargo build --no-default-features`. 
+ +### Segmentation: VAD vs FFmpeg silencedetect + +| | VAD (Silero) | FFmpeg silencedetect | +|---|---|---| +| **Availability** | Requires `sherpa-onnx` feature + `--vad-model` | Always available | +| **Boundary quality** | Speech-aware; avoids mid-word cuts | Silence-based; may cut mid-word | +| **Approach** | Detects speech regions, pads, merges, splits at low-energy | Detects silence gaps, splits at midpoints | +| **Config flags** | `--vad-model`, `--max-segment-secs` | `--silence-threshold`, `--min-silence-duration`, `--max-segment-secs` | +| **Best for** | Local sherpa-onnx transcription | API providers, or when no VAD model is available | ### OpenAI vs Azure diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c254bfa..c93babd 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -56,6 +56,49 @@ Fix: - Verify with: `transcribeit list-models` (ONNX models appear with an `[onnx]` tag) - The model resolver supports partial name matching (e.g., `-m moonshine-base`, `-m sense-voice`). +### VAD model not found or fails to load + +Symptoms: +- `Failed to create VAD (check vad_model_path)` +- `No such file or directory` when using `--vad-model` + +Fix: +- Verify that the path provided to `--vad-model` (or the `VAD_MODEL` env var) points to a valid `silero_vad.onnx` file. +- Download the Silero VAD model from the [sherpa-onnx releases](https://github.com/k2-fsa/sherpa-onnx/releases). Look for `silero_vad.onnx` in the VAD model archives. +- Ensure the `sherpa-onnx` feature is enabled (it is by default). VAD-based segmentation is not available without it. +- The VAD model path can be set in your `.env` file: + +```bash +# .env +VAD_MODEL=/path/to/silero_vad.onnx +``` + +If you do not have a VAD model, omit `--vad-model` and the pipeline will fall back to FFmpeg `silencedetect` for segmentation. 
+ +### Speaker diarization model issues + +Symptoms: +- `Failed to create speaker diarization engine` +- `--diarize-segmentation-model is required when --speakers is set` +- `--diarize-embedding-model is required when --speakers is set` + +Fix: +- When using `--speakers N`, both `--diarize-segmentation-model` and `--diarize-embedding-model` are required. +- Ensure both model paths point to valid ONNX files: + - **Segmentation model:** a pyannote speaker segmentation ONNX model. + - **Embedding model:** a speaker embedding extraction ONNX model. +- Download compatible models from the [sherpa-onnx speaker diarization releases](https://github.com/k2-fsa/sherpa-onnx/releases). +- The model paths can be set via environment variables in your `.env` file: + +```bash +# .env +DIARIZE_SEGMENTATION_MODEL=/path/to/segmentation.onnx +DIARIZE_EMBEDDING_MODEL=/path/to/embedding.onnx +``` + +- Requires the `sherpa-onnx` feature to be enabled. +- The `--speakers` value must be greater than 0. + ### Building without sherpa-onnx If you do not need the sherpa-onnx provider and want to avoid installing the shared libraries: diff --git a/src/audio/mod.rs b/src/audio/mod.rs index 37669f2..24ccc59 100644 --- a/src/audio/mod.rs +++ b/src/audio/mod.rs @@ -1,3 +1,5 @@ pub mod extract; pub mod segment; +#[cfg(feature = "sherpa-onnx")] +pub mod vad; pub mod wav; diff --git a/src/audio/vad.rs b/src/audio/vad.rs new file mode 100644 index 0000000..9947ed7 --- /dev/null +++ b/src/audio/vad.rs @@ -0,0 +1,273 @@ +//! VAD-based speech segmentation using sherpa-onnx's Silero VAD. +//! Produces clean speech boundaries that avoid mid-word cuts. + +use anyhow::{Context, Result}; +use sherpa_onnx::{SileroVadModelConfig, VadModelConfig, VoiceActivityDetector}; + +const SAMPLE_RATE: u32 = 16_000; +const FRAME_SIZE: usize = 512; // ~32ms at 16kHz + +/// A speech chunk with sample-level boundaries. 
+#[derive(Debug, Clone)]
+pub struct SpeechChunk {
+    pub start_sample: usize,
+    pub end_sample: usize,
+}
+
+impl SpeechChunk {
+    pub fn start_secs(&self) -> f64 {
+        self.start_sample as f64 / SAMPLE_RATE as f64
+    }
+
+    pub fn end_secs(&self) -> f64 {
+        self.end_sample as f64 / SAMPLE_RATE as f64
+    }
+
+    pub fn duration_secs(&self) -> f64 {
+        (self.end_sample - self.start_sample) as f64 / SAMPLE_RATE as f64
+    }
+}
+
+/// Detect speech segments in audio using Silero VAD.
+pub fn detect_speech_chunks(samples: &[f32], vad_model_path: &str) -> Result<Vec<SpeechChunk>> {
+    let config = VadModelConfig {
+        silero_vad: SileroVadModelConfig {
+            model: Some(vad_model_path.to_string()),
+            threshold: 0.5,
+            min_silence_duration: 0.25,
+            min_speech_duration: 0.1,
+            window_size: FRAME_SIZE as i32,
+            max_speech_duration: 30.0,
+        },
+        sample_rate: SAMPLE_RATE as i32,
+        num_threads: 1,
+        provider: Some("cpu".into()),
+        debug: false,
+        ..Default::default()
+    };
+
+    let vad = VoiceActivityDetector::create(&config, 60.0)
+        .context("Failed to create VAD (check vad_model_path)")?;
+
+    let mut chunks = Vec::new();
+    let mut cursor = 0usize;
+
+    while cursor < samples.len() {
+        let end = (cursor + FRAME_SIZE).min(samples.len());
+        let frame = &samples[cursor..end];
+        vad.accept_waveform(frame);
+
+        while let Some(seg) = vad.front() {
+            chunks.push(SpeechChunk {
+                start_sample: seg.start() as usize,
+                end_sample: seg.start() as usize + seg.n() as usize,
+            });
+            vad.pop();
+        }
+
+        cursor = end;
+    }
+
+    vad.flush();
+
+    while let Some(seg) = vad.front() {
+        chunks.push(SpeechChunk {
+            start_sample: seg.start() as usize,
+            end_sample: seg.start() as usize + seg.n() as usize,
+        });
+        vad.pop();
+    }
+
+    Ok(chunks)
+}
+
+/// Add padding around each chunk to protect word boundaries.
+pub fn pad_chunks(
+    chunks: &[SpeechChunk],
+    total_len: usize,
+    pad_samples: usize,
+) -> Vec<SpeechChunk> {
+    chunks
+        .iter()
+        .map(|c| SpeechChunk {
+            start_sample: c.start_sample.saturating_sub(pad_samples),
+            end_sample: (c.end_sample + pad_samples).min(total_len),
+        })
+        .collect()
+}
+
+/// Merge chunks separated by less than max_gap_samples.
+pub fn merge_close_chunks(chunks: &[SpeechChunk], max_gap_samples: usize) -> Vec<SpeechChunk> {
+    if chunks.is_empty() {
+        return Vec::new();
+    }
+
+    let mut sorted = chunks.to_vec();
+    sorted.sort_by_key(|c| c.start_sample);
+
+    let mut merged = Vec::new();
+    let mut cur = sorted[0].clone();
+
+    for next in sorted.into_iter().skip(1) {
+        let gap = next.start_sample.saturating_sub(cur.end_sample);
+        if gap <= max_gap_samples {
+            cur.end_sample = cur.end_sample.max(next.end_sample);
+        } else {
+            merged.push(cur);
+            cur = next;
+        }
+    }
+
+    merged.push(cur);
+    merged
+}
+
+/// Split chunks that exceed max duration, cutting at the lowest-energy point.
+pub fn split_long_chunks(
+    samples: &[f32],
+    chunks: &[SpeechChunk],
+    max_chunk_secs: f32,
+) -> Vec<SpeechChunk> {
+    let max_len = (max_chunk_secs * SAMPLE_RATE as f32) as usize;
+    let mut out = Vec::new();
+
+    for c in chunks {
+        let mut start = c.start_sample;
+        while c.end_sample.saturating_sub(start) > max_len {
+            let target = start + max_len;
+            // Search ±500ms around the target for the quietest spot
+            let search_radius = (SAMPLE_RATE / 2) as usize;
+            let left = target.saturating_sub(search_radius).max(start);
+            let right = (target + search_radius).min(c.end_sample);
+
+            let cut = find_low_energy_cut(samples, left, right).unwrap_or(target);
+
+            out.push(SpeechChunk {
+                start_sample: start,
+                end_sample: cut,
+            });
+            start = cut;
+        }
+
+        if start < c.end_sample {
+            out.push(SpeechChunk {
+                start_sample: start,
+                end_sample: c.end_sample,
+            });
+        }
+    }
+
+    out
+}
+
+/// Find the sample position with the lowest energy in a window.
+fn find_low_energy_cut(samples: &[f32], start: usize, end: usize) -> Option<usize> {
+    let window = 320; // 20ms window
+    if end <= start + window || end > samples.len() {
+        return None;
+    }
+
+    let mut best_pos = None;
+    let mut best_energy = f32::INFINITY;
+
+    let mut i = start;
+    while i + window <= end {
+        let energy: f32 = samples[i..i + window].iter().map(|x| x * x).sum::<f32>() / window as f32;
+
+        if energy < best_energy {
+            best_energy = energy;
+            best_pos = Some(i + window / 2);
+        }
+
+        i += window / 2; // 50% overlap
+    }
+
+    best_pos
+}
+
+/// Full VAD pipeline: detect → pad → merge → split.
+/// Returns clean speech chunks ready for STT.
+pub fn vad_segment(
+    samples: &[f32],
+    vad_model_path: &str,
+    max_chunk_secs: f32,
+) -> Result<Vec<SpeechChunk>> {
+    let raw = detect_speech_chunks(samples, vad_model_path)?;
+
+    // 250ms padding to protect word boundaries
+    let pad_samples = (SAMPLE_RATE as f32 * 0.25) as usize;
+    let padded = pad_chunks(&raw, samples.len(), pad_samples);
+
+    // Merge chunks separated by <200ms gap
+    let merge_gap = (SAMPLE_RATE as f32 * 0.20) as usize;
+    let merged = merge_close_chunks(&padded, merge_gap);
+
+    // Split oversized chunks at low-energy points
+    let final_chunks = split_long_chunks(samples, &merged, max_chunk_secs);
+
+    Ok(final_chunks)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn pad_extends_boundaries() {
+        let chunks = vec![SpeechChunk {
+            start_sample: 1000,
+            end_sample: 5000,
+        }];
+        let padded = pad_chunks(&chunks, 10000, 500);
+        assert_eq!(padded[0].start_sample, 500);
+        assert_eq!(padded[0].end_sample, 5500);
+    }
+
+    #[test]
+    fn pad_clamps_to_bounds() {
+        let chunks = vec![SpeechChunk {
+            start_sample: 100,
+            end_sample: 9900,
+        }];
+        let padded = pad_chunks(&chunks, 10000, 500);
+        assert_eq!(padded[0].start_sample, 0);
+        assert_eq!(padded[0].end_sample, 10000);
+    }
+
+    #[test]
+    fn merge_combines_close_chunks() {
+        let chunks = vec![
+            SpeechChunk {
+                start_sample: 0,
+                end_sample: 1000,
+            },
+            SpeechChunk {
+                start_sample: 1100,
+                end_sample: 2000,
+            },
+            SpeechChunk {
+                start_sample: 5000,
+                end_sample: 6000,
+            },
+        ];
+        let merged = merge_close_chunks(&chunks, 200);
+        assert_eq!(merged.len(), 2);
+        assert_eq!(merged[0].start_sample, 0);
+        assert_eq!(merged[0].end_sample, 2000);
+        assert_eq!(merged[1].start_sample, 5000);
+    }
+
+    #[test]
+    fn split_cuts_long_chunks() {
+        let samples = vec![0.0f32; 80000]; // 5 seconds at 16kHz
+        let chunks = vec![SpeechChunk {
+            start_sample: 0,
+            end_sample: 80000,
+        }];
+        let split = split_long_chunks(&samples, &chunks, 2.0);
+        assert!(split.len() >= 2);
+        for chunk in &split {
+            assert!(chunk.duration_secs() <= 2.5); // some tolerance for cut point
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index aab885e..d2e2ca0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -311,6 +311,10 @@ enum Command {
         /// Path to speaker embedding model (ONNX)
         #[arg(long, env = "DIARIZE_EMBEDDING_MODEL")]
         diarize_embedding_model: Option<String>,
+
+        /// Path to Silero VAD model for speech-aware segmentation (avoids mid-word cuts)
+        #[arg(long, env = "VAD_MODEL")]
+        vad_model: Option<String>,
     },
 }
 
@@ -370,6 +374,7 @@ async fn main() -> Result<()> {
             speakers,
             diarize_segmentation_model,
             diarize_embedding_model,
+            vad_model,
         } => {
             check_ffmpeg()?;
 
@@ -508,6 +513,7 @@
                 speakers,
                 diarize_segmentation_model: diarize_segmentation_model.clone(),
                 diarize_embedding_model: diarize_embedding_model.clone(),
+                vad_model: vad_model.clone(),
             };
 
             run_pipeline(engine.as_ref(), config).await?;
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 5ea7963..1c00c56 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -53,6 +53,9 @@ pub struct PipelineConfig {
     pub diarize_segmentation_model: Option<String>,
     #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))]
     pub diarize_embedding_model: Option<String>,
+    /// Path to Silero VAD model for speech-aware segmentation (sherpa-onnx only)
+    #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))]
+    pub vad_model: Option<String>,
 }
 pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> Result<()> {
@@ -87,6 +90,14 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R
 
     #[allow(unused_mut)]
     let mut transcript = if should_segment {
+        // Use VAD-based segmentation when available (sherpa-onnx), fall back to FFmpeg silencedetect
+        #[cfg(feature = "sherpa-onnx")]
+        if let Some(ref vad_model) = config.vad_model {
+            transcribe_vad_segmented(engine, input_path, vad_model, &config).await?
+        } else {
+            transcribe_segmented(engine, input_path, total_duration, &config).await?
+        }
+        #[cfg(not(feature = "sherpa-onnx"))]
         transcribe_segmented(engine, input_path, total_duration, &config).await?
     } else {
         transcribe_with_spinner("Transcribing...", engine.transcribe_path(input_path)).await?
@@ -246,6 +257,72 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R
     Ok(())
 }
 
+#[cfg(feature = "sherpa-onnx")]
+async fn transcribe_vad_segmented(
+    engine: &dyn Transcriber,
+    wav_path: &Path,
+    vad_model: &str,
+    config: &PipelineConfig,
+) -> Result<Transcript> {
+    use crate::audio::vad;
+    use crate::audio::wav::read_wav_bytes;
+
+    eprintln!("Running VAD-based speech segmentation...");
+
+    // Read audio samples for VAD
+    let wav_bytes = std::fs::read(wav_path)
+        .with_context(|| format!("Failed to read: {}", wav_path.display()))?;
+    let samples = read_wav_bytes(&wav_bytes)?;
+
+    let chunks = vad::vad_segment(&samples, vad_model, config.max_segment_secs as f32)?;
+
+    eprintln!("Found {} speech chunks (VAD).", chunks.len());
+
+    if chunks.is_empty() {
+        eprintln!("No speech detected.");
+        return Ok(Transcript {
+            segments: Vec::new(),
+        });
+    }
+
+    let mut all_segments: Vec<Segment> = Vec::new();
+
+    for (i, chunk) in chunks.iter().enumerate() {
+        eprintln!(
+            " Transcribing chunk {}/{} ({:.1}s - {:.1}s, {:.1}s)...",
+            i + 1,
+            chunks.len(),
+            chunk.start_secs(),
+            chunk.end_secs(),
+            chunk.duration_secs(),
+        );
+
+        let chunk_samples =
samples[chunk.start_sample..chunk.end_sample].to_vec(); + let transcript = transcribe_with_spinner( + &format!( + "Transcribing chunk {}/{} ({:.1}s)...", + i + 1, + chunks.len(), + chunk.duration_secs(), + ), + engine.transcribe(chunk_samples), + ) + .await?; + + // Offset timestamps by the chunk start time + let offset_ms = (chunk.start_secs() * 1000.0) as i64; + for mut seg in transcript.segments { + seg.start_ms += offset_ms; + seg.end_ms += offset_ms; + all_segments.push(seg); + } + } + + Ok(Transcript { + segments: all_segments, + }) +} + async fn transcribe_segmented( engine: &dyn Transcriber, wav_path: &Path, @@ -408,6 +485,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; @@ -471,6 +549,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; @@ -520,6 +599,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; @@ -571,6 +651,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; From b568a1d8da20a304346f94bc4b096c5bce7f2d40 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 09:38:07 +0200 Subject: [PATCH 3/5] feat: Self-bootstrapping setup command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transcribeit setup — downloads all components for full functionality: - models: default GGML base model from HuggingFace - vad: Silero VAD model (~628KB) for speech-aware segmentation - diarize: pyannote segmentation + wespeaker embedding models - sherpa-libs: platform-specific sherpa-onnx shared libraries (auto-detects macOS/Linux x64/ARM64) Selective install: transcribeit setup -c vad Extended download-model: --vad and --diarize flags Prints env var summary at the end showing what to add to .env. 
All downloads are idempotent (skip if already present).
---
 src/main.rs | 369 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 358 insertions(+), 11 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index d2e2ca0..b00380b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -174,6 +174,19 @@ enum OutputFormatArg {
     Srt,
 }
 
+#[derive(Debug, Clone, ValueEnum)]
+enum SetupComponent {
+    /// Default STT models (GGML base)
+    Models,
+    /// Silero VAD model for speech-aware segmentation
+    Vad,
+    /// Speaker diarization models (segmentation + embedding)
+    Diarize,
+    /// sherpa-onnx shared libraries for the current platform
+    #[value(name = "sherpa-libs")]
+    SherpaLibs,
+}
+
 #[derive(Parser)]
 #[command(name = "transcribeit", about = "Transcribe audio files")]
 struct Cli {
@@ -184,6 +197,21 @@ struct Cli {
 #[derive(Subcommand)]
 #[allow(clippy::large_enum_variant)]
 enum Command {
+    /// Download and install all components for full functionality
+    Setup {
+        /// Install only a specific component
+        #[arg(short, long)]
+        component: Option<SetupComponent>,
+
+        /// Directory for models (overrides MODEL_CACHE_DIR)
+        #[arg(short, long)]
+        output_dir: Option<PathBuf>,
+
+        /// Hugging Face token for model downloads
+        #[arg(short = 't', long, env = "HF_TOKEN")]
+        hf_token: Option<String>,
+    },
+
     /// Download a Whisper model
     DownloadModel {
         /// Model size to download
@@ -201,6 +229,14 @@
         /// Hugging Face token (optional, or set HF_TOKEN env var)
         #[arg(short = 't', long, env = "HF_TOKEN")]
         hf_token: Option<String>,
+
+        /// Also download VAD model (silero_vad.onnx)
+        #[arg(long)]
+        vad: bool,
+
+        /// Also download diarization models (segmentation + embedding)
+        #[arg(long)]
+        diarize: bool,
     },
 
     /// List downloaded models
@@ -325,24 +361,75 @@ async fn main() -> Result<()> {
     let cli = Cli::parse();
 
     match cli.command {
+        Command::Setup {
+            component,
+            output_dir,
+            hf_token,
+        } => {
+            let components = match component {
+                Some(c) => vec![c],
+                None => vec![
+                    SetupComponent::Models,
+                    SetupComponent::Vad,
+ SetupComponent::Diarize, + SetupComponent::SherpaLibs, + ], + }; + + let mut summary: Vec<(&str, String)> = Vec::new(); + + for comp in &components { + match comp { + SetupComponent::Models => { + let status = setup_models(output_dir.clone(), hf_token.as_deref()).await?; + summary.push(("models", status)); + } + SetupComponent::Vad => { + let status = setup_vad(output_dir.clone()).await?; + summary.push(("vad", status)); + } + SetupComponent::Diarize => { + let status = setup_diarize(output_dir.clone()).await?; + summary.push(("diarize", status)); + } + SetupComponent::SherpaLibs => { + let status = setup_sherpa_libs().await?; + summary.push(("sherpa-libs", status)); + } + } + } + + print_setup_summary(&summary); + } + Command::DownloadModel { model_size, format, output_dir, hf_token, - } => match format { - ModelFormat::Ggml => { - download_model(&model_size, output_dir, hf_token.as_deref()).await?; + vad, + diarize, + } => { + match format { + ModelFormat::Ggml => { + download_model(&model_size, output_dir.clone(), hf_token.as_deref()).await?; + } + ModelFormat::Onnx => { + #[cfg(feature = "sherpa-onnx")] + download_onnx_model(&model_size, output_dir.clone()).await?; + #[cfg(not(feature = "sherpa-onnx"))] + anyhow::bail!( + "ONNX model download requires the 'sherpa-onnx' feature. Build with: cargo build --features sherpa-onnx" + ); + } } - ModelFormat::Onnx => { - #[cfg(feature = "sherpa-onnx")] - download_onnx_model(&model_size, output_dir).await?; - #[cfg(not(feature = "sherpa-onnx"))] - anyhow::bail!( - "ONNX model download requires the 'sherpa-onnx' feature. 
Build with: cargo build --features sherpa-onnx"
+                    );
+                }
+            }
-        },
+            if vad {
+                setup_vad(output_dir.clone()).await?;
+            }
+            if diarize {
+                setup_diarize(output_dir).await?;
+            }
+        }
 
         Command::ListModels { dir } => {
             list_models(dir)?;
@@ -837,3 +924,263 @@ fn list_models(dir: Option<PathBuf>) -> Result<()> {
 
     Ok(())
 }
+
+// ── Setup helpers ───────────────────────────────────────────────────────────
+
+const SHERPA_ONNX_VERSION: &str = "v1.12.29";
+
+/// Download a single file with progress bar. Returns "installed" or "already present".
+async fn download_file_with_progress(url: &str, dest: &Path, label: &str) -> Result<String> {
+    if dest.exists() {
+        println!("{label}: already present at {}", dest.display());
+        return Ok("already present".into());
+    }
+
+    if let Some(parent) = dest.parent() {
+        tokio::fs::create_dir_all(parent).await?;
+    }
+
+    println!("Downloading {label}...");
+    println!(" from: {url}");
+    println!(" to: {}", dest.display());
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(url)
+        .send()
+        .await
+        .context("Failed to start download")?;
+
+    if !resp.status().is_success() {
+        anyhow::bail!("Download failed with status: {}", resp.status());
+    }
+
+    let total_size = resp.content_length().unwrap_or(0);
+    let pb = ProgressBar::new(total_size);
+    pb.set_style(
+        ProgressStyle::default_bar()
+            .template("{bar:40.cyan/blue} {bytes}/{total_bytes} ({eta})")?
+            .progress_chars("##-"),
+    );
+
+    let tmp_dest = dest.with_extension("part");
+    let mut file = tokio::fs::File::create(&tmp_dest)
+        .await
+        .context("Failed to create temp file")?;
+
+    let mut stream = resp.bytes_stream();
+    while let Some(chunk) = stream.next().await {
+        let chunk = chunk.context("Error reading download stream")?;
+        file.write_all(&chunk).await.context("Failed to write")?;
+        pb.inc(chunk.len() as u64);
+    }
+
+    file.flush().await?;
+    drop(file);
+
+    tokio::fs::rename(&tmp_dest, dest)
+        .await
+        .context("Failed to finalize download")?;
+
+    pb.finish_and_clear();
+    println!("Done: {}", dest.display());
+    Ok("installed".into())
+}
+
+/// Download and extract a tar.bz2 archive. Returns "installed" or "already present".
+async fn download_and_extract(
+    url: &str,
+    extract_to: &Path,
+    check_dir: &Path,
+    label: &str,
+) -> Result<String> {
+    if check_dir.exists() {
+        println!("{label}: already present at {}", check_dir.display());
+        return Ok("already present".into());
+    }
+
+    tokio::fs::create_dir_all(extract_to).await?;
+
+    println!("Downloading {label}...");
+    println!(" from: {url}");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(url)
+        .send()
+        .await
+        .context("Failed to start download")?;
+
+    if !resp.status().is_success() {
+        anyhow::bail!("Download failed with status: {}", resp.status());
+    }
+
+    let total_size = resp.content_length().unwrap_or(0);
+    let pb = ProgressBar::new(total_size);
+    pb.set_style(
+        ProgressStyle::default_bar()
+            .template("{bar:40.cyan/blue} {bytes}/{total_bytes} ({eta})")?
+            .progress_chars("##-"),
+    );
+
+    let tmp = tempfile::Builder::new()
+        .suffix(".tar.bz2")
+        .tempfile_in(extract_to)
+        .context("Failed to create temp file")?;
+    let tmp_path = tmp.path().to_path_buf();
+
+    {
+        let mut file = tokio::fs::File::create(&tmp_path).await?;
+        let mut stream = resp.bytes_stream();
+        while let Some(chunk) = stream.next().await {
+            let chunk = chunk.context("Error reading download stream")?;
+            file.write_all(&chunk).await?;
+            pb.inc(chunk.len() as u64);
+        }
+        file.flush().await?;
+    }
+
+    pb.finish_and_clear();
+    println!("Extracting...");
+
+    let extract_dir = extract_to.to_path_buf();
+    tokio::task::spawn_blocking(move || {
+        let file = std::fs::File::open(&tmp_path).context("Failed to open archive")?;
+        let decoder = bzip2::read::BzDecoder::new(file);
+        let mut archive = tar::Archive::new(decoder);
+        archive.unpack(&extract_dir).context("Failed to extract")?;
+        let _ = std::fs::remove_file(&tmp_path);
+        Ok::<(), anyhow::Error>(())
+    })
+    .await??;
+
+    println!("Done: {}", check_dir.display());
+    Ok("installed".into())
+}
+
+async fn setup_models(output_dir: Option<PathBuf>, hf_token: Option<&str>) -> Result<String> {
+    let dir = output_dir.unwrap_or_else(models_dir);
+    let dest = dir.join("ggml-base.bin");
+    if dest.exists() {
+        println!("models: already present (ggml-base.bin)");
+        return Ok("already present".into());
+    }
+    download_model(&ModelSize::Base, Some(dir), hf_token).await?;
+    Ok("installed (ggml-base.bin)".into())
+}
+
+async fn setup_vad(output_dir: Option<PathBuf>) -> Result<String> {
+    let dir = output_dir.unwrap_or_else(models_dir);
+    let dest = dir.join("silero_vad.onnx");
+    download_file_with_progress(
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx",
+        &dest,
+        "VAD model (silero_vad.onnx)",
+    )
+    .await
+}
+
+async fn setup_diarize(output_dir: Option<PathBuf>) -> Result<String> {
+    let dir = output_dir.unwrap_or_else(models_dir);
+    let mut parts = Vec::new();
+
+    // Segmentation model (tar.bz2)
+    let seg_dir =
dir.join("sherpa-onnx-pyannote-segmentation-3-0");
+    let seg_status = download_and_extract(
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2",
+        &dir,
+        &seg_dir,
+        "diarize segmentation model",
+    ).await?;
+    parts.push(format!("segmentation: {seg_status}"));
+
+    // Embedding model (single file)
+    let emb_dest = dir.join("wespeaker_en_voxceleb_CAM++.onnx");
+    let emb_status = download_file_with_progress(
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_en_voxceleb_CAM%2B%2B.onnx",
+        &emb_dest,
+        "diarize embedding model (wespeaker)",
+    ).await?;
+    parts.push(format!("embedding: {emb_status}"));
+
+    Ok(parts.join(", "))
+}
+
+async fn setup_sherpa_libs() -> Result<String> {
+    let os = std::env::consts::OS;
+    let arch = std::env::consts::ARCH;
+
+    let archive_suffix = match (os, arch) {
+        ("macos", _) => "osx-universal2-shared",
+        ("linux", "x86_64") => "linux-x86_64-shared",
+        ("linux", "aarch64") => "linux-aarch64-shared",
+        _ => anyhow::bail!(
+            "Unsupported platform: {os}-{arch}. Download sherpa-onnx shared libraries manually."
+ ), + }; + + let archive_name = format!("sherpa-onnx-{SHERPA_ONNX_VERSION}-{archive_suffix}"); + let url = format!( + "https://github.com/k2-fsa/sherpa-onnx/releases/download/{SHERPA_ONNX_VERSION}/{archive_name}.tar.bz2" + ); + + let vendor_dir = PathBuf::from("vendor"); + let check_dir = vendor_dir.join(&archive_name); + + let status = download_and_extract( + &url, + &vendor_dir, + &check_dir, + "sherpa-onnx shared libraries", + ) + .await?; + + if status == "installed" { + let lib_dir = check_dir.join("lib"); + eprintln!( + "\nAdd to .env:\n SHERPA_ONNX_LIB_DIR={}\n", + lib_dir.display() + ); + } + + Ok(format!("{status} ({archive_suffix})")) +} + +fn print_setup_summary(summary: &[(&str, String)]) { + println!("\n=== Setup Summary ==="); + for (name, status) in summary { + println!(" {name:<14} {status}"); + } + + let dir = models_dir(); + println!("\nAdd to .env (if not already set):"); + println!(" MODEL_CACHE_DIR={}", dir.display()); + + let vad_path = dir.join("silero_vad.onnx"); + if vad_path.exists() { + println!(" VAD_MODEL={}", vad_path.display()); + } + + let seg_path = dir.join("sherpa-onnx-pyannote-segmentation-3-0/model.onnx"); + if seg_path.exists() { + println!(" DIARIZE_SEGMENTATION_MODEL={}", seg_path.display()); + } + + let emb_path = dir.join("wespeaker_en_voxceleb_CAM++.onnx"); + if emb_path.exists() { + println!(" DIARIZE_EMBEDDING_MODEL={}", emb_path.display()); + } + + // Check for sherpa-onnx libs in vendor/ + if let Ok(entries) = std::fs::read_dir("vendor") { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() && path.join("lib").exists() { + println!(" SHERPA_ONNX_LIB_DIR={}", path.join("lib").display()); + break; + } + } + } + + println!(); +} From 039ebe821739ff59d0d3d16c0f7ce0f27b03d743 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 14:25:19 +0200 Subject: [PATCH 4/5] chore: Add BSL 1.1 license MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Business Source License 1.1: - Free for non-commercial and evaluation use - Commercial/production use requires a separate license - Converts to Apache 2.0 on 2030-03-16 All dependencies verified compatible (MIT, Apache-2.0, BSD, ISC, Unlicense — no GPL/copyleft). --- Cargo.toml | 2 ++ LICENSE | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 8 ++++++++ 3 files changed, 68 insertions(+) create mode 100644 LICENSE diff --git a/Cargo.toml b/Cargo.toml index a8acee7..f5e3a03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,8 @@ name = "transcribeit" version = "1.1.0" edition = "2024" +license = "LicenseRef-BSL-1.1" +license-file = "LICENSE" [profile.release] opt-level = 3 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..649141a --- /dev/null +++ b/LICENSE @@ -0,0 +1,58 @@ +Business Source License 1.1 + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +"Business Source License" is a trademark of MariaDB Corporation Ab. + +Parameters + +Licensor: TranscriptIntel +Licensed Work: transcribeit + The Licensed Work is (c) 2026 TranscriptIntel +Additional Use Grant: You may use the Licensed Work for non-commercial + and evaluation purposes without a license. + Production use in a commercial setting requires + a separate commercial license from the Licensor. +Change Date: 2030-03-16 +Change License: Apache License, Version 2.0 + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. The +Licensor may make an Additional Use Grant, above, permitting limited +production use. + +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. 
+ +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. + +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may vary +for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. + +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). + +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. diff --git a/README.md b/README.md index 5cb0c7f..fe88c95 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,14 @@ DIARIZE_SEGMENTATION_MODEL=.cache/sherpa-onnx-pyannote-segmentation-3-0/model.on DIARIZE_EMBEDDING_MODEL=.cache/wespeaker_en_voxceleb_CAM++.onnx ``` +## License + +This project is licensed under the [Business Source License 1.1](LICENSE). 
+ +- **Free** for non-commercial and evaluation use +- **Commercial/production use** requires a separate license — contact [TranscriptIntel](https://github.com/transcriptintel) +- Converts to **Apache 2.0** on March 16, 2030 + ## Documentation See the [docs](docs/) folder for detailed documentation: From 9857591ff4aabb59960649f400dac6e8afa9b692 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 14:28:47 +0200 Subject: [PATCH 5/5] chore: Bump version to 1.2.0, fix cargo license warning, whisper-rs 0.16 API --- Cargo.lock | 2 +- Cargo.toml | 3 +-- src/engines/model_cache.rs | 17 +++++++---------- src/engines/whisper_local.rs | 31 ++++++++----------------------- 4 files changed, 17 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a12cfa7..265fba4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1856,7 +1856,7 @@ dependencies = [ [[package]] name = "transcribeit" -version = "1.1.0" +version = "1.2.0" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index f5e3a03..258f59e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,7 @@ [package] name = "transcribeit" -version = "1.1.0" +version = "1.2.0" edition = "2024" -license = "LicenseRef-BSL-1.1" license-file = "LICENSE" [profile.release] diff --git a/src/engines/model_cache.rs b/src/engines/model_cache.rs index 5783ccc..544dfc6 100644 --- a/src/engines/model_cache.rs +++ b/src/engines/model_cache.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::os::raw::{c_char, c_void}; use std::sync::{Arc, Mutex}; use anyhow::{Context, Result}; @@ -10,17 +9,15 @@ pub struct ModelCache { } impl ModelCache { - #[inline] - unsafe extern "C" fn whisper_log_silencer( - _level: u32, - _text: *const c_char, - _user_data: *mut c_void, - ) { - } - fn silence_whisper_logs() { + unsafe extern "C" fn noop( + _level: std::os::raw::c_uint, + _text: *const std::os::raw::c_char, + _user_data: *mut std::os::raw::c_void, + ) { + } unsafe { - 
whisper_rs::set_log_callback(Some(Self::whisper_log_silencer), std::ptr::null_mut()); + whisper_rs::set_log_callback(Some(noop), std::ptr::null_mut()); } } diff --git a/src/engines/whisper_local.rs b/src/engines/whisper_local.rs index 894f1bf..4420c58 100644 --- a/src/engines/whisper_local.rs +++ b/src/engines/whisper_local.rs @@ -30,7 +30,6 @@ impl Transcriber for WhisperLocal { let cache = Arc::clone(&self.cache); let language = self.language.clone(); - // whisper-rs is synchronous and CPU-heavy; run on a blocking thread tokio::task::spawn_blocking(move || { let ctx = cache.get_or_load(&model_path)?; @@ -52,29 +51,15 @@ impl Transcriber for WhisperLocal { .full(params, &audio_samples) .context("Whisper inference failed")?; - let num_segments = state - .full_n_segments() - .context("Failed to get segment count")?; - - let mut segments = Vec::new(); - for i in 0..num_segments { - let text = state - .full_get_segment_text(i) - .context("Failed to get segment text")?; - let start = state - .full_get_segment_t0(i) - .context("Failed to get segment start")?; - let end = state - .full_get_segment_t1(i) - .context("Failed to get segment end")?; - - segments.push(Segment { - start_ms: start * 10, - end_ms: end * 10, - text, + let segments: Vec<Segment> = state + .as_iter() + .map(|seg| Segment { + start_ms: seg.start_timestamp() * 10, + end_ms: seg.end_timestamp() * 10, + text: seg.to_string(), speaker: None, - }); - } + }) + .collect(); Ok(Transcript { segments }) })