From 2c738b2f09779eaadc0ecc931f5d15137faa84e2 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 08:50:48 +0200 Subject: [PATCH 1/5] feat: Speaker diarization via sherpa-onnx C API Two-pass pipeline: whisper.cpp transcribes, then sherpa-onnx diarizes the same audio and assigns speaker labels by timestamp overlap. - Raw FFI bindings to sherpa-onnx offline speaker diarization C API (not yet exposed by the sherpa-onnx Rust crate) - Dedicated worker thread for diarization (C types are !Send/!Sync) - CLI: --speakers N --diarize-segmentation-model --diarize-embedding-model - Env vars: DIARIZE_SEGMENTATION_MODEL, DIARIZE_EMBEDDING_MODEL - Speaker labels in VTT (), SRT ([Speaker 0]), and manifest JSON - Segment struct gains optional speaker field - Gated behind sherpa-onnx feature flag --- src/diarize/ffi.rs | 92 +++++++++++++++++ src/diarize/mod.rs | 191 +++++++++++++++++++++++++++++++++++ src/engines/openai_api.rs | 3 + src/engines/sherpa_onnx.rs | 3 + src/engines/whisper_local.rs | 1 + src/main.rs | 20 ++++ src/output/manifest.rs | 2 + src/output/srt.rs | 7 +- src/output/vtt.rs | 8 ++ src/pipeline.rs | 64 +++++++++++- src/transcriber.rs | 2 + 11 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 src/diarize/ffi.rs create mode 100644 src/diarize/mod.rs diff --git a/src/diarize/ffi.rs b/src/diarize/ffi.rs new file mode 100644 index 0000000..ca63d85 --- /dev/null +++ b/src/diarize/ffi.rs @@ -0,0 +1,92 @@ +//! Raw FFI bindings for sherpa-onnx speaker diarization C API. +#![allow(dead_code)] +//! These are not exposed by sherpa-onnx-sys 0.1.10 so we bind them directly. 
+ +use std::os::raw::{c_char, c_float, c_int}; + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig { + pub model: *const c_char, +} + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerSegmentationModelConfig { + pub pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig, + pub num_threads: c_int, + pub debug: c_int, + pub provider: *const c_char, +} + +#[repr(C)] +pub struct SherpaOnnxSpeakerEmbeddingExtractorConfig { + pub model: *const c_char, + pub num_threads: c_int, + pub debug: c_int, + pub provider: *const c_char, +} + +#[repr(C)] +pub struct SherpaOnnxFastClusteringConfig { + pub num_clusters: c_int, + pub threshold: c_float, +} + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerDiarizationConfig { + pub segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig, + pub embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig, + pub clustering: SherpaOnnxFastClusteringConfig, + pub min_duration_on: c_float, + pub min_duration_off: c_float, +} + +#[repr(C)] +pub struct SherpaOnnxOfflineSpeakerDiarizationSegment { + pub start: c_float, + pub end: c_float, + pub speaker: c_int, +} + +// Opaque types +pub enum SherpaOnnxOfflineSpeakerDiarization {} +pub enum SherpaOnnxOfflineSpeakerDiarizationResult {} + +unsafe extern "C" { + pub fn SherpaOnnxCreateOfflineSpeakerDiarization( + config: *const SherpaOnnxOfflineSpeakerDiarizationConfig, + ) -> *const SherpaOnnxOfflineSpeakerDiarization; + + pub fn SherpaOnnxDestroyOfflineSpeakerDiarization( + sd: *const SherpaOnnxOfflineSpeakerDiarization, + ); + + pub fn SherpaOnnxOfflineSpeakerDiarizationGetSampleRate( + sd: *const SherpaOnnxOfflineSpeakerDiarization, + ) -> c_int; + + pub fn SherpaOnnxOfflineSpeakerDiarizationProcess( + sd: *const SherpaOnnxOfflineSpeakerDiarization, + samples: *const c_float, + n: c_int, + ) -> *const SherpaOnnxOfflineSpeakerDiarizationResult; + + pub fn SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers( + r: *const 
SherpaOnnxOfflineSpeakerDiarizationResult, + ) -> c_int; + + pub fn SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments( + r: *const SherpaOnnxOfflineSpeakerDiarizationResult, + ) -> c_int; + + pub fn SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime( + r: *const SherpaOnnxOfflineSpeakerDiarizationResult, + ) -> *const SherpaOnnxOfflineSpeakerDiarizationSegment; + + pub fn SherpaOnnxOfflineSpeakerDiarizationDestroySegment( + s: *const SherpaOnnxOfflineSpeakerDiarizationSegment, + ); + + pub fn SherpaOnnxOfflineSpeakerDiarizationDestroyResult( + r: *const SherpaOnnxOfflineSpeakerDiarizationResult, + ); +} diff --git a/src/diarize/mod.rs b/src/diarize/mod.rs new file mode 100644 index 0000000..4296baa --- /dev/null +++ b/src/diarize/mod.rs @@ -0,0 +1,191 @@ +mod ffi; + +use std::ffi::CString; +use std::path::Path; +use std::sync::mpsc; +use std::thread::JoinHandle; + +use anyhow::{Context, Result}; +use tokio::sync::oneshot; + +use crate::transcriber::Transcript; + +/// A speaker-labeled time span from diarization. +#[derive(Debug, Clone)] +pub struct DiarizedSegment { + pub start_secs: f32, + pub end_secs: f32, + pub speaker: i32, +} + +/// Request sent to the diarization worker thread. +struct DiarizeRequest { + samples: Vec, + response_tx: oneshot::Sender>>, +} + +/// Speaker diarization engine using sherpa-onnx's C API directly. +/// Runs on a dedicated thread (the C types are !Send/!Sync). +pub struct Diarizer { + request_tx: mpsc::Sender, + _thread: JoinHandle<()>, +} + +impl Diarizer { + /// Create a new diarizer. 
+ /// + /// - `segmentation_model`: path to pyannote segmentation ONNX model + /// - `embedding_model`: path to speaker embedding ONNX model + /// - `num_speakers`: number of speakers (must be > 0) + pub fn new( + segmentation_model: &Path, + embedding_model: &Path, + num_speakers: i32, + ) -> Result { + let seg_model = segmentation_model.to_path_buf(); + let emb_model = embedding_model.to_path_buf(); + + let (init_tx, init_rx) = std::sync::mpsc::channel::>(); + let (request_tx, request_rx) = mpsc::channel::(); + + let thread = std::thread::spawn(move || { + let seg_model_c = + CString::new(seg_model.to_string_lossy().as_bytes()).unwrap_or_default(); + let emb_model_c = + CString::new(emb_model.to_string_lossy().as_bytes()).unwrap_or_default(); + let provider_c = CString::new("cpu").unwrap(); + + let config = ffi::SherpaOnnxOfflineSpeakerDiarizationConfig { + segmentation: ffi::SherpaOnnxOfflineSpeakerSegmentationModelConfig { + pyannote: ffi::SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig { + model: seg_model_c.as_ptr(), + }, + num_threads: std::thread::available_parallelism() + .map(|n| n.get() as i32) + .unwrap_or(4), + debug: 0, + provider: provider_c.as_ptr(), + }, + embedding: ffi::SherpaOnnxSpeakerEmbeddingExtractorConfig { + model: emb_model_c.as_ptr(), + num_threads: std::thread::available_parallelism() + .map(|n| n.get() as i32) + .unwrap_or(4), + debug: 0, + provider: provider_c.as_ptr(), + }, + clustering: ffi::SherpaOnnxFastClusteringConfig { + num_clusters: num_speakers, + threshold: 0.5, + }, + min_duration_on: 0.3, + min_duration_off: 0.5, + }; + + let sd = unsafe { ffi::SherpaOnnxCreateOfflineSpeakerDiarization(&config) }; + if sd.is_null() { + init_tx + .send(Err(anyhow::anyhow!( + "Failed to create speaker diarization engine" + ))) + .ok(); + return; + } + + init_tx.send(Ok(())).ok(); + + while let Ok(req) = request_rx.recv() { + let result = unsafe { process_diarization(sd, &req.samples) }; + req.response_tx.send(result).ok(); + } + + 
unsafe { + ffi::SherpaOnnxDestroyOfflineSpeakerDiarization(sd); + } + }); + + init_rx + .recv() + .context("Diarization worker thread exited during init")??; + + Ok(Self { + request_tx, + _thread: thread, + }) + } + + /// Run diarization on audio samples (16kHz mono f32). + pub async fn diarize(&self, samples: Vec) -> Result> { + let (response_tx, response_rx) = oneshot::channel(); + self.request_tx + .send(DiarizeRequest { + samples, + response_tx, + }) + .map_err(|_| anyhow::anyhow!("Diarization worker thread has stopped"))?; + response_rx + .await + .context("Diarization worker dropped without responding")? + } +} + +unsafe fn process_diarization( + sd: *const ffi::SherpaOnnxOfflineSpeakerDiarization, + samples: &[f32], +) -> Result> { + let result = unsafe { + ffi::SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.as_ptr(), samples.len() as i32) + }; + + if result.is_null() { + anyhow::bail!("Diarization returned null result"); + } + + let num_segments = + unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(result) }; + let sorted = unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(result) }; + + let mut segments = Vec::with_capacity(num_segments as usize); + if !sorted.is_null() { + for i in 0..num_segments as isize { + let seg = unsafe { &*sorted.offset(i) }; + segments.push(DiarizedSegment { + start_secs: seg.start, + end_secs: seg.end, + speaker: seg.speaker, + }); + } + unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationDestroySegment(sorted) }; + } + + unsafe { ffi::SherpaOnnxOfflineSpeakerDiarizationDestroyResult(result) }; + + Ok(segments) +} + +/// Assign speaker labels to transcript segments by timestamp overlap. 
+pub fn assign_speakers(transcript: &mut Transcript, diarized: &[DiarizedSegment]) { + for seg in &mut transcript.segments { + let seg_start = seg.start_ms as f32 / 1000.0; + let seg_end = seg.end_ms as f32 / 1000.0; + + // Find the diarization segment with maximum overlap + let mut best_speaker = None; + let mut best_overlap = 0.0f32; + + for d in diarized { + let overlap_start = seg_start.max(d.start_secs); + let overlap_end = seg_end.min(d.end_secs); + let overlap = (overlap_end - overlap_start).max(0.0); + + if overlap > best_overlap { + best_overlap = overlap; + best_speaker = Some(d.speaker); + } + } + + if let Some(speaker) = best_speaker { + seg.speaker = Some(format!("Speaker {}", speaker)); + } + } +} diff --git a/src/engines/openai_api.rs b/src/engines/openai_api.rs index 91609fc..8fa8310 100644 --- a/src/engines/openai_api.rs +++ b/src/engines/openai_api.rs @@ -194,6 +194,7 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { start_ms: (s.start * 1000.0) as i64, end_ms: (s.end * 1000.0) as i64, text: s.text, + speaker: None, }) .collect(), }; @@ -206,6 +207,7 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { start_ms: 0, end_ms: 0, text: resp.text, + speaker: None, }], }; } @@ -216,6 +218,7 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { start_ms: 0, end_ms: 0, text: String::from_utf8_lossy(body).into_owned(), + speaker: None, }], } } diff --git a/src/engines/sherpa_onnx.rs b/src/engines/sherpa_onnx.rs index 4b16d62..68fb6ac 100644 --- a/src/engines/sherpa_onnx.rs +++ b/src/engines/sherpa_onnx.rs @@ -214,6 +214,7 @@ fn recognize(recognizer: &OfflineRecognizer, samples: &[f32]) -> Result Vec { start_ms: (timestamps[seg_start_idx] * 1000.0) as i64, end_ms: (timestamps[i] * 1000.0) as i64, text: trimmed.to_string(), + speaker: None, }); } seg_start_idx = i + 1; @@ -264,6 +266,7 @@ fn tokens_to_segments(tokens: &[String], timestamps: &[f32]) -> Vec { start_ms: (timestamps[0] * 1000.0) as i64, end_ms: 
(timestamps[len.saturating_sub(1)] * 1000.0) as i64, text: trimmed.to_string(), + speaker: None, }]; } } diff --git a/src/engines/whisper_local.rs b/src/engines/whisper_local.rs index 377f30d..894f1bf 100644 --- a/src/engines/whisper_local.rs +++ b/src/engines/whisper_local.rs @@ -72,6 +72,7 @@ impl Transcriber for WhisperLocal { start_ms: start * 10, end_ms: end * 10, text, + speaker: None, }); } diff --git a/src/main.rs b/src/main.rs index 4128ae6..aab885e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,6 @@ mod audio; +#[cfg(feature = "sherpa-onnx")] +mod diarize; mod engines; mod output; mod pipeline; @@ -297,6 +299,18 @@ enum Command { /// Normalize audio with ffmpeg loudnorm before transcription #[arg(long)] normalize: bool, + + /// Number of speakers for diarization (requires sherpa-onnx feature and models) + #[arg(long)] + speakers: Option, + + /// Path to speaker segmentation model (pyannote ONNX) + #[arg(long, env = "DIARIZE_SEGMENTATION_MODEL")] + diarize_segmentation_model: Option, + + /// Path to speaker embedding model (ONNX) + #[arg(long, env = "DIARIZE_EMBEDDING_MODEL")] + diarize_embedding_model: Option, }, } @@ -353,6 +367,9 @@ async fn main() -> Result<()> { request_timeout_secs, retry_wait_base_secs, retry_wait_max_secs, + speakers, + diarize_segmentation_model, + diarize_embedding_model, } => { check_ffmpeg()?; @@ -488,6 +505,9 @@ async fn main() -> Result<()> { upload_as_mp3, segment_concurrency, normalize_audio: normalize, + speakers, + diarize_segmentation_model: diarize_segmentation_model.clone(), + diarize_embedding_model: diarize_embedding_model.clone(), }; run_pipeline(engine.as_ref(), config).await?; diff --git a/src/output/manifest.rs b/src/output/manifest.rs index 5ed79b6..77d84b1 100644 --- a/src/output/manifest.rs +++ b/src/output/manifest.rs @@ -34,6 +34,8 @@ pub struct SegmentInfo { pub start_secs: f64, pub end_secs: f64, pub text: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub speaker: Option, } 
#[derive(Serialize)] diff --git a/src/output/srt.rs b/src/output/srt.rs index 537c9ca..6b653b1 100644 --- a/src/output/srt.rs +++ b/src/output/srt.rs @@ -23,7 +23,11 @@ pub fn write_srt(transcript: &Transcript, writer: &mut impl Write) -> Result<()> format_timestamp(segment.start_ms), format_timestamp(segment.end_ms) )?; - writeln!(writer, "{}", segment.text.trim())?; + if let Some(ref spk) = segment.speaker { + writeln!(writer, "[{}] {}", spk, segment.text.trim())?; + } else { + writeln!(writer, "{}", segment.text.trim())?; + } writeln!(writer)?; } @@ -43,6 +47,7 @@ mod tests { start_ms: 0, end_ms: 1234, text: " Hello ".to_string(), + speaker: None, }], }; diff --git a/src/output/vtt.rs b/src/output/vtt.rs index 363f3a9..044b3d7 100644 --- a/src/output/vtt.rs +++ b/src/output/vtt.rs @@ -26,6 +26,9 @@ pub fn write_vtt(transcript: &Transcript, writer: &mut impl Write) -> Result<()> format_timestamp(segment.start_ms), format_timestamp(segment.end_ms) )?; + if let Some(ref spk) = segment.speaker { + write!(writer, "", spk)?; + } writeln!(writer, "{}", segment.text.trim())?; writeln!(writer)?; } @@ -47,11 +50,13 @@ mod tests { start_ms: 0, end_ms: 1234, text: " Hello ".to_string(), + speaker: None, }, Segment { start_ms: 5_000, end_ms: 6_100, text: "world".to_string(), + speaker: None, }, ], }; @@ -75,16 +80,19 @@ mod tests { start_ms: 0, end_ms: 10, text: "A".to_string(), + speaker: None, }, Segment { start_ms: 10, end_ms: 20, text: "B".to_string(), + speaker: None, }, Segment { start_ms: 20, end_ms: 30, text: "C".to_string(), + speaker: None, }, ], }; diff --git a/src/pipeline.rs b/src/pipeline.rs index e112b8d..5ea7963 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -47,6 +47,12 @@ pub struct PipelineConfig { pub upload_as_mp3: bool, pub segment_concurrency: usize, pub normalize_audio: bool, + #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))] + pub speakers: Option, + #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))] + pub 
diarize_segmentation_model: Option, + #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))] + pub diarize_embedding_model: Option, } pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> Result<()> { @@ -79,12 +85,53 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R ); } - let transcript = if should_segment { + #[allow(unused_mut)] + let mut transcript = if should_segment { transcribe_segmented(engine, input_path, total_duration, &config).await? } else { transcribe_with_spinner("Transcribing...", engine.transcribe_path(input_path)).await? }; + // Speaker diarization (if requested) + #[cfg(feature = "sherpa-onnx")] + if let Some(num_speakers) = config.speakers { + let seg_model = config + .diarize_segmentation_model + .as_deref() + .context("--diarize-segmentation-model is required when --speakers is set")?; + let emb_model = config + .diarize_embedding_model + .as_deref() + .context("--diarize-embedding-model is required when --speakers is set")?; + + eprintln!("Running speaker diarization ({num_speakers} speakers)..."); + + let diarizer = crate::diarize::Diarizer::new( + std::path::Path::new(seg_model), + std::path::Path::new(emb_model), + num_speakers, + )?; + + // Read the audio samples for diarization + let wav_bytes = std::fs::read(input_path).with_context(|| { + format!( + "Failed to read audio for diarization: {}", + input_path.display() + ) + })?; + let diarize_samples = crate::audio::wav::read_wav_bytes(&wav_bytes)?; + let diarized = + transcribe_with_spinner("Diarizing...", diarizer.diarize(diarize_samples)).await?; + + eprintln!( + "Found {} speaker segments across {} speakers.", + diarized.len(), + num_speakers + ); + + crate::diarize::assign_speakers(&mut transcript, &diarized); + } + let processing_time = started.elapsed().as_secs_f64(); // Output @@ -180,6 +227,7 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R start_secs: s.start_ms as f64 / 1000.0, 
end_secs: s.end_ms as f64 / 1000.0, text: s.text.trim().to_string(), + speaker: s.speaker.clone(), }) .collect(), stats: Stats { @@ -357,6 +405,9 @@ mod tests { upload_as_mp3: false, segment_concurrency: 1, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -417,6 +468,9 @@ mod tests { upload_as_mp3: false, segment_concurrency: 1, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -463,6 +517,9 @@ mod tests { upload_as_mp3: false, segment_concurrency: 1, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -511,6 +568,9 @@ mod tests { upload_as_mp3: true, segment_concurrency: 2, normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, }, ) .await?; @@ -567,6 +627,7 @@ mod tests { start_ms: 0, end_ms: 1000, text: "integration".to_string(), + speaker: None, }], }) } @@ -582,6 +643,7 @@ mod tests { start_ms: 0, end_ms: 1000, text: "integration".to_string(), + speaker: None, }], }) } diff --git a/src/transcriber.rs b/src/transcriber.rs index d668997..b1b8e6d 100644 --- a/src/transcriber.rs +++ b/src/transcriber.rs @@ -5,10 +5,12 @@ use std::path::Path; use crate::audio::wav::read_wav_bytes; /// A segment of transcribed text with timing info. +#[derive(Default)] pub struct Segment { pub start_ms: i64, pub end_ms: i64, pub text: String, + pub speaker: Option, } /// Full transcript result. 
From 329d6fc6e20e5b2184eec9166f2e37a77cb1fca9 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 09:30:08 +0200 Subject: [PATCH 2/5] feat: VAD-based speech segmentation, dependency upgrades, docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VAD segmentation via Silero VAD (sherpa-onnx): - Detects speech boundaries instead of silence dB thresholds - 250ms padding protects word boundaries from clipping - Merges chunks separated by <200ms gaps - Splits long chunks at lowest-energy points (not arbitrary positions) - Use --vad-model path/to/silero_vad.onnx to enable - Falls back to FFmpeg silencedetect when no VAD model Dependency upgrades: - whisper-rs 0.12 → 0.16 (iterator API, updated log callback) - reqwest 0.12 → 0.13 - indicatif 0.17 → 0.18 - bzip2 0.5 → 0.6 (pure Rust) Comprehensive docs update for VAD, diarization, and env vars. --- Cargo.lock | 689 ++++++++++++++++++++++----------- Cargo.toml | 8 +- README.md | 14 +- docs/architecture.md | 65 +++- docs/cli-reference.md | 41 +- docs/performance-benchmarks.md | 20 +- docs/provider-behavior.md | 16 +- docs/troubleshooting.md | 43 ++ src/audio/mod.rs | 2 + src/audio/vad.rs | 273 +++++++++++++ src/main.rs | 6 + src/pipeline.rs | 81 ++++ 12 files changed, 1015 insertions(+), 243 deletions(-) create mode 100644 src/audio/vad.rs diff --git a/Cargo.lock b/Cargo.lock index 61da3df..a12cfa7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,6 +84,28 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "aws-lc-rs" +version = "1.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.38.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "base64" version = "0.22.1" @@ -92,16 +114,14 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bindgen" -version = "0.69.5" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ "bitflags", "cexpr", "clang-sys", "itertools", - "lazy_static", - "lazycell", "log", "prettyplease", "proc-macro2", @@ -110,7 +130,6 @@ dependencies = [ "rustc-hash", "shlex", "syn", - "which", ] [[package]] @@ -133,21 +152,11 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ - "cc", - "pkg-config", + "libbz2-rs-sys", ] [[package]] @@ -157,9 +166,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + 
[[package]] name = "cexpr" version = "0.6.0" @@ -175,6 +192,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "clang-sys" version = "1.8.1" @@ -241,17 +264,26 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "console" -version = "0.15.11" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" dependencies = [ "encode_unicode", "libc", - "once_cell", "unicode-width", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -297,6 +329,12 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "either" version = "1.15.0" @@ -369,21 +407,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "foreign-types" -version = "0.3.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.2.2" @@ -466,8 +489,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", ] [[package]] @@ -478,7 +517,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", "wasip3", ] @@ -529,15 +568,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "home" -version = "0.5.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "hound" version = "3.5.1" @@ -621,22 +651,6 @@ dependencies = [ "tower-service", ] -[[package]] -name = "hyper-tls" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" -dependencies = [ - "bytes", - "http-body-util", - "hyper", - "hyper-util", - 
"native-tls", - "tokio", - "tokio-native-tls", - "tower-service", -] - [[package]] name = "hyper-util" version = "0.1.20" @@ -784,14 +798,14 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.17.11" +version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ "console", - "number_prefix", "portable-atomic", "unicode-width", + "unit-prefix", "web-time", ] @@ -833,26 +847,46 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] -name = "js-sys" -version = "0.3.91" +name = "jni" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" dependencies = [ - "once_cell", - "wasm-bindgen", + "cesu8", + "cfg-if", + "combine", + "jni-sys", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", ] [[package]] -name = "lazy_static" -version = "1.5.0" +name = "jni-sys" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] -name = "lazycell" -version = "1.3.0" +name = "jobserver" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.91" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +dependencies = [ + "once_cell", + "wasm-bindgen", +] [[package]] name = "leb128fmt" @@ -860,6 +894,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.183" @@ -888,12 +928,6 @@ dependencies = [ "redox_syscall 0.7.3", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -921,6 +955,12 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + [[package]] name = "memchr" version = "2.8.0" @@ -960,23 +1000,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "native-tls" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" -dependencies = [ - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nom" version = "7.1.3" @@ -987,12 +1010,6 @@ dependencies = [ "minimal-lexical", ] -[[package]] -name = "number_prefix" -version = "0.4.0" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - [[package]] name = "once_cell" version = "1.21.4" @@ -1005,50 +1022,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" -[[package]] -name = "openssl" -version = "0.10.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" -dependencies = [ - "bitflags", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "openssl-probe" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" -[[package]] -name = "openssl-sys" -version = "0.9.112" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "parking_lot" version = "0.12.5" @@ -1090,12 +1069,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" - [[package]] name = "plain" version = "0.2.3" @@ -1117,6 +1090,15 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ppv-lite86" 
+version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1136,6 +1118,62 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "aws-lc-rs", + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.59.0", +] + [[package]] name = "quote" version = "1.0.45" @@ -1145,12 +1183,47 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "r-efi" version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.9.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1200,9 +1273,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "reqwest" -version = "0.12.28" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" dependencies = [ "base64", "bytes", @@ -1215,22 +1288,22 @@ dependencies = [ "http-body-util", "hyper", "hyper-rustls", - "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", - "native-tls", "percent-encoding", "pin-project-lite", + "quinn", + "rustls", "rustls-pki-types", + "rustls-platform-verifier", "serde", "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-native-tls", + "tokio-rustls", "tokio-util", "tower", "tower-http", @@ -1258,22 +1331,9 @@ dependencies = [ [[package]] name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - -[[package]] -name = "rustix" -version = "0.38.44" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustix" @@ -1284,7 +1344,7 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.12.1", + "linux-raw-sys", "windows-sys 0.61.2", ] @@ -1294,6 +1354,7 @@ version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ + "aws-lc-rs", "once_cell", "rustls-pki-types", "rustls-webpki", @@ -1301,21 +1362,62 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pki-types" version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ + "web-time", "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" +dependencies = [ + "core-foundation 0.10.1", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + [[package]] name = "rustls-webpki" version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -1328,10 +1430,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] -name = "ryu" -version = "1.0.23" +name = "same-file" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] [[package]] name = "schannel" @@ -1420,18 +1525,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - [[package]] name = "sherpa-onnx" version = "0.1.10" @@ -1577,10 +1670,50 @@ dependencies = [ "fastrand", "getrandom 0.4.2", "once_cell", - "rustix 1.1.4", + "rustix", "windows-sys 0.61.2", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -1591,6 +1724,21 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.50.0" @@ -1619,16 +1767,6 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.4" @@ -1772,6 +1910,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "untrusted" version = "0.9.0" @@ -1803,10 +1947,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] -name = "vcpkg" -version = "0.2.15" +name = "walkdir" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] [[package]] name = "want" @@ -1924,9 +2072,9 @@ dependencies = [ [[package]] name = "wasm-streams" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb" dependencies = [ "futures-util", "js-sys", @@ -1968,36 +2116,43 @@ dependencies = [ ] [[package]] -name = "which" -version = "4.4.2" +name = "webpki-root-certs" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" dependencies = [ - "either", - "home", - "once_cell", - "rustix 0.38.44", + "rustls-pki-types", ] [[package]] name = "whisper-rs" -version = "0.12.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c597ac8a9d5c4719fee232abc871da184ea50a4fea38d2d00348fd95072b2b0" +checksum = "2088172d00f936c348d6a72f488dc2660ab3f507263a195df308a3c2383229f6" dependencies = [ "whisper-rs-sys", ] [[package]] name = "whisper-rs-sys" -version = "0.10.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d22f00ed0995463eecc34ef89905845f6bf6fd37ea70789fed180520050da8f8" +checksum = "6986c0fe081241d391f09b9a071fbcbb59720c3563628c3c829057cf69f2a56f" dependencies = [ "bindgen", "cfg-if", "cmake", "fs_extra", + "semver", +] + 
+[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", ] [[package]] @@ -2035,13 +2190,22 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2050,7 +2214,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2062,34 +2226,67 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 
0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2102,24 +2299,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" 
+version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -2227,7 +2448,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.1.4", + "rustix", ] [[package]] @@ -2253,6 +2474,26 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zerocopy" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zerofrom" version = "0.1.6" diff --git a/Cargo.toml b/Cargo.toml index 32676c6..a8acee7 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -25,8 +25,8 @@ dotenvy = "0.15" futures-util = "0.3" hound = "3.5" glob = "0.3" -indicatif = "0.17" -reqwest = { version = "0.12", features = ["json", "multipart", "stream"] } +indicatif = "0.18" +reqwest = { version = "0.13", features = ["json", "multipart", "stream"] } serde = { version = "1", features = ["derive"] } serde_json = "1" tempfile = "3" @@ -34,7 +34,7 @@ regex = "1" tokio = { version = "1", features = ["full"] } sherpa-onnx = { version = "0.1", optional = true } tar = "0.4" -bzip2 = "0.5" +bzip2 = "0.6" libc = "0.2" -whisper-rs = "0.12" +whisper-rs = "0.16" bytes = "1.11.1" diff --git a/README.md b/README.md index 714df9a..5cb0c7f 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,14 @@ transcribeit run -p azure -i recording.mp3 \ # Force language and normalize before transcription transcribeit run -i recording.wav -m base --language en --normalize + +# VAD-based segmentation (speech-aware, avoids mid-word cuts) +transcribeit run -p sherpa-onnx -m base -i recording.mp3 --vad-model .cache/silero_vad.onnx + +# Speaker diarization (2 speakers) +transcribeit run -i interview.mp3 -m base --speakers 2 \ + --diarize-segmentation-model .cache/sherpa-onnx-pyannote-segmentation-3-0/model.onnx \ + --diarize-embedding-model .cache/wespeaker_en_voxceleb_CAM++.onnx ``` ## Features @@ -72,7 +80,8 @@ transcribeit run -i recording.wav -m base --language en --normalize - **Model aliases** — `-m base`, `-m tiny`, etc. resolve from `MODEL_CACHE_DIR` for both `local` and `sherpa-onnx` providers. The sherpa-onnx resolver also supports glob matching (e.g., `-m moonshine-base`, `-m sense-voice`). - **Language hinting** — Pass `--language` to force local and API transcription language. - **FFmpeg audio normalization** — Optional `--normalize` to apply loudnorm before transcription. -- **Silence-based segmentation** — Splits long audio at silence boundaries for better accuracy and API compatibility. 
+**VAD-based segmentation** — Speech-aware segmentation via Silero VAD (sherpa-onnx). Detects speech boundaries with padding and gap merging to avoid mid-word cuts. Use `--vad-model .cache/silero_vad.onnx`. +- **Silence-based segmentation** — Fallback segmentation via FFmpeg `silencedetect` for API providers or when VAD model is not available. - **sherpa-onnx auto-segmentation** — Whisper ONNX models only support ≤30s per call; segmentation is enabled automatically. - **sherpa-onnx is optional** — Enabled by default as a Cargo feature. Build without it: `cargo build --no-default-features`. - **Auto-split for API limits** — Files exceeding 25MB are automatically segmented when using remote providers. @@ -102,6 +111,9 @@ TRANSCRIBEIT_MAX_RETRIES=5 TRANSCRIBEIT_REQUEST_TIMEOUT_SECS=120 TRANSCRIBEIT_RETRY_WAIT_BASE_SECS=10 TRANSCRIBEIT_RETRY_WAIT_MAX_SECS=120 +VAD_MODEL=.cache/silero_vad.onnx +DIARIZE_SEGMENTATION_MODEL=.cache/sherpa-onnx-pyannote-segmentation-3-0/model.onnx +DIARIZE_EMBEDDING_MODEL=.cache/wespeaker_en_voxceleb_CAM++.onnx ``` ## Documentation diff --git a/docs/architecture.md b/docs/architecture.md index bbdaed2..ea7ab34 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -12,11 +12,15 @@ src/ ├── audio/ │ ├── extract.rs # FFmpeg audio conversion │ ├── segment.rs # Silence detection and audio splitting +│ ├── vad.rs # VAD-based speech segmentation (Silero VAD via sherpa-onnx) │ └── wav.rs # WAV reading and encoding (shared) +├── diarize/ +│ ├── mod.rs # Speaker diarization engine and speaker assignment +│ └── ffi.rs # Raw C FFI bindings for sherpa-onnx speaker diarization ├── output/ -│ ├── vtt.rs # WebVTT subtitle writer -│ ├── srt.rs # SRT subtitle writer -│ └── manifest.rs # JSON manifest writer +│ ├── vtt.rs # WebVTT subtitle writer (supports `<v Speaker N>` tags) +│ ├── srt.rs # SRT subtitle writer (supports [Speaker N] labels) +│ └── manifest.rs # JSON manifest writer (includes speaker labels) └── engines/ ├── whisper_local.rs # Local whisper.cpp 
via whisper-rs ├── sherpa_onnx.rs # Local sherpa-onnx engine (auto-detects Whisper, Moonshine, SenseVoice) @@ -67,21 +71,32 @@ Input file (any format) │ └─ Auto: sherpa-onnx provider (always segments; max 30s per chunk) │ ├─ If segmenting: - │ ├─ detect_silence() via FFmpeg silencedetect filter - │ ├─ compute_segments() at silence midpoints - │ ├─ split_audio() into temp WAV files - │ └─ Transcribe each segment, offset timestamps (concurrently for API providers) + │ ├─ VAD path (when --vad-model is set and sherpa-onnx feature is enabled): + │ │ ├─ read_wav_bytes() → f32 PCM samples + │ │ ├─ vad_segment(): detect speech → pad 250ms → merge gaps <200ms → split long chunks at low-energy points + │ │ ├─ Extract chunk samples directly from memory + │ │ └─ Transcribe each chunk via transcribe(), offset timestamps + │ ├─ FFmpeg fallback (no VAD model, or sherpa-onnx feature disabled): + │ │ ├─ detect_silence() via FFmpeg silencedetect filter + │ │ ├─ compute_segments() at silence midpoints + │ │ ├─ split_audio() into temp WAV files + │ │ └─ Transcribe each segment, offset timestamps (concurrently for API providers) │ ├─ If not segmenting: │ ├─ Local: read_wav() → transcribe() directly │ └─ API: transcribe_path() with prepared file │ ├─ normalize_audio? ──→ optional loudnorm filter in ffmpeg conversion pipeline + ├─ Speaker diarization? (when --speakers N is set) + │ ├─ read audio samples for diarization + │ ├─ Diarizer.diarize() → speaker-labeled time spans + │ └─ assign_speakers() overlays speaker labels onto transcript segments + │ └─ Output: ├─ Text to stdout or `.txt` - ├─ VTT to file or stdout - ├─ SRT to file or stdout - └─ JSON manifest to output directory + ├─ VTT to file or stdout (with `<v Speaker N>` tags when diarized) + ├─ SRT to file or stdout (with `[Speaker N]` labels when diarized) + └─ JSON manifest to output directory (includes speaker field per segment) ``` Temporary files use the `tempfile` crate and are cleaned up automatically on drop. 
@@ -184,6 +199,36 @@ cargo build --release --no-default-features This removes the sherpa-onnx provider and eliminates the need for `SHERPA_ONNX_LIB_DIR`. +## VAD-based segmentation (`audio/vad.rs`) + +When `--vad-model` is set and the `sherpa-onnx` feature is enabled, the pipeline uses Silero VAD (via sherpa-onnx) for speech-aware segmentation instead of FFmpeg's `silencedetect` filter. This avoids the main problem with silence-based splitting: mid-word cuts. + +The VAD pipeline (`vad_segment()`) has four stages: + +1. **Detect speech** -- Silero VAD processes 512-sample frames (~32ms at 16kHz) to find speech boundaries with sample-level precision. +2. **Pad 250ms** -- Each speech chunk is extended by 250ms on both sides to protect word boundaries at the edges. +3. **Merge gaps <200ms** -- Adjacent chunks separated by less than 200ms are merged to avoid splitting within short pauses. +4. **Split long chunks** -- Chunks exceeding `--max-segment-secs` are split at the lowest-energy point within a 1-second search window around the target cut point. + +The VAD approach works directly on in-memory PCM samples, so there is no need for intermediate temp files during segmentation. Each chunk is transcribed via `engine.transcribe()` with sample slices, and timestamps are offset by the chunk start time. + +When `--vad-model` is not set, segmentation falls back to FFmpeg `silencedetect` (the original behavior). + +## Speaker diarization (`diarize/`) + +Speaker diarization identifies which speaker is talking at each point in the audio. It requires the `sherpa-onnx` feature and two ONNX models: + +- **Segmentation model** (`--diarize-segmentation-model`): a pyannote segmentation ONNX model that detects speaker change points. +- **Embedding model** (`--diarize-embedding-model`): a speaker embedding ONNX model that clusters voice characteristics. 
+ +The `Diarizer` follows the same dedicated worker thread pattern as `SherpaOnnxEngine`: the C FFI types are not `Send`/`Sync`, so they live on a plain `std::thread` and communicate via channels. Diarization requests are sent through `mpsc` and results come back through `tokio::sync::oneshot`. + +After transcription completes, `assign_speakers()` overlays speaker labels onto transcript segments by finding the diarization segment with the maximum time overlap for each transcript segment. Speaker labels appear as: + +- **VTT**: `<v Speaker 0>text` +- **SRT**: `[Speaker 0] text` +- **Manifest JSON**: `"speaker": "Speaker 0"` field on each segment + +## Adding a new engine + +1. Create `src/engines/your_engine.rs` diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 992a474..907e5b1 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -119,9 +119,22 @@ These options apply to OpenAI/Azure providers: | `--min-silence-duration` | Minimum silence duration in seconds | `0.8` | | `--max-segment-secs` | Maximum segment length in seconds | `600` | | `--segment-concurrency` | Max parallel segment requests (API providers only) | `2` | +| `--vad-model` | Path to Silero VAD ONNX model (`silero_vad.onnx`) for speech-aware segmentation | `VAD_MODEL` env var | When using `openai` or `azure` providers, files exceeding 25MB are automatically segmented even without `--segment`. When using `sherpa-onnx`, segmentation is always enabled with a maximum segment length of 30 seconds. +When `--vad-model` is set and segmentation is needed, VAD-based segmentation is used instead of FFmpeg `silencedetect`. VAD detects actual speech boundaries using Silero VAD, avoiding mid-word cuts. It pads chunks by 250ms, merges gaps shorter than 200ms, and splits long chunks at low-energy points. This requires the `sherpa-onnx` feature to be enabled. When `--vad-model` is not set, the original FFmpeg silence-based segmentation is used as a fallback. 
+ +#### Speaker diarization options + +| Option | Description | Default | +|--------|-------------|---------| +| `--speakers` | Number of speakers for diarization | disabled | +| `--diarize-segmentation-model` | Path to pyannote segmentation ONNX model | `DIARIZE_SEGMENTATION_MODEL` env var | +| `--diarize-embedding-model` | Path to speaker embedding ONNX model | `DIARIZE_EMBEDDING_MODEL` env var | + +When `--speakers N` is set, speaker diarization runs after transcription to label each segment with a speaker identity. Both `--diarize-segmentation-model` and `--diarize-embedding-model` are required. Speaker labels appear in VTT output as `<v Speaker 0>`, in SRT output as `[Speaker 0]`, and in manifest JSON as a `"speaker"` field on each segment. Requires the `sherpa-onnx` feature. + ## Output behavior During transcription, the CLI shows an animated spinner in the terminal so you can see progress while waiting for Whisper/API calls to complete. @@ -155,6 +168,9 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | none | | `AZURE_DEPLOYMENT_NAME` | Azure deployment name | `whisper` | | `AZURE_API_VERSION` | Azure API version | `2024-06-01` | +| `VAD_MODEL` | Path to Silero VAD ONNX model for speech-aware segmentation | none | +| `DIARIZE_SEGMENTATION_MODEL` | Path to pyannote segmentation ONNX model for speaker diarization | none | +| `DIARIZE_EMBEDDING_MODEL` | Path to speaker embedding ONNX model for speaker diarization | none | | `TRANSCRIBEIT_MAX_RETRIES` | Maximum 429 retries | `5` | | `TRANSCRIBEIT_REQUEST_TIMEOUT_SECS` | API request timeout in seconds | `120` | | `TRANSCRIBEIT_RETRY_WAIT_BASE_SECS` | Base retry wait time in seconds | `10` | @@ -211,6 +227,28 @@ transcribeit run -i lecture.mp4 -m base -f srt -o ./output transcribeit run -i noisy.wav -m .cache/ggml-base.bin \ --segment --silence-threshold -30 --min-silence-duration 0.5 +# VAD-based segmentation (avoids mid-word cuts) 
+transcribeit run -p sherpa-onnx -i lecture.mp4 -m base.en \ + --vad-model /path/to/silero_vad.onnx -f vtt -o ./output + +# VAD with env var (set VAD_MODEL in .env) +VAD_MODEL=/path/to/silero_vad.onnx transcribeit run -p sherpa-onnx -i recording.mp3 -m base.en + +# Speaker diarization (2 speakers) +transcribeit run -p sherpa-onnx -i meeting.mp4 -m base.en \ + --speakers 2 \ + --diarize-segmentation-model /path/to/segmentation.onnx \ + --diarize-embedding-model /path/to/embedding.onnx \ + -f vtt -o ./output + +# VAD + speaker diarization combined +transcribeit run -p sherpa-onnx -i interview.wav -m base.en \ + --vad-model /path/to/silero_vad.onnx \ + --speakers 2 \ + --diarize-segmentation-model /path/to/segmentation.onnx \ + --diarize-embedding-model /path/to/embedding.onnx \ + -f srt -o ./output + # OpenAI API OPENAI_API_KEY=sk-... transcribeit run -p openai -i recording.mp3 @@ -267,7 +305,8 @@ When `--output-dir` is specified, the following files are created: "index": 0, "start_secs": 0.0, "end_secs": 5.25, - "text": "Hello, welcome to the meeting." + "text": "Hello, welcome to the meeting.", + "speaker": "Speaker 0" } ], "stats": { diff --git a/docs/performance-benchmarks.md b/docs/performance-benchmarks.md index 20d88f4..6ae4337 100644 --- a/docs/performance-benchmarks.md +++ b/docs/performance-benchmarks.md @@ -61,16 +61,23 @@ Record: ### 3. 
Segmentation impact ```bash +# FFmpeg silencedetect segmentation time transcribeit run -p openai -i --segment --segment-concurrency 2 -f text -o ./output time transcribeit run -p openai -i --segment --segment-concurrency 1 --max-segment-secs 300 -f text -o ./output -# sherpa-onnx always segments at 30s max + +# sherpa-onnx with FFmpeg silencedetect (default, always segments at 30s max) time transcribeit run -p sherpa-onnx -i -m base -f text -o ./output + +# sherpa-onnx with VAD-based segmentation +time transcribeit run -p sherpa-onnx -i -m base --vad-model /path/to/silero_vad.onnx -f text -o ./output ``` Record: - total segment count - max queue wait - request-level retry counts +- segmentation method used (VAD vs silencedetect) +- transcript quality at segment boundaries (check for mid-word cuts) ### 4. I/O + conversion overhead @@ -117,6 +124,17 @@ These results were measured on a 5-minute medical interview recording. - Moonshine provides a compact alternative but is slower than Whisper at the same size tier. - For highest quality where speed is not critical, use `large-v3-turbo` with local whisper.cpp. +### VAD vs FFmpeg silencedetect segmentation + +VAD-based segmentation (Silero VAD via `--vad-model`) and FFmpeg `silencedetect` produce different segment boundaries. Key differences to observe when benchmarking: + +- **Segment boundary quality:** VAD detects speech regions directly, so segment boundaries align with actual speech. FFmpeg `silencedetect` splits at silence midpoints, which can cut mid-word if silence gaps are short or thresholds are mistuned. +- **Segment count:** VAD typically produces more segments (one per speech region after merging) while `silencedetect` produces fewer, longer segments based on silence gaps. +- **Processing overhead:** VAD runs on the audio samples in-memory (fast, no subprocess). FFmpeg `silencedetect` runs as a subprocess and requires parsing its stderr output. 
+- **Transcript quality:** VAD-segmented transcripts tend to have fewer artifacts at segment boundaries because chunks start and end at speech boundaries with 250ms padding, rather than at arbitrary silence midpoints.
+
+When comparing, use the same audio file and model to isolate the effect of the segmentation method on overall transcript quality and timing.
+
 ## CI/automatable baseline
 
 For now, treat these as manual benchmarks in a fixed environment.
diff --git a/docs/provider-behavior.md b/docs/provider-behavior.md
index 69fa566..c0c0a53 100644
--- a/docs/provider-behavior.md
+++ b/docs/provider-behavior.md
@@ -26,6 +26,8 @@ This project supports four providers. They share the same input/output surface,
 - Transcription runs in-process on a dedicated worker thread using the sherpa-onnx C library via FFI.
 - C++ stderr warnings from the sherpa-onnx library are suppressed during inference to keep terminal output clean.
 - Whisper ONNX models only support audio of 30 seconds or less per call. The pipeline automatically enables segmentation and caps `--max-segment-secs` at 30, regardless of user-supplied values.
+- **VAD-based segmentation:** When `--vad-model` is set (or `VAD_MODEL` env var), Silero VAD is used for speech-aware segmentation instead of FFmpeg `silencedetect`. This detects actual speech boundaries and avoids mid-word cuts. The VAD pipeline pads chunks by 250ms, merges gaps shorter than 200ms, and splits long chunks at low-energy points. This is the recommended segmentation method for sherpa-onnx. When no VAD model is provided, the pipeline falls back to FFmpeg `silencedetect`.
+- **Speaker diarization:** When `--speakers N` is set along with `--diarize-segmentation-model` and `--diarize-embedding-model`, speaker labels are assigned to each transcript segment after transcription. Labels appear in VTT (`<v Speaker 0>`), SRT (`[Speaker 0]`), and manifest JSON output.
- **SenseVoice limitation:** emotion and audio event detection tags are stripped by the sherpa-onnx C API and are not available in the output. - Segment concurrency is always 1 (sequential processing). - No external API key is required. @@ -65,8 +67,18 @@ This project supports four providers. They share the same input/output surface, Both are local engines that run without network access. They differ in the model format and inference backend: -- **Local** uses GGML models via `whisper.cpp` (`whisper-rs` binding). Supports all Whisper model sizes. -- **Sherpa-ONNX** uses ONNX models via the `sherpa-onnx` C library. Supports three model architectures (Whisper, Moonshine, SenseVoice) with automatic detection. Whisper ONNX supports all sizes except `large-v3`. Requires auto-segmentation at 30s due to Whisper ONNX limitations. The `sherpa-onnx` feature is optional (enabled by default); build without it using `cargo build --no-default-features`. +- **Local** uses GGML models via `whisper.cpp` (`whisper-rs` binding). Supports all Whisper model sizes. Uses FFmpeg `silencedetect` for segmentation. +- **Sherpa-ONNX** uses ONNX models via the `sherpa-onnx` C library. Supports three model architectures (Whisper, Moonshine, SenseVoice) with automatic detection. Whisper ONNX supports all sizes except `large-v3`. Requires auto-segmentation at 30s due to Whisper ONNX limitations. Supports VAD-based segmentation via `--vad-model` for cleaner speech boundaries (recommended). Also supports speaker diarization via `--speakers`. The `sherpa-onnx` feature is optional (enabled by default); build without it using `cargo build --no-default-features`. 
+ +### Segmentation: VAD vs FFmpeg silencedetect + +| | VAD (Silero) | FFmpeg silencedetect | +|---|---|---| +| **Availability** | Requires `sherpa-onnx` feature + `--vad-model` | Always available | +| **Boundary quality** | Speech-aware; avoids mid-word cuts | Silence-based; may cut mid-word | +| **Approach** | Detects speech regions, pads, merges, splits at low-energy | Detects silence gaps, splits at midpoints | +| **Config flags** | `--vad-model`, `--max-segment-secs` | `--silence-threshold`, `--min-silence-duration`, `--max-segment-secs` | +| **Best for** | Local sherpa-onnx transcription | API providers, or when no VAD model is available | ### OpenAI vs Azure diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c254bfa..c93babd 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -56,6 +56,49 @@ Fix: - Verify with: `transcribeit list-models` (ONNX models appear with an `[onnx]` tag) - The model resolver supports partial name matching (e.g., `-m moonshine-base`, `-m sense-voice`). +### VAD model not found or fails to load + +Symptoms: +- `Failed to create VAD (check vad_model_path)` +- `No such file or directory` when using `--vad-model` + +Fix: +- Verify that the path provided to `--vad-model` (or the `VAD_MODEL` env var) points to a valid `silero_vad.onnx` file. +- Download the Silero VAD model from the [sherpa-onnx releases](https://github.com/k2-fsa/sherpa-onnx/releases). Look for `silero_vad.onnx` in the VAD model archives. +- Ensure the `sherpa-onnx` feature is enabled (it is by default). VAD-based segmentation is not available without it. +- The VAD model path can be set in your `.env` file: + +```bash +# .env +VAD_MODEL=/path/to/silero_vad.onnx +``` + +If you do not have a VAD model, omit `--vad-model` and the pipeline will fall back to FFmpeg `silencedetect` for segmentation. 
+ +### Speaker diarization model issues + +Symptoms: +- `Failed to create speaker diarization engine` +- `--diarize-segmentation-model is required when --speakers is set` +- `--diarize-embedding-model is required when --speakers is set` + +Fix: +- When using `--speakers N`, both `--diarize-segmentation-model` and `--diarize-embedding-model` are required. +- Ensure both model paths point to valid ONNX files: + - **Segmentation model:** a pyannote speaker segmentation ONNX model. + - **Embedding model:** a speaker embedding extraction ONNX model. +- Download compatible models from the [sherpa-onnx speaker diarization releases](https://github.com/k2-fsa/sherpa-onnx/releases). +- The model paths can be set via environment variables in your `.env` file: + +```bash +# .env +DIARIZE_SEGMENTATION_MODEL=/path/to/segmentation.onnx +DIARIZE_EMBEDDING_MODEL=/path/to/embedding.onnx +``` + +- Requires the `sherpa-onnx` feature to be enabled. +- The `--speakers` value must be greater than 0. + ### Building without sherpa-onnx If you do not need the sherpa-onnx provider and want to avoid installing the shared libraries: diff --git a/src/audio/mod.rs b/src/audio/mod.rs index 37669f2..24ccc59 100644 --- a/src/audio/mod.rs +++ b/src/audio/mod.rs @@ -1,3 +1,5 @@ pub mod extract; pub mod segment; +#[cfg(feature = "sherpa-onnx")] +pub mod vad; pub mod wav; diff --git a/src/audio/vad.rs b/src/audio/vad.rs new file mode 100644 index 0000000..9947ed7 --- /dev/null +++ b/src/audio/vad.rs @@ -0,0 +1,273 @@ +//! VAD-based speech segmentation using sherpa-onnx's Silero VAD. +//! Produces clean speech boundaries that avoid mid-word cuts. + +use anyhow::{Context, Result}; +use sherpa_onnx::{SileroVadModelConfig, VadModelConfig, VoiceActivityDetector}; + +const SAMPLE_RATE: u32 = 16_000; +const FRAME_SIZE: usize = 512; // ~32ms at 16kHz + +/// A speech chunk with sample-level boundaries. 
+#[derive(Debug, Clone)]
+pub struct SpeechChunk {
+    pub start_sample: usize,
+    pub end_sample: usize,
+}
+
+impl SpeechChunk {
+    pub fn start_secs(&self) -> f64 {
+        self.start_sample as f64 / SAMPLE_RATE as f64
+    }
+
+    pub fn end_secs(&self) -> f64 {
+        self.end_sample as f64 / SAMPLE_RATE as f64
+    }
+
+    pub fn duration_secs(&self) -> f64 {
+        (self.end_sample - self.start_sample) as f64 / SAMPLE_RATE as f64
+    }
+}
+
+/// Detect speech segments in audio using Silero VAD.
+pub fn detect_speech_chunks(samples: &[f32], vad_model_path: &str) -> Result<Vec<SpeechChunk>> {
+    let config = VadModelConfig {
+        silero_vad: SileroVadModelConfig {
+            model: Some(vad_model_path.to_string()),
+            threshold: 0.5,
+            min_silence_duration: 0.25,
+            min_speech_duration: 0.1,
+            window_size: FRAME_SIZE as i32,
+            max_speech_duration: 30.0,
+        },
+        sample_rate: SAMPLE_RATE as i32,
+        num_threads: 1,
+        provider: Some("cpu".into()),
+        debug: false,
+        ..Default::default()
+    };
+
+    let vad = VoiceActivityDetector::create(&config, 60.0)
+        .context("Failed to create VAD (check vad_model_path)")?;
+
+    let mut chunks = Vec::new();
+    let mut cursor = 0usize;
+
+    while cursor < samples.len() {
+        let end = (cursor + FRAME_SIZE).min(samples.len());
+        let frame = &samples[cursor..end];
+        vad.accept_waveform(frame);
+
+        while let Some(seg) = vad.front() {
+            chunks.push(SpeechChunk {
+                start_sample: seg.start() as usize,
+                end_sample: seg.start() as usize + seg.n() as usize,
+            });
+            vad.pop();
+        }
+
+        cursor = end;
+    }
+
+    vad.flush();
+
+    while let Some(seg) = vad.front() {
+        chunks.push(SpeechChunk {
+            start_sample: seg.start() as usize,
+            end_sample: seg.start() as usize + seg.n() as usize,
+        });
+        vad.pop();
+    }
+
+    Ok(chunks)
+}
+
+/// Add padding around each chunk to protect word boundaries.
+pub fn pad_chunks(
+    chunks: &[SpeechChunk],
+    total_len: usize,
+    pad_samples: usize,
+) -> Vec<SpeechChunk> {
+    chunks
+        .iter()
+        .map(|c| SpeechChunk {
+            start_sample: c.start_sample.saturating_sub(pad_samples),
+            end_sample: (c.end_sample + pad_samples).min(total_len),
+        })
+        .collect()
+}
+
+/// Merge chunks separated by less than max_gap_samples.
+pub fn merge_close_chunks(chunks: &[SpeechChunk], max_gap_samples: usize) -> Vec<SpeechChunk> {
+    if chunks.is_empty() {
+        return Vec::new();
+    }
+
+    let mut sorted = chunks.to_vec();
+    sorted.sort_by_key(|c| c.start_sample);
+
+    let mut merged = Vec::new();
+    let mut cur = sorted[0].clone();
+
+    for next in sorted.into_iter().skip(1) {
+        let gap = next.start_sample.saturating_sub(cur.end_sample);
+        if gap <= max_gap_samples {
+            cur.end_sample = cur.end_sample.max(next.end_sample);
+        } else {
+            merged.push(cur);
+            cur = next;
+        }
+    }
+
+    merged.push(cur);
+    merged
+}
+
+/// Split chunks that exceed max duration, cutting at the lowest-energy point.
+pub fn split_long_chunks(
+    samples: &[f32],
+    chunks: &[SpeechChunk],
+    max_chunk_secs: f32,
+) -> Vec<SpeechChunk> {
+    let max_len = (max_chunk_secs * SAMPLE_RATE as f32) as usize;
+    let mut out = Vec::new();
+
+    for c in chunks {
+        let mut start = c.start_sample;
+        while c.end_sample.saturating_sub(start) > max_len {
+            let target = start + max_len;
+            // Search ±500ms around the target for the quietest spot
+            let search_radius = (SAMPLE_RATE / 2) as usize;
+            let left = target.saturating_sub(search_radius).max(start);
+            let right = (target + search_radius).min(c.end_sample);
+
+            let cut = find_low_energy_cut(samples, left, right).unwrap_or(target);
+
+            out.push(SpeechChunk {
+                start_sample: start,
+                end_sample: cut,
+            });
+            start = cut;
+        }
+
+        if start < c.end_sample {
+            out.push(SpeechChunk {
+                start_sample: start,
+                end_sample: c.end_sample,
+            });
+        }
+    }
+
+    out
+}
+
+/// Find the sample position with the lowest energy in a window.
+fn find_low_energy_cut(samples: &[f32], start: usize, end: usize) -> Option<usize> {
+    let window = 320; // 20ms window
+    if end <= start + window || end > samples.len() {
+        return None;
+    }
+
+    let mut best_pos = None;
+    let mut best_energy = f32::INFINITY;
+
+    let mut i = start;
+    while i + window <= end {
+        let energy: f32 = samples[i..i + window].iter().map(|x| x * x).sum::<f32>() / window as f32;
+
+        if energy < best_energy {
+            best_energy = energy;
+            best_pos = Some(i + window / 2);
+        }
+
+        i += window / 2; // 50% overlap
+    }
+
+    best_pos
+}
+
+/// Full VAD pipeline: detect → pad → merge → split.
+/// Returns clean speech chunks ready for STT.
+pub fn vad_segment(
+    samples: &[f32],
+    vad_model_path: &str,
+    max_chunk_secs: f32,
+) -> Result<Vec<SpeechChunk>> {
+    let raw = detect_speech_chunks(samples, vad_model_path)?;
+
+    // 250ms padding to protect word boundaries
+    let pad_samples = (SAMPLE_RATE as f32 * 0.25) as usize;
+    let padded = pad_chunks(&raw, samples.len(), pad_samples);
+
+    // Merge chunks separated by <200ms gap
+    let merge_gap = (SAMPLE_RATE as f32 * 0.20) as usize;
+    let merged = merge_close_chunks(&padded, merge_gap);
+
+    // Split oversized chunks at low-energy points
+    let final_chunks = split_long_chunks(samples, &merged, max_chunk_secs);
+
+    Ok(final_chunks)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn pad_extends_boundaries() {
+        let chunks = vec![SpeechChunk {
+            start_sample: 1000,
+            end_sample: 5000,
+        }];
+        let padded = pad_chunks(&chunks, 10000, 500);
+        assert_eq!(padded[0].start_sample, 500);
+        assert_eq!(padded[0].end_sample, 5500);
+    }
+
+    #[test]
+    fn pad_clamps_to_bounds() {
+        let chunks = vec![SpeechChunk {
+            start_sample: 100,
+            end_sample: 9900,
+        }];
+        let padded = pad_chunks(&chunks, 10000, 500);
+        assert_eq!(padded[0].start_sample, 0);
+        assert_eq!(padded[0].end_sample, 10000);
+    }
+
+    #[test]
+    fn merge_combines_close_chunks() {
+        let chunks = vec![
+            SpeechChunk {
+                start_sample: 0,
+                end_sample: 1000,
+            },
+            SpeechChunk {
+                start_sample: 1100,
+                end_sample: 2000,
+            },
+            SpeechChunk {
+                start_sample: 5000,
+                end_sample: 6000,
+            },
+        ];
+        let merged = merge_close_chunks(&chunks, 200);
+        assert_eq!(merged.len(), 2);
+        assert_eq!(merged[0].start_sample, 0);
+        assert_eq!(merged[0].end_sample, 2000);
+        assert_eq!(merged[1].start_sample, 5000);
+    }
+
+    #[test]
+    fn split_cuts_long_chunks() {
+        let samples = vec![0.0f32; 80000]; // 5 seconds at 16kHz
+        let chunks = vec![SpeechChunk {
+            start_sample: 0,
+            end_sample: 80000,
+        }];
+        let split = split_long_chunks(&samples, &chunks, 2.0);
+        assert!(split.len() >= 2);
+        for chunk in &split {
+            assert!(chunk.duration_secs() <= 2.5); // some tolerance for cut point
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index aab885e..d2e2ca0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -311,6 +311,10 @@ enum Command {
         /// Path to speaker embedding model (ONNX)
         #[arg(long, env = "DIARIZE_EMBEDDING_MODEL")]
         diarize_embedding_model: Option<String>,
+
+        /// Path to Silero VAD model for speech-aware segmentation (avoids mid-word cuts)
+        #[arg(long, env = "VAD_MODEL")]
+        vad_model: Option<String>,
     },
 }
 
@@ -370,6 +374,7 @@ async fn main() -> Result<()> {
             speakers,
             diarize_segmentation_model,
             diarize_embedding_model,
+            vad_model,
         } => {
             check_ffmpeg()?;
 
@@ -508,6 +513,7 @@
                 speakers,
                 diarize_segmentation_model: diarize_segmentation_model.clone(),
                 diarize_embedding_model: diarize_embedding_model.clone(),
+                vad_model: vad_model.clone(),
             };
 
             run_pipeline(engine.as_ref(), config).await?;
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 5ea7963..1c00c56 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -53,6 +53,9 @@ pub struct PipelineConfig {
     pub diarize_segmentation_model: Option<String>,
     #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))]
     pub diarize_embedding_model: Option<String>,
+    /// Path to Silero VAD model for speech-aware segmentation (sherpa-onnx only)
+    #[cfg_attr(not(feature = "sherpa-onnx"), allow(dead_code))]
+    pub vad_model: Option<String>,
 }
 pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> Result<()> {
@@ -87,6 +90,14 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R
 
     #[allow(unused_mut)]
     let mut transcript = if should_segment {
+        // Use VAD-based segmentation when available (sherpa-onnx), fall back to FFmpeg silencedetect
+        #[cfg(feature = "sherpa-onnx")]
+        if let Some(ref vad_model) = config.vad_model {
+            transcribe_vad_segmented(engine, input_path, vad_model, &config).await?
+        } else {
+            transcribe_segmented(engine, input_path, total_duration, &config).await?
+        }
+        #[cfg(not(feature = "sherpa-onnx"))]
         transcribe_segmented(engine, input_path, total_duration, &config).await?
     } else {
         transcribe_with_spinner("Transcribing...", engine.transcribe_path(input_path)).await?
@@ -246,6 +257,72 @@ pub async fn run_pipeline(engine: &dyn Transcriber, config: PipelineConfig) -> R
     Ok(())
 }
 
+#[cfg(feature = "sherpa-onnx")]
+async fn transcribe_vad_segmented(
+    engine: &dyn Transcriber,
+    wav_path: &Path,
+    vad_model: &str,
+    config: &PipelineConfig,
+) -> Result<Transcript> {
+    use crate::audio::vad;
+    use crate::audio::wav::read_wav_bytes;
+
+    eprintln!("Running VAD-based speech segmentation...");
+
+    // Read audio samples for VAD
+    let wav_bytes = std::fs::read(wav_path)
+        .with_context(|| format!("Failed to read: {}", wav_path.display()))?;
+    let samples = read_wav_bytes(&wav_bytes)?;
+
+    let chunks = vad::vad_segment(&samples, vad_model, config.max_segment_secs as f32)?;
+
+    eprintln!("Found {} speech chunks (VAD).", chunks.len());
+
+    if chunks.is_empty() {
+        eprintln!("No speech detected.");
+        return Ok(Transcript {
+            segments: Vec::new(),
+        });
+    }
+
+    let mut all_segments: Vec<Segment> = Vec::new();
+
+    for (i, chunk) in chunks.iter().enumerate() {
+        eprintln!(
+            " Transcribing chunk {}/{} ({:.1}s - {:.1}s, {:.1}s)...",
+            i + 1,
+            chunks.len(),
+            chunk.start_secs(),
+            chunk.end_secs(),
+            chunk.duration_secs(),
+        );
+
+        let chunk_samples =
samples[chunk.start_sample..chunk.end_sample].to_vec(); + let transcript = transcribe_with_spinner( + &format!( + "Transcribing chunk {}/{} ({:.1}s)...", + i + 1, + chunks.len(), + chunk.duration_secs(), + ), + engine.transcribe(chunk_samples), + ) + .await?; + + // Offset timestamps by the chunk start time + let offset_ms = (chunk.start_secs() * 1000.0) as i64; + for mut seg in transcript.segments { + seg.start_ms += offset_ms; + seg.end_ms += offset_ms; + all_segments.push(seg); + } + } + + Ok(Transcript { + segments: all_segments, + }) +} + async fn transcribe_segmented( engine: &dyn Transcriber, wav_path: &Path, @@ -408,6 +485,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; @@ -471,6 +549,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; @@ -520,6 +599,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; @@ -571,6 +651,7 @@ mod tests { speakers: None, diarize_segmentation_model: None, diarize_embedding_model: None, + vad_model: None, }, ) .await?; From b568a1d8da20a304346f94bc4b096c5bce7f2d40 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 09:38:07 +0200 Subject: [PATCH 3/5] feat: Self-bootstrapping setup command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transcribeit setup — downloads all components for full functionality: - models: default GGML base model from HuggingFace - vad: Silero VAD model (~628KB) for speech-aware segmentation - diarize: pyannote segmentation + wespeaker embedding models - sherpa-libs: platform-specific sherpa-onnx shared libraries (auto-detects macOS/Linux x64/ARM64) Selective install: transcribeit setup -c vad Extended download-model: --vad and --diarize flags Prints env var summary at the end showing what to add to .env. 
All downloads are idempotent (skip if already present).
---
 src/main.rs | 369 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 358 insertions(+), 11 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index d2e2ca0..b00380b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -174,6 +174,19 @@ enum OutputFormatArg {
     Srt,
 }
 
+#[derive(Debug, Clone, ValueEnum)]
+enum SetupComponent {
+    /// Default STT models (GGML base)
+    Models,
+    /// Silero VAD model for speech-aware segmentation
+    Vad,
+    /// Speaker diarization models (segmentation + embedding)
+    Diarize,
+    /// sherpa-onnx shared libraries for the current platform
+    #[value(name = "sherpa-libs")]
+    SherpaLibs,
+}
+
 #[derive(Parser)]
 #[command(name = "transcribeit", about = "Transcribe audio files")]
 struct Cli {
@@ -184,6 +197,21 @@ struct Cli {
 #[derive(Subcommand)]
 #[allow(clippy::large_enum_variant)]
 enum Command {
+    /// Download and install all components for full functionality
+    Setup {
+        /// Install only a specific component
+        #[arg(short, long)]
+        component: Option<SetupComponent>,
+
+        /// Directory for models (overrides MODEL_CACHE_DIR)
+        #[arg(short, long)]
+        output_dir: Option<PathBuf>,
+
+        /// Hugging Face token for model downloads
+        #[arg(short = 't', long, env = "HF_TOKEN")]
+        hf_token: Option<String>,
+    },
+
     /// Download a Whisper model
     DownloadModel {
         /// Model size to download
@@ -201,6 +229,14 @@
         /// Hugging Face token (optional, or set HF_TOKEN env var)
         #[arg(short = 't', long, env = "HF_TOKEN")]
         hf_token: Option<String>,
+
+        /// Also download VAD model (silero_vad.onnx)
+        #[arg(long)]
+        vad: bool,
+
+        /// Also download diarization models (segmentation + embedding)
+        #[arg(long)]
+        diarize: bool,
     },
 
     /// List downloaded models
@@ -325,24 +361,75 @@ async fn main() -> Result<()> {
     let cli = Cli::parse();
 
     match cli.command {
+        Command::Setup {
+            component,
+            output_dir,
+            hf_token,
+        } => {
+            let components = match component {
+                Some(c) => vec![c],
+                None => vec![
+                    SetupComponent::Models,
+                    SetupComponent::Vad,
+ SetupComponent::Diarize, + SetupComponent::SherpaLibs, + ], + }; + + let mut summary: Vec<(&str, String)> = Vec::new(); + + for comp in &components { + match comp { + SetupComponent::Models => { + let status = setup_models(output_dir.clone(), hf_token.as_deref()).await?; + summary.push(("models", status)); + } + SetupComponent::Vad => { + let status = setup_vad(output_dir.clone()).await?; + summary.push(("vad", status)); + } + SetupComponent::Diarize => { + let status = setup_diarize(output_dir.clone()).await?; + summary.push(("diarize", status)); + } + SetupComponent::SherpaLibs => { + let status = setup_sherpa_libs().await?; + summary.push(("sherpa-libs", status)); + } + } + } + + print_setup_summary(&summary); + } + Command::DownloadModel { model_size, format, output_dir, hf_token, - } => match format { - ModelFormat::Ggml => { - download_model(&model_size, output_dir, hf_token.as_deref()).await?; + vad, + diarize, + } => { + match format { + ModelFormat::Ggml => { + download_model(&model_size, output_dir.clone(), hf_token.as_deref()).await?; + } + ModelFormat::Onnx => { + #[cfg(feature = "sherpa-onnx")] + download_onnx_model(&model_size, output_dir.clone()).await?; + #[cfg(not(feature = "sherpa-onnx"))] + anyhow::bail!( + "ONNX model download requires the 'sherpa-onnx' feature. Build with: cargo build --features sherpa-onnx" + ); + } } - ModelFormat::Onnx => { - #[cfg(feature = "sherpa-onnx")] - download_onnx_model(&model_size, output_dir).await?; - #[cfg(not(feature = "sherpa-onnx"))] - anyhow::bail!( - "ONNX model download requires the 'sherpa-onnx' feature. 
Build with: cargo build --features sherpa-onnx"
+                    );
+                }
+            }
-        },
+            if vad {
+                setup_vad(output_dir.clone()).await?;
+            }
+            if diarize {
+                setup_diarize(output_dir).await?;
+            }
+        }
 
         Command::ListModels { dir } => {
             list_models(dir)?;
@@ -837,3 +924,263 @@ fn list_models(dir: Option<PathBuf>) -> Result<()> {
 
     Ok(())
 }
+
+// ── Setup helpers ───────────────────────────────────────────────────────────
+
+const SHERPA_ONNX_VERSION: &str = "v1.12.29";
+
+/// Download a single file with progress bar. Returns "installed" or "already present".
+async fn download_file_with_progress(url: &str, dest: &Path, label: &str) -> Result<String> {
+    if dest.exists() {
+        println!("{label}: already present at {}", dest.display());
+        return Ok("already present".into());
+    }
+
+    if let Some(parent) = dest.parent() {
+        tokio::fs::create_dir_all(parent).await?;
+    }
+
+    println!("Downloading {label}...");
+    println!(" from: {url}");
+    println!(" to: {}", dest.display());
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(url)
+        .send()
+        .await
+        .context("Failed to start download")?;
+
+    if !resp.status().is_success() {
+        anyhow::bail!("Download failed with status: {}", resp.status());
+    }
+
+    let total_size = resp.content_length().unwrap_or(0);
+    let pb = ProgressBar::new(total_size);
+    pb.set_style(
+        ProgressStyle::default_bar()
+            .template("{bar:40.cyan/blue} {bytes}/{total_bytes} ({eta})")?
+            .progress_chars("##-"),
+    );
+
+    let tmp_dest = dest.with_extension("part");
+    let mut file = tokio::fs::File::create(&tmp_dest)
+        .await
+        .context("Failed to create temp file")?;
+
+    let mut stream = resp.bytes_stream();
+    while let Some(chunk) = stream.next().await {
+        let chunk = chunk.context("Error reading download stream")?;
+        file.write_all(&chunk).await.context("Failed to write")?;
+        pb.inc(chunk.len() as u64);
+    }
+
+    file.flush().await?;
+    drop(file);
+
+    tokio::fs::rename(&tmp_dest, dest)
+        .await
+        .context("Failed to finalize download")?;
+
+    pb.finish_and_clear();
+    println!("Done: {}", dest.display());
+    Ok("installed".into())
+}
+
+/// Download and extract a tar.bz2 archive. Returns "installed" or "already present".
+async fn download_and_extract(
+    url: &str,
+    extract_to: &Path,
+    check_dir: &Path,
+    label: &str,
+) -> Result<String> {
+    if check_dir.exists() {
+        println!("{label}: already present at {}", check_dir.display());
+        return Ok("already present".into());
+    }
+
+    tokio::fs::create_dir_all(extract_to).await?;
+
+    println!("Downloading {label}...");
+    println!(" from: {url}");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(url)
+        .send()
+        .await
+        .context("Failed to start download")?;
+
+    if !resp.status().is_success() {
+        anyhow::bail!("Download failed with status: {}", resp.status());
+    }
+
+    let total_size = resp.content_length().unwrap_or(0);
+    let pb = ProgressBar::new(total_size);
+    pb.set_style(
+        ProgressStyle::default_bar()
+            .template("{bar:40.cyan/blue} {bytes}/{total_bytes} ({eta})")?
+            .progress_chars("##-"),
+    );
+
+    let tmp = tempfile::Builder::new()
+        .suffix(".tar.bz2")
+        .tempfile_in(extract_to)
+        .context("Failed to create temp file")?;
+    let tmp_path = tmp.path().to_path_buf();
+
+    {
+        let mut file = tokio::fs::File::create(&tmp_path).await?;
+        let mut stream = resp.bytes_stream();
+        while let Some(chunk) = stream.next().await {
+            let chunk = chunk.context("Error reading download stream")?;
+            file.write_all(&chunk).await?;
+            pb.inc(chunk.len() as u64);
+        }
+        file.flush().await?;
+    }
+
+    pb.finish_and_clear();
+    println!("Extracting...");
+
+    let extract_dir = extract_to.to_path_buf();
+    tokio::task::spawn_blocking(move || {
+        let file = std::fs::File::open(&tmp_path).context("Failed to open archive")?;
+        let decoder = bzip2::read::BzDecoder::new(file);
+        let mut archive = tar::Archive::new(decoder);
+        archive.unpack(&extract_dir).context("Failed to extract")?;
+        let _ = std::fs::remove_file(&tmp_path);
+        Ok::<(), anyhow::Error>(())
+    })
+    .await??;
+
+    println!("Done: {}", check_dir.display());
+    Ok("installed".into())
+}
+
+async fn setup_models(output_dir: Option<PathBuf>, hf_token: Option<&str>) -> Result<String> {
+    let dir = output_dir.unwrap_or_else(models_dir);
+    let dest = dir.join("ggml-base.bin");
+    if dest.exists() {
+        println!("models: already present (ggml-base.bin)");
+        return Ok("already present".into());
+    }
+    download_model(&ModelSize::Base, Some(dir), hf_token).await?;
+    Ok("installed (ggml-base.bin)".into())
+}
+
+async fn setup_vad(output_dir: Option<PathBuf>) -> Result<String> {
+    let dir = output_dir.unwrap_or_else(models_dir);
+    let dest = dir.join("silero_vad.onnx");
+    download_file_with_progress(
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx",
+        &dest,
+        "VAD model (silero_vad.onnx)",
+    )
+    .await
+}
+
+async fn setup_diarize(output_dir: Option<PathBuf>) -> Result<String> {
+    let dir = output_dir.unwrap_or_else(models_dir);
+    let mut parts = Vec::new();
+
+    // Segmentation model (tar.bz2)
+    let seg_dir =
dir.join("sherpa-onnx-pyannote-segmentation-3-0");
+    let seg_status = download_and_extract(
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2",
+        &dir,
+        &seg_dir,
+        "diarize segmentation model",
+    ).await?;
+    parts.push(format!("segmentation: {seg_status}"));
+
+    // Embedding model (single file)
+    let emb_dest = dir.join("wespeaker_en_voxceleb_CAM++.onnx");
+    let emb_status = download_file_with_progress(
+        "https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/wespeaker_en_voxceleb_CAM%2B%2B.onnx",
+        &emb_dest,
+        "diarize embedding model (wespeaker)",
+    ).await?;
+    parts.push(format!("embedding: {emb_status}"));
+
+    Ok(parts.join(", "))
+}
+
+async fn setup_sherpa_libs() -> Result<String> {
+    let os = std::env::consts::OS;
+    let arch = std::env::consts::ARCH;
+
+    let archive_suffix = match (os, arch) {
+        ("macos", _) => "osx-universal2-shared",
+        ("linux", "x86_64") => "linux-x86_64-shared",
+        ("linux", "aarch64") => "linux-aarch64-shared",
+        _ => anyhow::bail!(
+            "Unsupported platform: {os}-{arch}. Download sherpa-onnx shared libraries manually."
+ ), + }; + + let archive_name = format!("sherpa-onnx-{SHERPA_ONNX_VERSION}-{archive_suffix}"); + let url = format!( + "https://github.com/k2-fsa/sherpa-onnx/releases/download/{SHERPA_ONNX_VERSION}/{archive_name}.tar.bz2" + ); + + let vendor_dir = PathBuf::from("vendor"); + let check_dir = vendor_dir.join(&archive_name); + + let status = download_and_extract( + &url, + &vendor_dir, + &check_dir, + "sherpa-onnx shared libraries", + ) + .await?; + + if status == "installed" { + let lib_dir = check_dir.join("lib"); + eprintln!( + "\nAdd to .env:\n SHERPA_ONNX_LIB_DIR={}\n", + lib_dir.display() + ); + } + + Ok(format!("{status} ({archive_suffix})")) +} + +fn print_setup_summary(summary: &[(&str, String)]) { + println!("\n=== Setup Summary ==="); + for (name, status) in summary { + println!(" {name:<14} {status}"); + } + + let dir = models_dir(); + println!("\nAdd to .env (if not already set):"); + println!(" MODEL_CACHE_DIR={}", dir.display()); + + let vad_path = dir.join("silero_vad.onnx"); + if vad_path.exists() { + println!(" VAD_MODEL={}", vad_path.display()); + } + + let seg_path = dir.join("sherpa-onnx-pyannote-segmentation-3-0/model.onnx"); + if seg_path.exists() { + println!(" DIARIZE_SEGMENTATION_MODEL={}", seg_path.display()); + } + + let emb_path = dir.join("wespeaker_en_voxceleb_CAM++.onnx"); + if emb_path.exists() { + println!(" DIARIZE_EMBEDDING_MODEL={}", emb_path.display()); + } + + // Check for sherpa-onnx libs in vendor/ + if let Ok(entries) = std::fs::read_dir("vendor") { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() && path.join("lib").exists() { + println!(" SHERPA_ONNX_LIB_DIR={}", path.join("lib").display()); + break; + } + } + } + + println!(); +} From 039ebe821739ff59d0d3d16c0f7ce0f27b03d743 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 14:25:19 +0200 Subject: [PATCH 4/5] chore: Add BSL 1.1 license MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Business Source License 1.1: - Free for non-commercial and evaluation use - Commercial/production use requires a separate license - Converts to Apache 2.0 on 2030-03-16 All dependencies verified compatible (MIT, Apache-2.0, BSD, ISC, Unlicense — no GPL/copyleft). --- Cargo.toml | 2 ++ LICENSE | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 8 ++++++++ 3 files changed, 68 insertions(+) create mode 100644 LICENSE diff --git a/Cargo.toml b/Cargo.toml index a8acee7..f5e3a03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,8 @@ name = "transcribeit" version = "1.1.0" edition = "2024" +license = "LicenseRef-BSL-1.1" +license-file = "LICENSE" [profile.release] opt-level = 3 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..649141a --- /dev/null +++ b/LICENSE @@ -0,0 +1,58 @@ +Business Source License 1.1 + +License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved. +"Business Source License" is a trademark of MariaDB Corporation Ab. + +Parameters + +Licensor: TranscriptIntel +Licensed Work: transcribeit + The Licensed Work is (c) 2026 TranscriptIntel +Additional Use Grant: You may use the Licensed Work for non-commercial + and evaluation purposes without a license. + Production use in a commercial setting requires + a separate commercial license from the Licensor. +Change Date: 2030-03-16 +Change License: Apache License, Version 2.0 + +Terms + +The Licensor hereby grants you the right to copy, modify, create derivative +works, redistribute, and make non-production use of the Licensed Work. The +Licensor may make an Additional Use Grant, above, permitting limited +production use. + +Effective on the Change Date, or the fourth anniversary of the first publicly +available distribution of a specific version of the Licensed Work under this +License, whichever comes first, the Licensor hereby grants you rights under +the terms of the Change License, and the rights granted in the paragraph +above terminate. 
+ +If your use of the Licensed Work does not comply with the requirements +currently in effect as described in this License, you must purchase a +commercial license from the Licensor, its affiliated entities, or authorized +resellers, or you must refrain from using the Licensed Work. + +All copies of the original and modified Licensed Work, and derivative works +of the Licensed Work, are subject to this License. This License applies +separately for each version of the Licensed Work and the Change Date may vary +for each version of the Licensed Work released by Licensor. + +You must conspicuously display this License on each original or modified copy +of the Licensed Work. If you receive the Licensed Work in original or +modified form from a third party, the terms and conditions set forth in this +License apply to your use of that work. + +Any use of the Licensed Work in violation of this License will automatically +terminate your rights under this License for the current and all other +versions of the Licensed Work. + +This License does not grant you any right in any trademark or logo of +Licensor or its affiliates (provided that you may use a trademark or logo of +Licensor as expressly required by this License). + +TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON +AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, +EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND +TITLE. diff --git a/README.md b/README.md index 5cb0c7f..fe88c95 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,14 @@ DIARIZE_SEGMENTATION_MODEL=.cache/sherpa-onnx-pyannote-segmentation-3-0/model.on DIARIZE_EMBEDDING_MODEL=.cache/wespeaker_en_voxceleb_CAM++.onnx ``` +## License + +This project is licensed under the [Business Source License 1.1](LICENSE). 
+ +- **Free** for non-commercial and evaluation use +- **Commercial/production use** requires a separate license — contact [TranscriptIntel](https://github.com/transcriptintel) +- Converts to **Apache 2.0** on March 16, 2030 + ## Documentation See the [docs](docs/) folder for detailed documentation: From 9857591ff4aabb59960649f400dac6e8afa9b692 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Mon, 16 Mar 2026 14:28:47 +0200 Subject: [PATCH 5/5] chore: Bump version to 1.2.0, fix cargo license warning, whisper-rs 0.16 API --- Cargo.lock | 2 +- Cargo.toml | 3 +-- src/engines/model_cache.rs | 17 +++++++---------- src/engines/whisper_local.rs | 31 ++++++++----------------------- 4 files changed, 17 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a12cfa7..265fba4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1856,7 +1856,7 @@ dependencies = [ [[package]] name = "transcribeit" -version = "1.1.0" +version = "1.2.0" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index f5e3a03..258f59e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,7 @@ [package] name = "transcribeit" -version = "1.1.0" +version = "1.2.0" edition = "2024" -license = "LicenseRef-BSL-1.1" license-file = "LICENSE" [profile.release] diff --git a/src/engines/model_cache.rs b/src/engines/model_cache.rs index 5783ccc..544dfc6 100644 --- a/src/engines/model_cache.rs +++ b/src/engines/model_cache.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::os::raw::{c_char, c_void}; use std::sync::{Arc, Mutex}; use anyhow::{Context, Result}; @@ -10,17 +9,15 @@ pub struct ModelCache { } impl ModelCache { - #[inline] - unsafe extern "C" fn whisper_log_silencer( - _level: u32, - _text: *const c_char, - _user_data: *mut c_void, - ) { - } - fn silence_whisper_logs() { + unsafe extern "C" fn noop( + _level: std::os::raw::c_uint, + _text: *const std::os::raw::c_char, + _user_data: *mut std::os::raw::c_void, + ) { + } unsafe { - 
whisper_rs::set_log_callback(Some(Self::whisper_log_silencer), std::ptr::null_mut()); + whisper_rs::set_log_callback(Some(noop), std::ptr::null_mut()); } } diff --git a/src/engines/whisper_local.rs b/src/engines/whisper_local.rs index 894f1bf..4420c58 100644 --- a/src/engines/whisper_local.rs +++ b/src/engines/whisper_local.rs @@ -30,7 +30,6 @@ impl Transcriber for WhisperLocal { let cache = Arc::clone(&self.cache); let language = self.language.clone(); - // whisper-rs is synchronous and CPU-heavy; run on a blocking thread tokio::task::spawn_blocking(move || { let ctx = cache.get_or_load(&model_path)?; @@ -52,29 +51,15 @@ impl Transcriber for WhisperLocal { .full(params, &audio_samples) .context("Whisper inference failed")?; - let num_segments = state - .full_n_segments() - .context("Failed to get segment count")?; - - let mut segments = Vec::new(); - for i in 0..num_segments { - let text = state - .full_get_segment_text(i) - .context("Failed to get segment text")?; - let start = state - .full_get_segment_t0(i) - .context("Failed to get segment start")?; - let end = state - .full_get_segment_t1(i) - .context("Failed to get segment end")?; - - segments.push(Segment { - start_ms: start * 10, - end_ms: end * 10, - text, + let segments: Vec<Segment> = state + .as_iter() + .map(|seg| Segment { + start_ms: seg.start_timestamp() * 10, + end_ms: seg.end_timestamp() * 10, + text: seg.to_string(), speaker: None, - }); - } + }) + .collect(); Ok(Transcript { segments }) })