Merged
691 changes: 466 additions & 225 deletions Cargo.lock

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions Cargo.toml
@@ -1,7 +1,8 @@
[package]
name = "transcribeit"
version = "1.1.0"
version = "1.2.0"
edition = "2024"
license-file = "LICENSE"

[profile.release]
opt-level = 3
@@ -25,16 +26,16 @@ dotenvy = "0.15"
futures-util = "0.3"
hound = "3.5"
glob = "0.3"
indicatif = "0.17"
reqwest = { version = "0.12", features = ["json", "multipart", "stream"] }
indicatif = "0.18"
reqwest = { version = "0.13", features = ["json", "multipart", "stream"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tempfile = "3"
regex = "1"
tokio = { version = "1", features = ["full"] }
sherpa-onnx = { version = "0.1", optional = true }
tar = "0.4"
bzip2 = "0.5"
bzip2 = "0.6"
libc = "0.2"
whisper-rs = "0.12"
whisper-rs = "0.16"
bytes = "1.11.1"
58 changes: 58 additions & 0 deletions LICENSE
@@ -0,0 +1,58 @@
Business Source License 1.1

License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
"Business Source License" is a trademark of MariaDB Corporation Ab.

Parameters

Licensor: TranscriptIntel
Licensed Work: transcribeit
The Licensed Work is (c) 2026 TranscriptIntel
Additional Use Grant: You may use the Licensed Work for non-commercial
and evaluation purposes without a license.
Production use in a commercial setting requires
a separate commercial license from the Licensor.
Change Date: 2030-03-16
Change License: Apache License, Version 2.0

Terms

The Licensor hereby grants you the right to copy, modify, create derivative
works, redistribute, and make non-production use of the Licensed Work. The
Licensor may make an Additional Use Grant, above, permitting limited
production use.

Effective on the Change Date, or the fourth anniversary of the first publicly
available distribution of a specific version of the Licensed Work under this
License, whichever comes first, the Licensor hereby grants you rights under
the terms of the Change License, and the rights granted in the paragraph
above terminate.

If your use of the Licensed Work does not comply with the requirements
currently in effect as described in this License, you must purchase a
commercial license from the Licensor, its affiliated entities, or authorized
resellers, or you must refrain from using the Licensed Work.

All copies of the original and modified Licensed Work, and derivative works
of the Licensed Work, are subject to this License. This License applies
separately for each version of the Licensed Work and the Change Date may vary
for each version of the Licensed Work released by Licensor.

You must conspicuously display this License on each original or modified copy
of the Licensed Work. If you receive the Licensed Work in original or
modified form from a third party, the terms and conditions set forth in this
License apply to your use of that work.

Any use of the Licensed Work in violation of this License will automatically
terminate your rights under this License for the current and all other
versions of the Licensed Work.

This License does not grant you any right in any trademark or logo of
Licensor or its affiliates (provided that you may use a trademark or logo of
Licensor as expressly required by this License).

TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
TITLE.
22 changes: 21 additions & 1 deletion README.md
@@ -62,6 +62,14 @@ transcribeit run -p azure -i recording.mp3 \

# Force language and normalize before transcription
transcribeit run -i recording.wav -m base --language en --normalize

# VAD-based segmentation (speech-aware, avoids mid-word cuts)
transcribeit run -p sherpa-onnx -m base -i recording.mp3 --vad-model .cache/silero_vad.onnx

# Speaker diarization (2 speakers)
transcribeit run -i interview.mp3 -m base --speakers 2 \
--diarize-segmentation-model .cache/sherpa-onnx-pyannote-segmentation-3-0/model.onnx \
--diarize-embedding-model .cache/wespeaker_en_voxceleb_CAM++.onnx
```

## Features
@@ -72,7 +80,8 @@ transcribeit run -i recording.wav -m base --language en --normalize
- **Model aliases** — `-m base`, `-m tiny`, etc. resolve from `MODEL_CACHE_DIR` for both `local` and `sherpa-onnx` providers. The sherpa-onnx resolver also supports glob matching (e.g., `-m moonshine-base`, `-m sense-voice`).
- **Language hinting** — Pass `--language` to force local and API transcription language.
- **FFmpeg audio normalization** — Optional `--normalize` to apply loudnorm before transcription.
- **Silence-based segmentation** — Splits long audio at silence boundaries for better accuracy and API compatibility.
- **VAD-based segmentation** — Speech-aware segmentation via Silero VAD (sherpa-onnx). Detects speech boundaries with padding and gap merging to avoid mid-word cuts. Use `--vad-model .cache/silero_vad.onnx`.
- **Silence-based segmentation** — Fallback segmentation via FFmpeg `silencedetect` for API providers or when VAD model is not available.
- **sherpa-onnx auto-segmentation** — Whisper ONNX models only support ≤30s per call; segmentation is enabled automatically.
- **sherpa-onnx is optional** — Enabled by default as a Cargo feature. Build without it: `cargo build --no-default-features`.
- **Auto-split for API limits** — Files exceeding 25MB are automatically segmented when using remote providers.
@@ -102,8 +111,19 @@ TRANSCRIBEIT_MAX_RETRIES=5
TRANSCRIBEIT_REQUEST_TIMEOUT_SECS=120
TRANSCRIBEIT_RETRY_WAIT_BASE_SECS=10
TRANSCRIBEIT_RETRY_WAIT_MAX_SECS=120
VAD_MODEL=.cache/silero_vad.onnx
DIARIZE_SEGMENTATION_MODEL=.cache/sherpa-onnx-pyannote-segmentation-3-0/model.onnx
DIARIZE_EMBEDDING_MODEL=.cache/wespeaker_en_voxceleb_CAM++.onnx
```

## License

This project is licensed under the [Business Source License 1.1](LICENSE).

- **Free** for non-commercial and evaluation use
- **Commercial/production use** requires a separate license — contact [TranscriptIntel](https://github.com/transcriptintel)
- Converts to **Apache 2.0** on March 16, 2030

## Documentation

See the [docs](docs/) folder for detailed documentation:
65 changes: 55 additions & 10 deletions docs/architecture.md
@@ -12,11 +12,15 @@ src/
├── audio/
│ ├── extract.rs # FFmpeg audio conversion
│ ├── segment.rs # Silence detection and audio splitting
│ ├── vad.rs # VAD-based speech segmentation (Silero VAD via sherpa-onnx)
│ └── wav.rs # WAV reading and encoding (shared)
├── diarize/
│ ├── mod.rs # Speaker diarization engine and speaker assignment
│ └── ffi.rs # Raw C FFI bindings for sherpa-onnx speaker diarization
├── output/
│ ├── vtt.rs # WebVTT subtitle writer
│ ├── srt.rs # SRT subtitle writer
│ └── manifest.rs # JSON manifest writer
│ ├── vtt.rs # WebVTT subtitle writer (supports <v Speaker N> tags)
│ ├── srt.rs # SRT subtitle writer (supports [Speaker N] labels)
│ └── manifest.rs # JSON manifest writer (includes speaker labels)
└── engines/
├── whisper_local.rs # Local whisper.cpp via whisper-rs
├── sherpa_onnx.rs # Local sherpa-onnx engine (auto-detects Whisper, Moonshine, SenseVoice)
@@ -67,21 +71,32 @@ Input file (any format)
│ └─ Auto: sherpa-onnx provider (always segments; max 30s per chunk)
├─ If segmenting:
│ ├─ detect_silence() via FFmpeg silencedetect filter
│ ├─ compute_segments() at silence midpoints
│ ├─ split_audio() into temp WAV files
│ └─ Transcribe each segment, offset timestamps (concurrently for API providers)
│ ├─ VAD path (when --vad-model is set and sherpa-onnx feature is enabled):
│ │ ├─ read_wav_bytes() → f32 PCM samples
│ │ ├─ vad_segment(): detect speech → pad 250ms → merge gaps <200ms → split long chunks at low-energy points
│ │ ├─ Extract chunk samples directly from memory
│ │ └─ Transcribe each chunk via transcribe(), offset timestamps
│ ├─ FFmpeg fallback (no VAD model, or sherpa-onnx feature disabled):
│ │ ├─ detect_silence() via FFmpeg silencedetect filter
│ │ ├─ compute_segments() at silence midpoints
│ │ ├─ split_audio() into temp WAV files
│ │ └─ Transcribe each segment, offset timestamps (concurrently for API providers)
├─ If not segmenting:
│ ├─ Local: read_wav() → transcribe() directly
│ └─ API: transcribe_path() with prepared file
├─ normalize_audio? ──→ optional loudnorm filter in ffmpeg conversion pipeline
├─ Speaker diarization? (when --speakers N is set)
│ ├─ read audio samples for diarization
│ ├─ Diarizer.diarize() → speaker-labeled time spans
│ └─ assign_speakers() overlays speaker labels onto transcript segments
└─ Output:
├─ Text to stdout or `<input_stem>.txt`
├─ VTT to file or stdout
├─ SRT to file or stdout
└─ JSON manifest to output directory
├─ VTT to file or stdout (with `<v Speaker N>` tags when diarized)
├─ SRT to file or stdout (with `[Speaker N]` labels when diarized)
└─ JSON manifest to output directory (includes speaker field per segment)
```

Temporary files use the `tempfile` crate and are cleaned up automatically on drop.
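The "offset timestamps" step in the diagram reduces to adding each chunk's start time back onto its chunk-relative segment times. A minimal sketch with hypothetical types (not the actual transcribeit structs):

```rust
// Illustrative only: field names are assumptions, not the real manifest types.
#[derive(Debug, PartialEq)]
struct Segment {
    start_secs: f64,
    end_secs: f64,
}

/// Shift chunk-relative segment times into absolute file time.
fn offset_segments(mut segs: Vec<Segment>, chunk_start_secs: f64) -> Vec<Segment> {
    for s in &mut segs {
        s.start_secs += chunk_start_secs;
        s.end_secs += chunk_start_secs;
    }
    segs
}

fn main() {
    let segs = vec![Segment { start_secs: 0.0, end_secs: 2.5 }];
    // A chunk starting at 30.0 s yields absolute times 30.0 to 32.5 s.
    println!("{:?}", offset_segments(segs, 30.0));
}
```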
@@ -184,6 +199,36 @@ cargo build --release --no-default-features

This removes the sherpa-onnx provider and eliminates the need for `SHERPA_ONNX_LIB_DIR`.

## VAD-based segmentation (`audio/vad.rs`)

When `--vad-model` is set and the `sherpa-onnx` feature is enabled, the pipeline uses Silero VAD (via sherpa-onnx) for speech-aware segmentation instead of FFmpeg's `silencedetect` filter. This avoids the main problem with silence-based splitting: mid-word cuts.

The VAD pipeline (`vad_segment()`) has four stages:

1. **Detect speech** -- Silero VAD processes 512-sample frames (~32ms at 16kHz) to find speech boundaries with sample-level precision.
2. **Pad 250ms** -- Each speech chunk is extended by 250ms on both sides to protect word boundaries at the edges.
3. **Merge gaps <200ms** -- Adjacent chunks separated by less than 200ms are merged to avoid splitting within short pauses.
4. **Split long chunks** -- Chunks exceeding `--max-segment-secs` are split at the lowest-energy point within a 1-second search window around the target cut point.

The VAD approach works directly on in-memory PCM samples, so there is no need for intermediate temp files during segmentation. Each chunk is transcribed via `engine.transcribe()` with sample slices, and timestamps are offset by the chunk start time.

When `--vad-model` is not set, segmentation falls back to FFmpeg `silencedetect` (the original behavior).
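The pad and merge stages can be sketched over `(start, end)` sample spans. This is a std-only illustration, not the actual `vad_segment()` code; the helper names and the clamp behavior are assumptions, though the 250ms pad and 200ms gap values match the stages described above (at 16kHz, 4000 and 3200 samples respectively).

```rust
const SAMPLE_RATE: usize = 16_000;

/// Extend each (start, end) speech span by `pad` samples on both sides,
/// clamped to the total audio length.
fn pad_chunks(chunks: &[(usize, usize)], pad: usize, len: usize) -> Vec<(usize, usize)> {
    chunks
        .iter()
        .map(|&(s, e)| (s.saturating_sub(pad), (e + pad).min(len)))
        .collect()
}

/// Merge adjacent spans whose gap is shorter than `max_gap` samples.
/// Assumes the input is sorted by start time.
fn merge_chunks(chunks: &[(usize, usize)], max_gap: usize) -> Vec<(usize, usize)> {
    let mut out: Vec<(usize, usize)> = Vec::new();
    for &(s, e) in chunks {
        match out.last_mut() {
            Some(last) if s.saturating_sub(last.1) < max_gap => last.1 = last.1.max(e),
            _ => out.push((s, e)),
        }
    }
    out
}

fn main() {
    // Three detected speech spans; the first two are 125 ms apart,
    // so after 250 ms padding they overlap and merge into one chunk.
    let speech = [(4_000, 20_000), (22_000, 40_000), (60_000, 80_000)];
    let pad = SAMPLE_RATE / 4; // 250 ms
    let max_gap = SAMPLE_RATE / 5; // 200 ms
    let padded = pad_chunks(&speech, pad, 100_000);
    println!("{:?}", merge_chunks(&padded, max_gap));
}
```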

## Speaker diarization (`diarize/`)

Speaker diarization identifies which speaker is talking at each point in the audio. It requires the `sherpa-onnx` feature and two ONNX models:

- **Segmentation model** (`--diarize-segmentation-model`): a pyannote segmentation ONNX model that detects speaker change points.
- **Embedding model** (`--diarize-embedding-model`): a speaker embedding ONNX model that clusters voice characteristics.

The `Diarizer` follows the same dedicated worker thread pattern as `SherpaOnnxEngine`: the C FFI types are not `Send`/`Sync`, so they live on a plain `std::thread` and communicate via channels. Diarization requests are sent through `mpsc` and results come back through `tokio::sync::oneshot`.
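The worker-thread pattern can be sketched with std channels alone. In the real `Diarizer` the reply path is `tokio::sync::oneshot` so async callers can `.await` the result; here a plain `mpsc` channel stands in for it, and all names are hypothetical.

```rust
use std::sync::mpsc;
use std::thread;

struct Request {
    samples: Vec<f32>,
    // Stand-in for tokio::sync::oneshot::Sender in the real code.
    reply: mpsc::Sender<usize>,
}

/// Spawn the dedicated worker and return a handle for sending requests.
fn spawn_worker() -> mpsc::Sender<Request> {
    let (tx, rx) = mpsc::channel::<Request>();
    // The non-Send/Sync FFI handle would be created *inside* this thread
    // and never cross a thread boundary.
    thread::spawn(move || {
        for req in rx {
            // Placeholder "diarization": just report the sample count.
            let result = req.samples.len();
            let _ = req.reply.send(result);
        }
    });
    tx
}

fn main() {
    let worker = spawn_worker();
    let (reply_tx, reply_rx) = mpsc::channel();
    worker
        .send(Request { samples: vec![0.0; 16_000], reply: reply_tx })
        .unwrap();
    println!("worker processed {} samples", reply_rx.recv().unwrap());
}
```

Dropping the request sender closes the channel, which ends the worker's `for req in rx` loop and lets the thread exit cleanly.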

After transcription completes, `assign_speakers()` overlays speaker labels onto transcript segments by finding the diarization segment with the maximum time overlap for each transcript segment. Speaker labels appear as:

- **VTT**: `<v Speaker 0>text</v>`
- **SRT**: `[Speaker 0] text`
- **Manifest JSON**: `"speaker": "Speaker 0"` field on each segment
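The max-overlap rule used by `assign_speakers()` can be illustrated as follows; the types and signature here are hypothetical sketches, not the actual transcribeit API.

```rust
#[derive(Debug)]
struct Span {
    start: f64,
    end: f64,
    speaker: String,
}

/// Overlap in seconds between two [start, end) intervals (0.0 if disjoint).
fn overlap(a: (f64, f64), b: (f64, f64)) -> f64 {
    (a.1.min(b.1) - a.0.max(b.0)).max(0.0)
}

/// Pick the diarization span with the largest time overlap for one
/// transcript segment; None when nothing overlaps.
fn assign_speaker(seg: (f64, f64), diar: &[Span]) -> Option<String> {
    diar.iter()
        .map(|s| (overlap(seg, (s.start, s.end)), &s.speaker))
        .filter(|(ov, _)| *ov > 0.0)
        .max_by(|a, b| a.0.partial_cmp(&b.0).unwrap())
        .map(|(_, spk)| spk.clone())
}

fn main() {
    let diar = vec![
        Span { start: 0.0, end: 4.0, speaker: "Speaker 0".into() },
        Span { start: 4.0, end: 10.0, speaker: "Speaker 1".into() },
    ];
    // Transcript segment 3.0-6.0 overlaps Speaker 1 for 2 s vs 1 s for
    // Speaker 0, so Speaker 1 wins.
    println!("{:?}", assign_speaker((3.0, 6.0), &diar));
}
```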

## Adding a new engine

1. Create `src/engines/your_engine.rs`
41 changes: 40 additions & 1 deletion docs/cli-reference.md
@@ -119,9 +119,22 @@ These options apply to OpenAI/Azure providers:
| `--min-silence-duration` | Minimum silence duration in seconds | `0.8` |
| `--max-segment-secs` | Maximum segment length in seconds | `600` |
| `--segment-concurrency` | Max parallel segment requests (API providers only) | `2` |
| `--vad-model` | Path to Silero VAD ONNX model (`silero_vad.onnx`) for speech-aware segmentation | `VAD_MODEL` env var |

When using `openai` or `azure` providers, files exceeding 25MB are automatically segmented even without `--segment`. When using `sherpa-onnx`, segmentation is always enabled with a maximum segment length of 30 seconds.

When `--vad-model` is set and segmentation is needed, VAD-based segmentation is used instead of FFmpeg `silencedetect`. VAD detects actual speech boundaries using Silero VAD, avoiding mid-word cuts. It pads chunks by 250ms, merges gaps shorter than 200ms, and splits long chunks at low-energy points. This requires the `sherpa-onnx` feature to be enabled. When `--vad-model` is not set, the original FFmpeg silence-based segmentation is used as a fallback.

#### Speaker diarization options

| Option | Description | Default |
|--------|-------------|---------|
| `--speakers` | Number of speakers for diarization | disabled |
| `--diarize-segmentation-model` | Path to pyannote segmentation ONNX model | `DIARIZE_SEGMENTATION_MODEL` env var |
| `--diarize-embedding-model` | Path to speaker embedding ONNX model | `DIARIZE_EMBEDDING_MODEL` env var |

When `--speakers N` is set, speaker diarization runs after transcription to label each segment with a speaker identity. Both `--diarize-segmentation-model` and `--diarize-embedding-model` are required. Speaker labels appear in VTT output as `<v Speaker 0>`, in SRT output as `[Speaker 0]`, and in manifest JSON as a `"speaker"` field on each segment. Requires the `sherpa-onnx` feature.

## Output behavior

During transcription, the CLI shows an animated spinner in the terminal so you can see progress while waiting for Whisper/API calls to complete.
@@ -155,6 +168,9 @@ When `--input` resolves to multiple files (directory or glob), all files are processed
| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | none |
| `AZURE_DEPLOYMENT_NAME` | Azure deployment name | `whisper` |
| `AZURE_API_VERSION` | Azure API version | `2024-06-01` |
| `VAD_MODEL` | Path to Silero VAD ONNX model for speech-aware segmentation | none |
| `DIARIZE_SEGMENTATION_MODEL` | Path to pyannote segmentation ONNX model for speaker diarization | none |
| `DIARIZE_EMBEDDING_MODEL` | Path to speaker embedding ONNX model for speaker diarization | none |
| `TRANSCRIBEIT_MAX_RETRIES` | Maximum 429 retries | `5` |
| `TRANSCRIBEIT_REQUEST_TIMEOUT_SECS` | API request timeout in seconds | `120` |
| `TRANSCRIBEIT_RETRY_WAIT_BASE_SECS` | Base retry wait time in seconds | `10` |
@@ -211,6 +227,28 @@ transcribeit run -i lecture.mp4 -m base -f srt -o ./output
transcribeit run -i noisy.wav -m .cache/ggml-base.bin \
--segment --silence-threshold -30 --min-silence-duration 0.5

# VAD-based segmentation (avoids mid-word cuts)
transcribeit run -p sherpa-onnx -i lecture.mp4 -m base.en \
--vad-model /path/to/silero_vad.onnx -f vtt -o ./output

# VAD with env var (set VAD_MODEL in .env)
VAD_MODEL=/path/to/silero_vad.onnx transcribeit run -p sherpa-onnx -i recording.mp3 -m base.en

# Speaker diarization (2 speakers)
transcribeit run -p sherpa-onnx -i meeting.mp4 -m base.en \
--speakers 2 \
--diarize-segmentation-model /path/to/segmentation.onnx \
--diarize-embedding-model /path/to/embedding.onnx \
-f vtt -o ./output

# VAD + speaker diarization combined
transcribeit run -p sherpa-onnx -i interview.wav -m base.en \
--vad-model /path/to/silero_vad.onnx \
--speakers 2 \
--diarize-segmentation-model /path/to/segmentation.onnx \
--diarize-embedding-model /path/to/embedding.onnx \
-f srt -o ./output

# OpenAI API
OPENAI_API_KEY=sk-... transcribeit run -p openai -i recording.mp3

@@ -267,7 +305,8 @@ When `--output-dir` is specified, the following files are created:
"index": 0,
"start_secs": 0.0,
"end_secs": 5.25,
"text": "Hello, welcome to the meeting."
"text": "Hello, welcome to the meeting.",
"speaker": "Speaker 0"
}
],
"stats": {
20 changes: 19 additions & 1 deletion docs/performance-benchmarks.md
@@ -61,16 +61,23 @@ Record:
### 3. Segmentation impact

```bash
# FFmpeg silencedetect segmentation
time transcribeit run -p openai -i <long_file> --segment --segment-concurrency 2 -f text -o ./output
time transcribeit run -p openai -i <long_file> --segment --segment-concurrency 1 --max-segment-secs 300 -f text -o ./output
# sherpa-onnx always segments at 30s max

# sherpa-onnx with FFmpeg silencedetect (default, always segments at 30s max)
time transcribeit run -p sherpa-onnx -i <long_file> -m base -f text -o ./output

# sherpa-onnx with VAD-based segmentation
time transcribeit run -p sherpa-onnx -i <long_file> -m base --vad-model /path/to/silero_vad.onnx -f text -o ./output
```

Record:
- total segment count
- max queue wait
- request-level retry counts
- segmentation method used (VAD vs silencedetect)
- transcript quality at segment boundaries (check for mid-word cuts)

### 4. I/O + conversion overhead

@@ -117,6 +124,17 @@ These results were measured on a 5-minute medical interview recording.
- Moonshine provides a compact alternative but is slower than Whisper at the same size tier.
- For highest quality where speed is not critical, use `large-v3-turbo` with local whisper.cpp.

### VAD vs FFmpeg silencedetect segmentation

VAD-based segmentation (Silero VAD via `--vad-model`) and FFmpeg `silencedetect` produce different segment boundaries. Key differences to observe when benchmarking:

- **Segment boundary quality:** VAD detects speech regions directly, so segment boundaries align with actual speech. FFmpeg `silencedetect` splits at silence midpoints, which can cut mid-word if silence gaps are short or thresholds are mistuned.
- **Segment count:** VAD typically produces more segments (one per speech region after merging) while `silencedetect` produces fewer, longer segments based on silence gaps.
- **Processing overhead:** VAD runs on the audio samples in-memory (fast, no subprocess). FFmpeg `silencedetect` runs as a subprocess and requires parsing its stderr output.
- **Transcript quality:** VAD-segmented transcripts tend to have fewer artifacts at segment boundaries because chunks start and end at speech boundaries with 250ms padding, rather than at arbitrary silence midpoints.

When comparing, use the same audio file and model to isolate the effect of the segmentation method on overall transcript quality and timing.

## CI/automatable baseline

For now, treat these as manual benchmarks in a fixed environment.