pykeio
diff --git a/.gitattributes b/.gitattributes
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 0 additions & 3 deletions
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 14 additions & 42 deletions
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 7 additions & 9 deletions
diff --git a/LICENSE b/LICENSE
Lines changed: 17 additions & 26 deletions
diff --git a/README.md b/README.md
Lines changed: 5 additions & 10 deletions
diff --git a/benches/downsample.rs b/benches/downsample.rs
Lines changed: 0 additions & 19 deletions
diff --git a/benches/vad.rs b/benches/vad.rs
Lines changed: 11 additions & 71 deletions
@@ -0,0 +1 @@
+*.bin binary
@@ -24,9 +24,6 @@ jobs:
       matrix:
         platform:
           - os: ubuntu-latest
-          - os: windows-latest
-          - os: macos-12
-          - os: macos-14
     steps:
       - uses: actions/checkout@v4
       - name: Install stable Rust toolchain
 
@@ -1,28 +1,26 @@
 [package]
 name = "earshot"
 version = "0.1.0"
-description = "Ridiculously fast voice activity detection in pure #[no_std] Rust"
+description = "Ridiculously fast & accurate voice activity detection in pure Rust"
 repository = "https://github.com/pykeio/earshot"
 authors = [ "Carson M <[email protected]>" ]
-license = "BSD-3-Clause"
+license = "MIT"
 edition = "2021"
 exclude = ["tests/data", ".github"]
 
 [features]
-default = [ "std", "alloc" ]
+default = [ "std", "embed-weights" ]
 # Currently just impls `std::error::Error` for the `Error` type.
 std = []
-# Allocates internal buffers on the heap instead of the stack.
-alloc = []
+# Embed the default model weights in the binary. Enables `Default` for `QuantizedPredictor`.
+embed-weights = []
 
 [dependencies]
+libm = "0.2"
 
 [dev-dependencies]
-criterion = "0.5"
+criterion = "0.7"
 
-[[bench]]
-name = "downsample"
-harness = false
 [[bench]]
 name = "vad"
 harness = false
@@ -1,30 +1,21 @@
-Copyright (c) 2011, The WebRTC project authors. All rights reserved.
-Copyright (c) 2024 pyke.io
+MIT License
 
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
+Copyright (c) 2025 pyke.io
 
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-  * Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in
-    the documentation and/or other materials provided with the
-    distribution.
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
-  * Neither the name of Google nor the names of its contributors may
-    be used to endorse or promote products derived from this software
-    without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -1,12 +1,7 @@
 # Earshot
-Ridiculously fast, only slightly bad voice activity detection in pure Rust. Port of the famous [WebRTC VAD](https://webrtc.googlesource.com/).
+Ridiculously fast & accurate voice activity detection in pure Rust.
 
-## Features
-- `#![no_std]`, doesn't even require `alloc`
-	* Internal buffers can get pretty big when stored on the stack, so the `alloc` feature is enabled by default, which allocates them on the heap instead.
-- Stupidly fast; uses only fixed-point arithmetic
-	* Achieves an RTF of ~3e-4 with 30 ms 48 KHz frames, ~3e-5 with 30 ms 8 KHz frames.
-	* Comparatively, Silero VAD v4 w/ [`ort`](https://ort.pyke.io/) achieves an RTF of ~3e-3 with 60 ms 16 KHz frames.
-- Okay accuracy
-	* Great at distinguishing between silence and noise, but not between noise and speech.
-	* Earshot provides alternative models with slight accuracy gains compared to the base WebRTC model.
+Achieves an RTF of 0.0014; 10x faster than Silero/TEN VAD.
+
+## Performance
+Compiling with `RUSTFLAGS="-C target-cpu=native"` in release mode is highly recommended as it can cut processing time in half.
@@ -1,83 +1,23 @@
-use std::{fs, hint::black_box, slice};
+use std::hint::black_box;
 
 use criterion::{Criterion, criterion_group, criterion_main};
-use earshot::{VoiceActivityDetector, VoiceActivityModel, VoiceActivityProfile};
+use earshot::{Detector, QuantizedPredictor};
 
-fn bench_vad_8khz(c: &mut Criterion) {
-	let file = fs::read("tests/data/audio_tiny8.raw").unwrap();
-	let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
-	let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
-	c.bench_function("VAD - 8 KHz (Real world)", |b| {
+fn bench_vad(c: &mut Criterion) {
+	let mut vad = Detector::<QuantizedPredictor>::default();
+	c.bench_function("Single frame - f32", |b| {
+		let frame = (0..256 as i16).map(|i| i.wrapping_mul(i) as f32).collect::<Vec<_>>();
 		b.iter(|| {
-			for frame in i16_samples.chunks_exact(240) {
-				let _ = black_box(vad.predict_8khz(black_box(frame)));
-			}
+			let _ = black_box(vad.predict_f32(black_box(&frame)));
 		})
 	});
-	c.bench_function("VAD - 8 KHz (Single frame)", |b| {
-		let frame = (0..240 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
+	c.bench_function("Single frame - i16", |b| {
+		let frame = (0..256 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
 		b.iter(|| {
-			let _ = black_box(vad.predict_8khz(black_box(&frame)));
+			let _ = black_box(vad.predict_i16(black_box(&frame)));
 		})
 	});
 }
 
-fn bench_vad_16khz(c: &mut Criterion) {
-	let file = fs::read("tests/data/audio_tiny16.raw").unwrap();
-	let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
-	let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
-	c.bench_function("VAD - 16 KHz (Real world)", |b| {
-		b.iter(|| {
-			for frame in i16_samples.chunks_exact(240) {
-				let _ = black_box(vad.predict_16khz(black_box(frame)));
-			}
-		})
-	});
-	c.bench_function("VAD - 16 KHz (Single frame)", |b| {
-		let frame = (0..480 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
-		b.iter(|| {
-			let _ = black_box(vad.predict_16khz(black_box(&frame)));
-		})
-	});
-}
-
-fn bench_vad_32khz(c: &mut Criterion) {
-	let file = fs::read("tests/data/audio_tiny32.raw").unwrap();
-	let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
-	let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
-	c.bench_function("VAD - 32 KHz (Real world)", |b| {
-		b.iter(|| {
-			for frame in i16_samples.chunks_exact(240) {
-				let _ = black_box(vad.predict_32khz(black_box(frame)));
-			}
-		})
-	});
-	c.bench_function("VAD - 32 KHz (Single frame)", |b| {
-		let frame = (0..960 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
-		b.iter(|| {
-			let _ = black_box(vad.predict_32khz(black_box(&frame)));
-		})
-	});
-}
-
-fn bench_vad_48khz(c: &mut Criterion) {
-	let file = fs::read("tests/data/audio_tiny48.raw").unwrap();
-	let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
-	let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
-	c.bench_function("VAD - 48 KHz (Real world)", |b| {
-		b.iter(|| {
-			for frame in i16_samples.chunks_exact(240) {
-				let _ = black_box(vad.predict_48khz(black_box(frame)));
-			}
-		})
-	});
-	c.bench_function("VAD - 48 KHz (Single frame)", |b| {
-		let frame = (0..1440 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
-		b.iter(|| {
-			let _ = black_box(vad.predict_48khz(black_box(&frame)));
-		})
-	});
-}
-
-criterion_group!(vad, bench_vad_8khz, bench_vad_16khz, bench_vad_32khz, bench_vad_48khz);
+criterion_group!(vad, bench_vad);
 criterion_main!(vad);