Skip to content

Commit fd2fc68

Browse files
authored
feat: neural network implementation (#2)
1 parent f5f22ab commit fd2fc68

File tree

20 files changed

+1470
-1939
lines changed

20 files changed

+1470
-1939
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.bin binary

.github/workflows/test.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,6 @@ jobs:
2424
matrix:
2525
platform:
2626
- os: ubuntu-latest
27-
- os: windows-latest
28-
- os: macos-12
29-
- os: macos-14
3027
steps:
3128
- uses: actions/checkout@v4
3229
- name: Install stable Rust toolchain

Cargo.lock

Lines changed: 14 additions & 42 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,26 @@
11
[package]
22
name = "earshot"
33
version = "0.1.0"
4-
description = "Ridiculously fast voice activity detection in pure #[no_std] Rust"
4+
description = "Ridiculously fast & accurate voice activity detection in pure Rust"
55
repository = "https://github.com/pykeio/earshot"
66
authors = [ "Carson M <[email protected]>" ]
7-
license = "BSD-3-Clause"
7+
license = "MIT"
88
edition = "2021"
99
exclude = ["tests/data", ".github"]
1010

1111
[features]
12-
default = [ "std", "alloc" ]
12+
default = [ "std", "embed-weights" ]
1313
# Currently just impls `std::error::Error` for the `Error` type.
1414
std = []
15-
# Allocates internal buffers on the heap instead of the stack.
16-
alloc = []
15+
# Embed the default model weights in the binary. Enables `Default` for `QuantizedPredictor`.
16+
embed-weights = []
1717

1818
[dependencies]
19+
libm = "0.2"
1920

2021
[dev-dependencies]
21-
criterion = "0.5"
22+
criterion = "0.7"
2223

23-
[[bench]]
24-
name = "downsample"
25-
harness = false
2624
[[bench]]
2725
name = "vad"
2826
harness = false

LICENSE

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,21 @@
1-
Copyright (c) 2011, The WebRTC project authors. All rights reserved.
2-
Copyright (c) 2024 pyke.io
1+
MIT License
32

4-
Redistribution and use in source and binary forms, with or without
5-
modification, are permitted provided that the following conditions are
6-
met:
3+
Copyright (c) 2025 pyke.io
74

8-
* Redistributions of source code must retain the above copyright
9-
notice, this list of conditions and the following disclaimer.
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
1011

11-
* Redistributions in binary form must reproduce the above copyright
12-
notice, this list of conditions and the following disclaimer in
13-
the documentation and/or other materials provided with the
14-
distribution.
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
1514

16-
* Neither the name of Google nor the names of its contributors may
17-
be used to endorse or promote products derived from this software
18-
without specific prior written permission.
19-
20-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23-
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24-
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25-
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26-
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27-
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28-
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29-
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
11
# Earshot
2-
Ridiculously fast, only slightly bad voice activity detection in pure Rust. Port of the famous [WebRTC VAD](https://webrtc.googlesource.com/).
2+
Ridiculously fast & accurate voice activity detection in pure Rust.
33

4-
## Features
5-
- `#![no_std]`, doesn't even require `alloc`
6-
* Internal buffers can get pretty big when stored on the stack, so the `alloc` feature is enabled by default, which allocates them on the heap instead.
7-
- Stupidly fast; uses only fixed-point arithmetic
8-
* Achieves an RTF of ~3e-4 with 30 ms 48 KHz frames, ~3e-5 with 30 ms 8 KHz frames.
9-
* Comparatively, Silero VAD v4 w/ [`ort`](https://ort.pyke.io/) achieves an RTF of ~3e-3 with 60 ms 16 KHz frames.
10-
- Okay accuracy
11-
* Great at distinguishing between silence and noise, but not between noise and speech.
12-
* Earshot provides alternative models with slight accuracy gains compared to the base WebRTC model.
4+
Achieves an RTF of 0.0014; 10x faster than Silero/TEN VAD.
5+
6+
## Performance
7+
Compiling with `RUSTFLAGS="-C target-cpu=native"` in release mode is highly recommended as it can cut processing time in half.

benches/downsample.rs

Lines changed: 0 additions & 19 deletions
This file was deleted.

benches/vad.rs

Lines changed: 11 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,23 @@
1-
use std::{fs, hint::black_box, slice};
1+
use std::hint::black_box;
22

33
use criterion::{Criterion, criterion_group, criterion_main};
4-
use earshot::{VoiceActivityDetector, VoiceActivityModel, VoiceActivityProfile};
4+
use earshot::{Detector, QuantizedPredictor};
55

6-
fn bench_vad_8khz(c: &mut Criterion) {
7-
let file = fs::read("tests/data/audio_tiny8.raw").unwrap();
8-
let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
9-
let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
10-
c.bench_function("VAD - 8 KHz (Real world)", |b| {
6+
fn bench_vad(c: &mut Criterion) {
7+
let mut vad = Detector::<QuantizedPredictor>::default();
8+
c.bench_function("Single frame - f32", |b| {
9+
let frame = (0..256 as i16).map(|i| i.wrapping_mul(i) as f32).collect::<Vec<_>>();
1110
b.iter(|| {
12-
for frame in i16_samples.chunks_exact(240) {
13-
let _ = black_box(vad.predict_8khz(black_box(frame)));
14-
}
11+
let _ = black_box(vad.predict_f32(black_box(&frame)));
1512
})
1613
});
17-
c.bench_function("VAD - 8 KHz (Single frame)", |b| {
18-
let frame = (0..240 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
14+
c.bench_function("Single frame - i16", |b| {
15+
let frame = (0..256 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
1916
b.iter(|| {
20-
let _ = black_box(vad.predict_8khz(black_box(&frame)));
17+
let _ = black_box(vad.predict_i16(black_box(&frame)));
2118
})
2219
});
2320
}
2421

25-
fn bench_vad_16khz(c: &mut Criterion) {
26-
let file = fs::read("tests/data/audio_tiny16.raw").unwrap();
27-
let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
28-
let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
29-
c.bench_function("VAD - 16 KHz (Real world)", |b| {
30-
b.iter(|| {
31-
for frame in i16_samples.chunks_exact(240) {
32-
let _ = black_box(vad.predict_16khz(black_box(frame)));
33-
}
34-
})
35-
});
36-
c.bench_function("VAD - 16 KHz (Single frame)", |b| {
37-
let frame = (0..480 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
38-
b.iter(|| {
39-
let _ = black_box(vad.predict_16khz(black_box(&frame)));
40-
})
41-
});
42-
}
43-
44-
fn bench_vad_32khz(c: &mut Criterion) {
45-
let file = fs::read("tests/data/audio_tiny32.raw").unwrap();
46-
let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
47-
let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
48-
c.bench_function("VAD - 32 KHz (Real world)", |b| {
49-
b.iter(|| {
50-
for frame in i16_samples.chunks_exact(240) {
51-
let _ = black_box(vad.predict_32khz(black_box(frame)));
52-
}
53-
})
54-
});
55-
c.bench_function("VAD - 32 KHz (Single frame)", |b| {
56-
let frame = (0..960 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
57-
b.iter(|| {
58-
let _ = black_box(vad.predict_32khz(black_box(&frame)));
59-
})
60-
});
61-
}
62-
63-
fn bench_vad_48khz(c: &mut Criterion) {
64-
let file = fs::read("tests/data/audio_tiny48.raw").unwrap();
65-
let i16_samples = unsafe { slice::from_raw_parts(file.as_ptr().cast::<i16>(), file.len() / 2) };
66-
let mut vad = VoiceActivityDetector::new_with_model(VoiceActivityModel::ES_ALPHA, VoiceActivityProfile::VERY_AGGRESSIVE);
67-
c.bench_function("VAD - 48 KHz (Real world)", |b| {
68-
b.iter(|| {
69-
for frame in i16_samples.chunks_exact(240) {
70-
let _ = black_box(vad.predict_48khz(black_box(frame)));
71-
}
72-
})
73-
});
74-
c.bench_function("VAD - 48 KHz (Single frame)", |b| {
75-
let frame = (0..1440 as i16).map(|i| i.wrapping_mul(i)).collect::<Vec<_>>();
76-
b.iter(|| {
77-
let _ = black_box(vad.predict_48khz(black_box(&frame)));
78-
})
79-
});
80-
}
81-
82-
criterion_group!(vad, bench_vad_8khz, bench_vad_16khz, bench_vad_32khz, bench_vad_48khz);
22+
criterion_group!(vad, bench_vad);
8323
criterion_main!(vad);

0 commit comments

Comments
 (0)