diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c37a2e6dda1..ffdcb1f7f25 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -100,6 +100,7 @@ if (EMSCRIPTEN)
     add_subdirectory(bench.wasm)
 elseif(CMAKE_JS_VERSION)
     add_subdirectory(addon.node)
+    add_subdirectory(stream.node)
 else()
     add_subdirectory(cli)
     add_subdirectory(bench)
diff --git a/examples/stream.node/CMakeLists.txt b/examples/stream.node/CMakeLists.txt
new file mode 100644
index 00000000000..d4d70046252
--- /dev/null
+++ b/examples/stream.node/CMakeLists.txt
@@ -0,0 +1,44 @@
+cmake_minimum_required(VERSION 3.13)
+project(stream_addon)
+
+set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../cmake ${CMAKE_MODULE_PATH})
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_definitions(-DNAPI_VERSION=4)
+include_directories(${CMAKE_JS_INC})
+
+set(TARGET stream.node)
+
+add_library(${TARGET} SHARED
+    addon.cpp
+    whisper-stream.cpp
+)
+
+set_target_properties(${TARGET} PROPERTIES
+    PREFIX ""
+    SUFFIX ".node"
+)
+
+include(DefaultTargetOptions)
+
+execute_process(COMMAND node -p "require('node-addon-api').include"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    OUTPUT_VARIABLE NODE_ADD_ON_API_DIR
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+string(REPLACE "\"" "" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
+string(REPLACE "\\\\" "/" NODE_ADD_ON_API_DIR ${NODE_ADD_ON_API_DIR})
+target_include_directories(${TARGET} PRIVATE ${NODE_ADD_ON_API_DIR})
+
+target_link_libraries(${TARGET}
+    whisper
+    common
+    ${CMAKE_THREAD_LIBS_INIT}
+)
+
+# cmake-js provides CMAKE_JS_LIB / CMAKE_JS_NODELIB_DEF / CMAKE_JS_NODELIB_TARGET
+if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+    # generate node.lib from the supplied .def and link against it
+    execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
+    target_link_libraries(${TARGET} ${CMAKE_JS_NODELIB_TARGET})
+elseif(CMAKE_JS_LIB)
+    target_link_libraries(${TARGET} ${CMAKE_JS_LIB})
+endif()
diff --git a/examples/stream.node/addon.cpp b/examples/stream.node/addon.cpp
new file mode 100644
index 00000000000..0e5c34862d3
--- /dev/null
+++ b/examples/stream.node/addon.cpp
@@ -0,0 +1,114 @@
+#include "addon.h"          // header for WhisperStreamWrapper
+#include "whisper-stream.h" // header for the WhisperStream class
+
+// NOTE: the node-addon-api wrapper reports errors by throwing JS exceptions,
+// so a CHECK_STATUS(env, status, msg) macro is not needed here.
+
+// --- Implementation of the wrapper ---
+
+Napi::Object WhisperStreamWrapper::Init(Napi::Env env, Napi::Object exports) {
+    Napi::Function func = DefineClass(env, "WhisperStream", {
+        InstanceMethod("startModel",   &WhisperStreamWrapper::startModel),
+        InstanceMethod("processChunk", &WhisperStreamWrapper::ProcessChunk),
+        InstanceMethod("freeModel",    &WhisperStreamWrapper::freeModel),
+    });
+
+    exports.Set("WhisperStream", func);
+    return exports;
+}
+
+WhisperStreamWrapper::WhisperStreamWrapper(const Napi::CallbackInfo& info) : Napi::ObjectWrap<WhisperStreamWrapper>(info) {
+}
+
+Napi::Value WhisperStreamWrapper::startModel(const Napi::CallbackInfo& info) {
+    Napi::Env env = info.Env();
+
+    if (info.Length() < 1 || !info[0].IsObject()) {
+        Napi::TypeError::New(env, "Expected a configuration object").ThrowAsJavaScriptException();
+        return env.Null();
+    }
+
+    Napi::Object js_params = info[0].As<Napi::Object>();
+    StreamParams params;
+
+    if (js_params.Has("modelPath")) {
+        params.model = js_params.Get("modelPath").As<Napi::String>();
+    } else {
+        Napi::TypeError::New(env, "Missing required parameter 'modelPath'").ThrowAsJavaScriptException();
+        return env.Null();
+    }
+
+    if (js_params.Has("language"))     params.language      = js_params.Get("language").As<Napi::String>();
+    if (js_params.Has("nThreads"))     params.n_threads     = js_params.Get("nThreads").As<Napi::Number>();
+    if (js_params.Has("stepMs"))       params.step_ms       = js_params.Get("stepMs").As<Napi::Number>();
+    if (js_params.Has("lengthMs"))     params.length_ms     = js_params.Get("lengthMs").As<Napi::Number>();
+    if (js_params.Has("keepMs"))       params.keep_ms       = js_params.Get("keepMs").As<Napi::Number>();
+    if (js_params.Has("maxTokens"))    params.max_tokens    = js_params.Get("maxTokens").As<Napi::Number>();
+    if (js_params.Has("audioCtx"))     params.audio_ctx     = js_params.Get("audioCtx").As<Napi::Number>();
+    if (js_params.Has("vadThold"))     params.vad_thold     = js_params.Get("vadThold").As<Napi::Number>();
+    if (js_params.Has("beamSize"))     params.beam_size     = js_params.Get("beamSize").As<Napi::Number>();
+    if (js_params.Has("freqThold"))    params.freq_thold    = js_params.Get("freqThold").As<Napi::Number>();
+    if (js_params.Has("translate"))    params.translate     = js_params.Get("translate").As<Napi::Boolean>();
+    if (js_params.Has("noFallback"))   params.no_fallback   = js_params.Get("noFallback").As<Napi::Boolean>();
+    if (js_params.Has("printSpecial")) params.print_special = js_params.Get("printSpecial").As<Napi::Boolean>();
+    if (js_params.Has("noContext"))    params.no_context    = js_params.Get("noContext").As<Napi::Boolean>();
+    if (js_params.Has("noTimestamps")) params.no_timestamps = js_params.Get("noTimestamps").As<Napi::Boolean>();
+    if (js_params.Has("tinydiarize"))  params.tinydiarize   = js_params.Get("tinydiarize").As<Napi::Boolean>();
+    if (js_params.Has("saveAudio"))    params.save_audio    = js_params.Get("saveAudio").As<Napi::Boolean>();
+    if (js_params.Has("useGpu"))       params.use_gpu       = js_params.Get("useGpu").As<Napi::Boolean>();
+    if (js_params.Has("flashAttn"))    params.flash_attn    = js_params.Get("flashAttn").As<Napi::Boolean>();
+
+    if (this->whisperStream_) {
+        delete this->whisperStream_;
+    }
+
+    try {
+        this->whisperStream_ = new WhisperStream(params);
+        this->whisperStream_->init();
+    } catch (const std::runtime_error& e) {
+        Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+        return env.Null();
+    }
+
+    return env.Undefined();
+}
+
+Napi::Value WhisperStreamWrapper::ProcessChunk(const Napi::CallbackInfo& info) {
+    Napi::Env env = info.Env();
+
+    if (!this->whisperStream_) {
+        Napi::Error::New(env, "Model not started. Call startModel() first.").ThrowAsJavaScriptException();
+        return env.Null();
+    }
+
+    if (info.Length() < 1 || !info[0].IsTypedArray() || info[0].As<Napi::TypedArray>().TypedArrayType() != napi_float32_array) {
+        Napi::TypeError::New(env, "Argument must be a Float32Array").ThrowAsJavaScriptException();
+        return env.Null();
+    }
+
+    Napi::Float32Array pcmf32_array = info[0].As<Napi::Float32Array>();
+    std::vector<float> pcmf32_new(pcmf32_array.Data(), pcmf32_array.Data() + pcmf32_array.ElementLength());
+
+    TranscriptionResult result = this->whisperStream_->process(pcmf32_new);
+
+    Napi::Object resultObj = Napi::Object::New(env);
+    resultObj.Set("text",    Napi::String::New(env, result.text));
+    resultObj.Set("isFinal", Napi::Boolean::New(env, result.final));
+
+    return resultObj;
+}
+
+Napi::Value WhisperStreamWrapper::freeModel(const Napi::CallbackInfo& info) {
+    Napi::Env env = info.Env();
+    if (this->whisperStream_) {
+        delete this->whisperStream_;
+        this->whisperStream_ = nullptr;
+    }
+    return env.Undefined();
+}
+
+Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
+    return WhisperStreamWrapper::Init(env, exports);
+}
+
+NODE_API_MODULE(whisper, InitAll)
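For readers coming from the JS side, the surface exposed by addon.cpp can be summarized as a type declaration. This is not part of the patch; it is only a sketch of the shape implied by the N-API bindings above (names taken from DefineClass and ProcessChunk):

```ts
// stream.node.d.ts — illustrative only, not included in this patch
export interface StreamConfig {
    modelPath: string;   // required: path to a ggml model file
    language?: string;   // default "en"
    nThreads?: number;
    stepMs?: number;     // <= 0 switches the native side into VAD mode
    lengthMs?: number;
    keepMs?: number;
    maxTokens?: number;
    audioCtx?: number;
    vadThold?: number;
    beamSize?: number;
    freqThold?: number;
    translate?: boolean;
    noFallback?: boolean;
    printSpecial?: boolean;
    noContext?: boolean;
    noTimestamps?: boolean;
    tinydiarize?: boolean;
    saveAudio?: boolean;
    useGpu?: boolean;
    flashAttn?: boolean;
}

export interface ChunkResult {
    text: string;      // transcription of the current window
    isFinal: boolean;  // true when the window was committed (always true in VAD mode)
}

export class WhisperStream {
    startModel(config: StreamConfig): void;           // throws on bad config or model-load failure
    processChunk(samples: Float32Array): ChunkResult; // 16 kHz mono float samples
    freeModel(): void;
}
```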
diff --git a/examples/stream.node/addon.h b/examples/stream.node/addon.h
new file mode 100644
index 00000000000..3ec4972ea71
--- /dev/null
+++ b/examples/stream.node/addon.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <napi.h>
+#include "whisper-stream.h"
+
+class WhisperStreamWrapper : public Napi::ObjectWrap<WhisperStreamWrapper> {
+public:
+    static Napi::Object Init(Napi::Env env, Napi::Object exports);
+    WhisperStreamWrapper(const Napi::CallbackInfo& info);
+
+private:
+    Napi::Value startModel(const Napi::CallbackInfo& info);
+    Napi::Value ProcessChunk(const Napi::CallbackInfo& info);
+    Napi::Value freeModel(const Napi::CallbackInfo& info);
+
+    WhisperStream* whisperStream_ = nullptr;
+};
diff --git a/examples/stream.node/index.js b/examples/stream.node/index.js
new file mode 100644
index 00000000000..9f842b6c766
--- /dev/null
+++ b/examples/stream.node/index.js
@@ -0,0 +1,83 @@
+const path = require('path');
+const os = require('os');
+const portAudio = require('naudiodon2');
+
+const addonPath = path.join(__dirname, '..', '..', 'build', 'Release', 'stream.node');
+
+const { WhisperStream } = require(addonPath);
+
+const modelPath = path.join(__dirname, '..', '..', 'models', 'ggml-base.en.bin');
+const SAMPLE_RATE = 16000;
+
+// --- Main application ---
+async function main() {
+    const whisper = new WhisperStream();
+    let pendingText = ''; // buffer for the current, not-yet-final text
+
+    console.log('Loading model...');
+    whisper.startModel({
+        modelPath: modelPath,
+        language: 'en',
+        nThreads: 4,
+        stepMs: 3000,
+        lengthMs: 10000,
+        keepMs: 200,
+        useGpu: true,
+    });
+    console.log('Model loaded.');
+
+    const ai = new portAudio.AudioIO({
+        inOptions: {
+            channelCount: 1,
+            sampleFormat: portAudio.SampleFormatFloat32,
+            sampleRate: SAMPLE_RATE,
+            deviceId: -1,
+            closeOnError: true,
+        }
+    });
+
+    ai.on('data', (chunk) => {
+        const floatCount = chunk.length / Float32Array.BYTES_PER_ELEMENT;
+        const float32 = new Float32Array(chunk.buffer, chunk.byteOffset, floatCount);
+
+        try {
+            const result = whisper.processChunk(float32);
+            if (!result || !result.text) return;
+
+            const { text, isFinal } = result;
+
+            if (isFinal) {
+                process.stdout.write(`\r${text}\n`);
+                pendingText = ''; // reset for the next utterance
+            } else {
+                pendingText = text;
+                // '\r' moves the cursor to the start of the line, '\x1B[K' clears the rest of it
+                process.stdout.write(`\r${pendingText}\x1B[K`);
+            }
+        } catch (err) {
+            console.error('Error during processing:', err);
+        }
+    });
+
+    ai.on('error', (err) => console.error('Audio input error:', err));
+
+    ai.start();
+    console.log('Recording from microphone. Speak now.');
+    process.stdout.write('> ');
+
+    const shutdown = () => {
+        console.log('\nShutting down...');
+        ai.quit(() => {
+            whisper.freeModel();
+            process.exit(0);
+        });
+    };
+
+    process.on('SIGINT', shutdown);
+    process.on('SIGTERM', shutdown);
+}
+
+main().catch((err) => {
+    console.error('An unexpected error occurred:', err);
+    process.exit(1);
+});
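index.js asks naudiodon2 for Float32 samples directly, which is what processChunk() expects. If an audio source only delivers 16-bit signed PCM (a common case for other capture libraries), the buffer has to be converted to normalized floats first. A minimal sketch; the helper name is mine and not part of the patch:

```ts
// Convert interleaved 16-bit signed PCM to the [-1, 1] float samples
// expected by processChunk(). Assumes mono input already at 16 kHz.
function int16ToFloat32(chunk: Buffer): Float32Array {
    const samples = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
    const out = new Float32Array(samples.length);
    for (let i = 0; i < samples.length; i++) {
        out[i] = samples[i] / 32768; // scale to [-1, 1)
    }
    return out;
}
```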
diff --git a/examples/stream.node/package.json b/examples/stream.node/package.json
new file mode 100644
index 00000000000..c53193ff55d
--- /dev/null
+++ b/examples/stream.node/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "stream.node",
+  "version": "1.0.0",
+  "main": "index.js",
+  "scripts": {
+    "start": "node index.js",
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "dependencies": {
+    "naudiodon2": "^2.3.6"
+  },
+  "devDependencies": {
+    "cmake-js": "^7.3.1",
+    "node-addon-api": "^5.1.0"
+  }
+}
diff --git a/examples/stream.node/whisper-stream.cpp b/examples/stream.node/whisper-stream.cpp
new file mode 100644
index 00000000000..f31abfcd2bf
--- /dev/null
+++ b/examples/stream.node/whisper-stream.cpp
@@ -0,0 +1,263 @@
+// whisper-stream.cpp
+#include "whisper-stream.h"
+#include "common-whisper.h"
+#include "common.h"
+
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+#include <stdexcept>
+
+WhisperStream::WhisperStream(const StreamParams &stream_params)
+    : params(stream_params) {}
+
+WhisperStream::~WhisperStream() {
+    if (ctx) {
+        whisper_print_timings(ctx);
+        whisper_free(ctx);
+        ctx = nullptr;
+    }
+    pcmf32_old.clear();
+    pcmf32_new.clear();
+    pcmf32.clear();
+    prompt_tokens.clear();
+}
+
+bool WhisperStream::init() {
+    // ensure keep/length constraints
+    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
+    params.length_ms = std::max(params.length_ms, params.step_ms);
+
+    // store sample counts as members (samples, not bytes)
+    n_samples_step = int((1e-3 * params.step_ms)   * WHISPER_SAMPLE_RATE);
+    n_samples_len  = int((1e-3 * params.length_ms) * WHISPER_SAMPLE_RATE);
+    n_samples_keep = int((1e-3 * params.keep_ms)   * WHISPER_SAMPLE_RATE);
+    n_samples_30s  = int((1e-3 * 30000.0)          * WHISPER_SAMPLE_RATE);
+
+    use_vad = (n_samples_step <= 0);
+
+    n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1;
+
+    params.no_timestamps = !use_vad;
+    params.no_context   |= use_vad;
+
+    // language check
+    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        throw std::runtime_error("unknown language");
+    }
+
+    struct whisper_context_params cparams = whisper_context_default_params();
+    cparams.use_gpu    = params.use_gpu;
+    cparams.flash_attn = params.flash_attn;
+
+    // assign member ctx
+    ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        throw std::runtime_error("failed to initialize whisper context");
+    }
+
+    // reserve buffers
+    pcmf32_new.clear();
+    pcmf32_new.reserve(n_samples_30s);
+    pcmf32.clear();
+    pcmf32_old.clear();
+    prompt_tokens.clear();
+
+    {
+        fprintf(stderr, "\n");
+        if (!whisper_is_multilingual(ctx)) {
+            if (params.language != "en" || params.translate) {
+                params.language  = "en";
+                params.translate = false;
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        fprintf(stderr,
+                "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                __func__, n_samples_step,
+                float(n_samples_step) / WHISPER_SAMPLE_RATE,
+                float(n_samples_len)  / WHISPER_SAMPLE_RATE,
+                float(n_samples_keep) / WHISPER_SAMPLE_RATE,
+                params.n_threads, params.language.c_str(),
+                params.translate ? "translate" : "transcribe",
+                params.no_timestamps ? 0 : 1);
+
+        if (!use_vad) {
+            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
+        } else {
+            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
+        }
+        fprintf(stderr, "\n");
+    }
+
+    n_iter = 0;
+
+    // start the clock once; process() only advances t_last as audio is consumed
+    t_start = std::chrono::high_resolution_clock::now();
+    t_last  = t_start;
+
+    return true;
+}
+
+TranscriptionResult WhisperStream::process(const std::vector<float> &pcmf32_chunk) {
+    // append incoming samples
+    pcmf32_new.insert(pcmf32_new.end(), pcmf32_chunk.begin(), pcmf32_chunk.end());
+
+    // non-VAD mode: require at least one step worth of samples
+    if (!use_vad) {
+        if ((int) pcmf32_new.size() < n_samples_step) {
+            return TranscriptionResult(); // not enough samples yet
+        }
+
+        const int n_samples_new = (int) pcmf32_new.size();
+
+        // take up to params.length_ms of audio from the previous iteration
+        const int n_samples_take = std::min((int) pcmf32_old.size(),
+                                            std::max(0, n_samples_keep + n_samples_len - n_samples_new));
+
+        pcmf32.resize(n_samples_new + n_samples_take);
+
+        // copy the tail of the old buffer
+        for (int i = 0; i < n_samples_take; ++i) {
+            pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+        }
+
+        // copy the new samples
+        memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new * sizeof(float));
+
+        // consume the new buffer for the next iteration
+        pcmf32_old = pcmf32;
+        pcmf32_new.clear();
+    } else {
+        const auto t_now = std::chrono::high_resolution_clock::now();
+
+        // VAD mode: require at least 2 seconds of audio; callers can tune this
+        if ((int) pcmf32_new.size() < 2 * WHISPER_SAMPLE_RATE) {
+            return TranscriptionResult();
+        }
+
+        if (!::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
+            pcmf32_new.clear();
+            return TranscriptionResult(); // no speech detected
+        }
+
+        // take the last length_ms worth of samples
+        const int take = std::min((int) pcmf32_new.size(), n_samples_len);
+        pcmf32.assign(pcmf32_new.end() - take, pcmf32_new.end());
+        pcmf32_new.clear();
+        t_last = t_now;
+    }
+
+    // run the inference
+    whisper_full_params wparams = whisper_full_default_params(
+        params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
+
+    wparams.print_progress   = false;
+    wparams.print_special    = params.print_special;
+    wparams.print_realtime   = false;
+    wparams.print_timestamps = !params.no_timestamps;
+    wparams.translate        = params.translate;
+    wparams.single_segment   = !use_vad;
+    wparams.max_tokens       = params.max_tokens;
+    wparams.language         = params.language.c_str();
+    wparams.n_threads        = params.n_threads;
+    wparams.beam_search.beam_size = params.beam_size;
+
+    wparams.audio_ctx       = params.audio_ctx;
+    wparams.tdrz_enable     = params.tinydiarize;
+    wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
+
+    wparams.prompt_tokens   = params.no_context ? nullptr : (prompt_tokens.empty() ? nullptr : prompt_tokens.data());
+    wparams.prompt_n_tokens = params.no_context ? 0 : (int) prompt_tokens.size();
+
+    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+        fprintf(stderr, "%s: failed to process audio\n", __func__);
+        return TranscriptionResult();
+    }
+
+    // build the result as a single concatenated string; this could be changed
+    // to structured segments (JSON or an array of structs) for the JS wrapper
+    std::string plain;
+    if (use_vad) {
+        const int64_t t1 = std::chrono::duration_cast<std::chrono::milliseconds>(t_last - t_start).count();
+        const int64_t t0 = std::max(0.0, t1 - pcmf32.size() * 1000.0 / WHISPER_SAMPLE_RATE);
+
+        plain += "\n";
+        plain += "### Transcription " + std::to_string(n_iter) +
+                 " START | t0 = " + std::to_string(t0) +
+                 " ms | t1 = " + std::to_string(t1) + " ms\n";
+        plain += "\n";
+    }
+
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        const char *text = whisper_full_get_segment_text(ctx, i);
+
+        if (params.no_timestamps) {
+            plain += text;
+        } else {
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+            // append in separate steps to keep the formatting readable
+            plain += "[";
+            plain += to_timestamp(t0, false);
+            plain += " --> ";
+            plain += to_timestamp(t1, false);
+            plain += "] ";
+            plain += text;
+            if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
+                plain += " [SPEAKER_TURN]";
+            }
+            plain += "\n";
+        }
+    }
+
+    if (use_vad) {
+        plain += "\n";
+        plain += "### Transcription " + std::to_string(n_iter) + " END\n";
+    }
+
+    ++n_iter;
+
+    bool will_commit = false;
+    if (!use_vad && (n_iter % n_new_line) == 0) {
+        plain += "\n";
+        will_commit = true;
+
+        // guard the slicing: make sure pcmf32 has enough samples
+        if ((int) pcmf32.size() >= n_samples_keep && n_samples_keep > 0) {
+            pcmf32_old.assign(pcmf32.end() - n_samples_keep, pcmf32.end());
+        } else {
+            pcmf32_old = pcmf32;
+        }
+
+        // update the prompt tokens for the next window
+        if (!params.no_context) {
+            prompt_tokens.clear();
+            const int n_segments_after = whisper_full_n_segments(ctx);
+            for (int si = 0; si < n_segments_after; ++si) {
+                const int token_count = whisper_full_n_tokens(ctx, si);
+                for (int ti = 0; ti < token_count; ++ti) {
+                    prompt_tokens.push_back(whisper_full_get_token_id(ctx, si, ti));
+                }
+            }
+        }
+    }
+
+    const bool is_final = use_vad || will_commit;
+
+    return TranscriptionResult{plain, is_final};
+}
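To make the buffering in init()/process() concrete: with the defaults used by index.js (stepMs = 3000, lengthMs = 10000, keepMs = 200) at 16 kHz, the sample counts work out as follows. The snippet only mirrors the arithmetic in init(); it is illustrative and not part of the patch.

```ts
const WHISPER_SAMPLE_RATE = 16000;

const stepMs = 3000, lengthMs = 10000, keepMs = 200;

const nSamplesStep = Math.floor(1e-3 * stepMs   * WHISPER_SAMPLE_RATE); // 48000  -> one inference every 3 s of audio
const nSamplesLen  = Math.floor(1e-3 * lengthMs * WHISPER_SAMPLE_RATE); // 160000 -> at most 10 s fed to whisper_full
const nSamplesKeep = Math.floor(1e-3 * keepMs   * WHISPER_SAMPLE_RATE); // 3200   -> 0.2 s of overlap carried across commits

// every nNewLine-th iteration the text is committed (isFinal = true)
const nNewLine = Math.max(1, Math.floor(lengthMs / stepMs) - 1);        // 2
```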
diff --git a/examples/stream.node/whisper-stream.h b/examples/stream.node/whisper-stream.h
new file mode 100644
index 00000000000..f0ef61c55b2
--- /dev/null
+++ b/examples/stream.node/whisper-stream.h
@@ -0,0 +1,69 @@
+// whisper-stream.h
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "whisper.h"
+
+struct StreamParams {
+    int n_threads  = std::min(4, (int) std::thread::hardware_concurrency());
+    int step_ms    = 3000;
+    int length_ms  = 10000;
+    int keep_ms    = 200;
+    int max_tokens = 32;
+    int audio_ctx  = 0;
+    int beam_size  = -1;
+
+    float vad_thold  = 0.6f;
+    float freq_thold = 100.0f;
+
+    bool translate     = false;
+    bool no_fallback   = false;
+    bool print_special = false;
+    bool no_context    = true;
+    bool no_timestamps = false;
+    bool tinydiarize   = false;
+    bool save_audio    = false;
+    bool use_gpu       = true;
+    bool flash_attn    = false;
+
+    std::string language = "en";
+    std::string model;
+};
+
+struct TranscriptionResult {
+    std::string text;
+    bool final;
+};
+
+class WhisperStream {
+public:
+    WhisperStream(const StreamParams &stream_params);
+    ~WhisperStream();
+
+    bool init();
+    TranscriptionResult process(const std::vector<float> &pcmf32_chunk);
+
+private:
+    StreamParams params;
+
+    // whisper context
+    struct whisper_context *ctx = nullptr;
+
+    // buffers (samples, not bytes)
+    std::vector<float> pcmf32;     // assembled input for inference
+    std::vector<float> pcmf32_new; // buffer of appended incoming samples
+    std::vector<float> pcmf32_old; // overlap kept for the next chunk
+
+    std::vector<whisper_token> prompt_tokens;
+
+    // sample counts and flags
+    int  n_samples_step = 0;
+    int  n_samples_len  = 0;
+    int  n_samples_keep = 0;
+    int  n_samples_30s  = 0;
+    bool use_vad        = false;
+    int  n_new_line     = 1;
+    int  n_iter         = 0;
+
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_last;
+};
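A closing note on how the isFinal flag is meant to be consumed: in non-VAD mode process() returns interim text for the current window and only flips isFinal to true on commit iterations, so a caller that wants a stable transcript should append committed results and treat everything else as provisional. A small sketch, reusing the result shape returned by processChunk(); it is not part of the patch:

```ts
let transcript = '';
let pending = '';

function onChunkResult(result: { text: string; isFinal: boolean }): void {
    if (result.isFinal) {
        transcript += result.text; // committed: the native side has advanced its window
        pending = '';
    } else {
        pending = result.text;     // interim: will be re-emitted (possibly revised) on the next step
    }
}
```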