diff --git a/agents/src/cli.ts b/agents/src/cli.ts index ec245277..97f512d3 100644 --- a/agents/src/cli.ts +++ b/agents/src/cli.ts @@ -220,5 +220,51 @@ export const runApp = (opts: WorkerOptions) => { }); }); + program + .command('console') + .description('Start a new conversation inside the console') + .action(async () => { + const options = program.optsWithGlobals(); + initializeLogger({ pretty: true, level: options.logLevel }); + const logger = log(); + + if (!process.env.LIVEKIT_ENABLE_CONSOLE_MODE) { + logger.fatal('Console mode is disabled. Set LIVEKIT_ENABLE_CONSOLE_MODE=true to enable.'); + process.exit(1); + } + + try { + const mod = await import(new URL(opts.agent, import.meta.url).href); + const agentDef = mod?.default; + + if (!agentDef?.entry) { + logger.fatal('default export must have an entry() function'); + process.exit(1); + } + + const { CurrentJobContext } = await import('./job.js'); + + const mockCtx = { + room: undefined, + proc: { userData: {} }, + connect: async () => {}, + inferenceExecutor: { + doInference: async () => ({}), // Mock inference executor for now to keep things simple + }, + }; + + new CurrentJobContext(mockCtx as any); + + if (agentDef.prewarm) { + await agentDef.prewarm(mockCtx.proc as any); + } + + await agentDef.entry(mockCtx as any); + } catch (error) { + logger.fatal(error); + process.exit(1); + } + }); + program.parse(); }; diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index a88334f7..fe2d0625 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -169,7 +169,7 @@ export class AgentSession< outputOptions, }: { agent: Agent; - room: Room; + room?: Room; inputOptions?: Partial; outputOptions?: Partial; }): Promise { @@ -180,30 +180,38 @@ export class AgentSession< this.agent = agent; this._updateAgentState('initializing'); - // Check for existing input/output configuration and warn if needed - if (this.input.audio && inputOptions?.audioEnabled !== false) { - this.logger.warn('RoomIO audio input is enabled but input.audio is already set, ignoring..'); - } + if (!room) { + const { ChatCLI } = await import('./chat_cli.js'); + const chatCli = new ChatCLI(this); + await chatCli.start(); + } else { + // Room mode + if (this.input.audio && inputOptions?.audioEnabled !== false) { + this.logger.warn( + 'RoomIO audio input is enabled but input.audio is already set, ignoring..', + ); + } - if (this.output.audio && outputOptions?.audioEnabled !== false) { - this.logger.warn( - 'RoomIO audio output is enabled but output.audio is already set, ignoring..', - ); - } + if (this.output.audio && outputOptions?.audioEnabled !== false) { + this.logger.warn( + 'RoomIO audio output is enabled but output.audio is already set, ignoring..', + ); + } - if (this.output.transcription && outputOptions?.transcriptionEnabled !== false) { - this.logger.warn( - 'RoomIO transcription output is enabled but output.transcription is already set, ignoring..', - ); - } + if (this.output.transcription && outputOptions?.transcriptionEnabled !== false) { + this.logger.warn( + 'RoomIO transcription output is enabled but output.transcription is already set, ignoring..', + ); + } - this.roomIO = new RoomIO({ - agentSession: this, - room, - inputOptions, - outputOptions, - }); - this.roomIO.start(); + this.roomIO = new RoomIO({ + agentSession: this, + room, + inputOptions, + outputOptions, + }); + this.roomIO.start(); + } this.updateActivity(this.agent); diff --git a/agents/src/voice/chat_cli.ts b/agents/src/voice/chat_cli.ts new file mode 100644 index 00000000..3d0dd654 --- /dev/null +++ b/agents/src/voice/chat_cli.ts @@ -0,0 +1,518 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { AudioFrame } from '@livekit/rtc-node'; +import { EventEmitter } from 'node:events'; +import { createRequire } from 'node:module'; +import process from 'node:process'; +import readline from 'node:readline'; +import { ReadableStream } from 'node:stream/web'; +import { clearInterval as clearIntervalSafe, setInterval as setIntervalSafe } from 'node:timers'; +import { log } from '../log.js'; +import { AsyncIterableQueue } from '../utils.js'; +import type { AgentSession } from './agent_session.js'; +import { AudioInput, AudioOutput, type PlaybackFinishedEvent, TextOutput } from './io.js'; +import { TranscriptionSynchronizer } from './transcription/synchronizer.js'; + +const _require = createRequire(import.meta.url); + +const MAX_AUDIO_BAR = 30; +const INPUT_DB_MIN = -70.0; +const INPUT_DB_MAX = 0.0; +const FPS = 16; +const MIN_RMS = 2.0; + +function esc(...codes: number[]): string { + return `\u001b[${codes.join(';')}m`; +} + +function clampNormalizeDb(amplitudeDb: number, dbMin: number, dbMax: number): number { + amplitudeDb = Math.max(dbMin, Math.min(amplitudeDb, dbMax)); + return (amplitudeDb - dbMin) / (dbMax - dbMin); +} + +class ConsoleAudioInput extends AudioInput { + private sampleRate: number; + private numChannels: number; + private framesPerBuffer: number; + private deviceId: number | undefined; + private ai: any | null = null; + private started = false; + private queue = new AsyncIterableQueue(); + private sourceSet = false; + private logger = log(); + private mockInterval: NodeJS.Timeout | null = null; + + microDb: number = INPUT_DB_MIN; + receivedAudio: boolean = false; + + constructor({ + sampleRate = 24000, + numChannels = 1, + framesPerBuffer = 240, + deviceId, + }: { + sampleRate?: number; + numChannels?: number; + framesPerBuffer?: number; + deviceId?: number; + } = {}) { + super(); + this.sampleRate = sampleRate; + this.numChannels = numChannels; + this.framesPerBuffer = framesPerBuffer; + this.deviceId = deviceId; + } + + async onAttached(): Promise { + if (!this.sourceSet) { + const stream = new ReadableStream({ + start: async (controller) => { + (async () => { + for await (const frame of this.queue) { + controller.enqueue(frame); + } + controller.close(); + })().catch((error) => { + this.logger.error({ error }, 'ConsoleAudioInput stream error'); + }); + }, + cancel: async () => { + // noop + }, + }); + this.deferredStream.setSource(stream); + this.sourceSet = true; + } + + if (this.started) return; + await this.startDevice(); + } + + onDetached(): void { + if (!this.started) return; + try { + this.stopDevice(); + } catch (error) { + this.logger.warn({ error }, 'ConsoleAudioInput stopDevice error'); + } + } + + private async startDevice() { + try { + // Try to use our native audio implementation + const { AudioIO, SampleFormat16Bit } = await import('./native_audio.js'); + + this.ai = new AudioIO({ + inOptions: { + channelCount: this.numChannels, + sampleFormat: SampleFormat16Bit, + sampleRate: this.sampleRate, + framesPerBuffer: this.framesPerBuffer, + }, + outOptions: undefined, // input only + }); + + this.ai.on('data', (buf: Buffer) => { + // Convert to AudioFrame + const int16 = new Int16Array(buf.buffer, buf.byteOffset, buf.byteLength / 2); + const frame = new AudioFrame(int16, this.sampleRate, this.numChannels, int16.length); + + // Calculate audio level + const maxInt16 = 32767; + let rms = 0; + for (let i = 0; i < int16.length; i++) { + const v = int16[i]! / maxInt16; + rms += v * v; + } + rms = Math.sqrt(rms / int16.length) * maxInt16; + const db = 20.0 * Math.log10(rms / maxInt16 + 1e-6); + this.microDb = db; + if (rms > MIN_RMS) { + this.receivedAudio = true; + } + + this.queue.put(frame); + }); + + this.ai.on('error', (err: Error) => { + this.logger.error({ error: err }, 'Audio input error'); + }); + + this.ai.start(); + this.started = true; + // Audio input started successfully + } catch (error) { + // Fallback to mock audio + this.logger.warn('Native audio failed, using mock audio input'); + + const frameSize = this.framesPerBuffer; + const intervalMs = (frameSize / this.sampleRate) * 1000; + + this.mockInterval = setInterval(() => { + const silentData = new Int16Array(frameSize * this.numChannels); + const frame = new AudioFrame(silentData, this.sampleRate, this.numChannels, frameSize); + + this.microDb = INPUT_DB_MIN + Math.random() * 10; + this.receivedAudio = true; + this.queue.put(frame); + }, intervalMs); + + this.started = true; + } + } + + private stopDevice() { + if (this.mockInterval) { + clearInterval(this.mockInterval); + this.mockInterval = null; + } + if (this.ai) { + try { + this.ai.quit?.(); + } catch { + try { + this.ai.stop?.(); + } catch {} + } + this.ai = null; + } + this.started = false; + } +} + +class StdoutTextOutput extends TextOutput { + private capturing = false; + private enabled = true; + + async captureText(text: string): Promise { + if (!this.enabled) return; + if (!this.capturing) { + this.capturing = true; + process.stdout.write('\r'); + process.stdout.write(esc(36)); + } + process.stdout.write(text); + } + + flush(): void { + if (this.capturing) { + process.stdout.write(esc(0)); + process.stdout.write('\n'); + this.capturing = false; + } + } + + setEnabled(enabled: boolean): void { + this.enabled = enabled; + if (!enabled) this.capturing = false; + } + + get isCapturing(): boolean { + return this.capturing; + } +} + +class ConsoleAudioOutput extends AudioOutput { + private outputSampleRate: number; + private numChannels: number; + private ao: any | null = null; + private started = false; + private pushedDuration = 0.0; + private captureStart = 0; + private dispatchTimer: NodeJS.Timeout | null = null; + private _logger = log(); + + constructor({ + sampleRate = 24000, + numChannels = 1, + }: { sampleRate?: number; numChannels?: number } = {}) { + super(sampleRate); + this.outputSampleRate = sampleRate; + this.numChannels = numChannels; + } + + async onAttached(): Promise { + if (this.started) return; + + try { + // Try to use our native audio implementation + const { AudioIO } = await import('./native_audio.js'); + + this.ao = new AudioIO({ + inOptions: undefined, // output only + outOptions: { + channelCount: this.numChannels, + sampleRate: this.outputSampleRate, + }, + }); + + this.ao.start(); + this.started = true; + this._logger.info('Using native audio output via command-line tools'); + } catch (error) { + // Fallback to mock audio output + this._logger.warn('Native audio failed, using mock audio output', error); + + this.ao = { + write: (data: Buffer) => { + const frameCount = data.length / (2 * this.numChannels); + const durationMs = (frameCount / this.outputSampleRate) * 1000; + + setTimeout(() => { + this.emit('playbackFinished'); + }, durationMs); + }, + end: () => {}, + }; + + this.started = true; + } + } + + onDetached(): void { + if (!this.started) return; + try { + this.ao?.end?.(); + } catch {} + this.ao = null; + this.started = false; + } + + async captureFrame(frame: AudioFrame): Promise { + await super.captureFrame(frame); + if (!this.captureStart) { + this.captureStart = Date.now(); + } + this.pushedDuration += frame.samplesPerChannel / frame.sampleRate; + if (this.ao) { + const view = new Int16Array(frame.data); + const buf = Buffer.from(view.buffer, view.byteOffset, view.byteLength); + this.ao.write(buf); + } + } + + flush(): void { + super.flush(); + if (this.pushedDuration > 0) { + const elapsed = (Date.now() - this.captureStart) / 1000; + const toWait = Math.max(0, this.pushedDuration - elapsed); + if (this.dispatchTimer) clearTimeout(this.dispatchTimer); + this.dispatchTimer = setTimeout(() => this.dispatchPlaybackFinished(), toWait * 1000); + } + } + + clearBuffer(): void { + if (this.dispatchTimer) { + clearTimeout(this.dispatchTimer); + this.dispatchTimer = null; + } + const played = Math.min((Date.now() - this.captureStart) / 1000, this.pushedDuration); + this.onPlaybackFinished({ + playbackPosition: played, + interrupted: played + 1.0 < this.pushedDuration, + }); + this.pushedDuration = 0; + this.captureStart = 0; + } + + private dispatchPlaybackFinished(): void { + const ev: PlaybackFinishedEvent = { playbackPosition: this.pushedDuration, interrupted: false }; + this.onPlaybackFinished(ev); + this.pushedDuration = 0; + this.captureStart = 0; + } +} + +export class ChatCLI extends EventEmitter { + private loop: NodeJS.Timeout | null = null; + private session: AgentSession; + private textSink: StdoutTextOutput; + private audioSink: ConsoleAudioOutput; + private transcriptSyncer: TranscriptionSynchronizer | null = null; + private inputAudio: ConsoleAudioInput; + private mode: 'text' | 'audio' = 'audio'; + private textBuf: string[] = []; + private micName: string = 'Microphone'; + private logger = log(); + private micCheckTimer: NodeJS.Timeout | null = null; + private currentAudioLine: string = ''; + private isLogging: boolean = false; + + constructor( + agentSession: AgentSession, + { syncTranscription = true }: { syncTranscription?: boolean } = {}, + ) { + super(); + this.session = agentSession; + this.textSink = new StdoutTextOutput(); + this.audioSink = new ConsoleAudioOutput(); + this.inputAudio = new ConsoleAudioInput(); + + if (syncTranscription) { + this.transcriptSyncer = new TranscriptionSynchronizer(this.audioSink, this.textSink); + } + + // Set logger to only show warnings and errors in console mode + this.logger.level = 'warn'; + } + + async start(): Promise { + if (this.transcriptSyncer) { + this.updateTextOutput({ enable: true, stdoutEnable: false }); + } + + this.updateMicrophone(true); + this.updateSpeaker(true); + this.renderLoopStart(); + this.stdinStart(); + } + + stop(): void { + this.renderLoopStop(); + this.stdinStop(); + this.updateMicrophone(false); + this.updateSpeaker(false); + } + + private renderLoopStart() { + const interval = 1000 / FPS; + this.loop = setIntervalSafe(() => { + if (this.mode === 'audio' && !this.textSink.isCapturing) { + this.printAudioMode(); + } else if (this.mode === 'text' && !this.textSink.isCapturing) { + this.printTextMode(); + } + }, interval); + } + + private renderLoopStop() { + if (this.loop) clearIntervalSafe(this.loop); + this.loop = null; + } + + private stdinStart() { + if (!process.stdin.isTTY) return; + process.stdin.setRawMode?.(true); + process.stdin.resume(); + process.stdin.setEncoding('utf8'); + process.stdin.on('data', this.onStdinData); + readline.emitKeypressEvents(process.stdin); + } + + private stdinStop() { + try { + process.stdin.off('data', this.onStdinData); + process.stdin.setRawMode?.(false); + } catch {} + } + + private onStdinData = (chunk: string) => { + for (const ch of chunk) { + if (ch === '\u0003') { + this.stop(); + process.exit(0); + return; + } + + if (ch === '\u0002') { + if (this.mode === 'audio') { + this.mode = 'text'; + this.updateTextOutput({ enable: true, stdoutEnable: true }); + this.updateMicrophone(false); + this.updateSpeaker(false); + process.stdout.write('\nSwitched to Text Input Mode.'); + } else { + this.mode = 'audio'; + this.updateTextOutput({ enable: true, stdoutEnable: false }); + this.updateMicrophone(true); + this.updateSpeaker(true); + this.textBuf = []; + process.stdout.write('\nSwitched to Audio Input Mode.'); + } + continue; + } + + if (this.mode === 'text') { + if (ch === '\r' || ch === '\n') { + const text = this.textBuf.join(''); + if (text) { + this.textBuf = []; + try { + this.session.interrupt(); + } catch {} + this.session.generateReply({ userInput: text }); + process.stdout.write('\n'); + } + } else if (ch === '\u007f') { + if (this.textBuf.length) { + this.textBuf.pop(); + process.stdout.write('\b \b'); + } + } else if (isPrintable(ch)) { + this.textBuf.push(ch); + process.stdout.write(ch); + } + } + } + }; + + private updateMicrophone(enable: boolean) { + if (enable) { + this.session.input.audio = this.inputAudio; + if (this.micCheckTimer) clearTimeout(this.micCheckTimer); + this.micCheckTimer = setTimeout(() => this.checkMicReceivedAudio(), 5000); + } else { + this.session.input.audio = null; + } + } + + private updateSpeaker(enable: boolean) { + if (enable) { + this.session.output.audio = this.transcriptSyncer + ? this.transcriptSyncer.audioOutput + : this.audioSink; + } else { + this.session.output.audio = null; + } + } + + private updateTextOutput({ enable, stdoutEnable }: { enable: boolean; stdoutEnable: boolean }) { + if (enable) { + this.session.output.transcription = this.transcriptSyncer + ? this.transcriptSyncer.textOutput + : this.textSink; + this.textSink.setEnabled(stdoutEnable); + } else { + this.session.output.transcription = null; + this.textBuf = []; + } + } + + private checkMicReceivedAudio() { + if (!this.inputAudio.receivedAudio) { + this.logger.error('No audio input detected. Check microphone permissions.'); + } + } + + private printAudioMode() { + const amplitude = clampNormalizeDb(this.inputAudio.microDb, INPUT_DB_MIN, INPUT_DB_MAX); + const nbBar = Math.round(amplitude * MAX_AUDIO_BAR); + const colorCode = amplitude > 0.75 ? 31 : amplitude > 0.5 ? 33 : 32; + const bar = '#'.repeat(nbBar) + '-'.repeat(MAX_AUDIO_BAR - nbBar); + this.currentAudioLine = `[Audio] ${this.micName.slice(-20)} [${this.inputAudio.microDb.toFixed(2)} dBFS] ${esc(colorCode)}[${bar}]${esc(0)}`; + process.stdout.write(`\r${this.currentAudioLine}`); + } + + private printTextMode() { + process.stdout.write('\r'); + const prompt = 'Enter your message: '; + process.stdout.write(`[Text ${prompt}${this.textBuf.join('')}`); + } +} + +function isPrintable(ch: string) { + if (ch.length !== 1) return false; + const code = ch.charCodeAt(0); + return code >= 32 && code !== 127; +} diff --git a/agents/src/voice/io.ts b/agents/src/voice/io.ts index d21044c4..8acccba6 100644 --- a/agents/src/voice/io.ts +++ b/agents/src/voice/io.ts @@ -200,8 +200,18 @@ export class AgentInput { } set audio(stream: AudioInput | null) { + // Detach old stream + if (this._audioStream && this._audioEnabled) { + this._audioStream.onDetached(); + } + this._audioStream = stream; this.audioChanged(); + + // Attach new stream if enabled + if (this._audioStream && this._audioEnabled) { + this._audioStream.onAttached(); + } } } diff --git a/agents/src/voice/native_audio.ts b/agents/src/voice/native_audio.ts new file mode 100644 index 00000000..4209c18a --- /dev/null +++ b/agents/src/voice/native_audio.ts @@ -0,0 +1,684 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { type ChildProcess, spawn } from 'child_process'; +import { EventEmitter } from 'events'; +import * as os from 'os'; +import { Duplex, Readable, Writable } from 'stream'; + +export const SampleFormat8Bit = 8; +export const SampleFormat16Bit = 16; +export const SampleFormat24Bit = 24; +export const SampleFormat32Bit = 32; +export const SampleFormatFloat32 = 1; + +interface AudioOptions { + sampleRate?: number; + channelCount?: number; + sampleFormat?: number; + deviceId?: number; + framesPerBuffer?: number; + closeOnError?: boolean; + highwaterMark?: number; +} + +interface AudioIOOptions { + inOptions?: AudioOptions; + outOptions?: AudioOptions; +} + +function getSampleFormatArgs(sampleFormat: number): { bitDepth: string; encoding: string } { + switch (sampleFormat) { + case SampleFormat8Bit: + return { bitDepth: '8', encoding: 'unsigned-integer' }; + case SampleFormat16Bit: + return { bitDepth: '16', encoding: 'signed-integer' }; + case SampleFormat24Bit: + return { bitDepth: '24', encoding: 'signed-integer' }; + case SampleFormat32Bit: + return { bitDepth: '32', encoding: 'signed-integer' }; + case SampleFormatFloat32: + return { bitDepth: '32', encoding: 'floating-point' }; + default: + return { bitDepth: '16', encoding: 'signed-integer' }; + } +} + +class AudioInputStream extends Readable { + private process: ChildProcess | null = null; + private options: AudioOptions; + private isStarted = false; + private buffer: Buffer[] = []; + private totalBytesRead = 0; + private startTime: number = 0; + + constructor(options: AudioOptions) { + super({ + highWaterMark: options.highwaterMark || 16384, + objectMode: false, + }); + this.options = { + sampleRate: 44100, + channelCount: 2, + sampleFormat: SampleFormat16Bit, + deviceId: -1, + closeOnError: true, + ...options, + }; + } + + start() { + if (this.isStarted) return; + this.isStarted = true; + this.startTime = Date.now(); + this.startRecording(); + } + + private startRecording() { + const { sampleRate, channelCount, sampleFormat } = this.options; + const { bitDepth, encoding } = getSampleFormatArgs(sampleFormat!); + const platform = os.platform(); + + try { + if (platform === 'darwin') { + this.process = spawn( + 'sox', + [ + '-d', + '-r', + String(sampleRate), + '-c', + String(channelCount), + '-b', + bitDepth, + '-e', + encoding, + '-t', + 'raw', + '-', + ], + { + stdio: ['ignore', 'pipe', 'ignore'], + }, + ); + } else if (platform === 'linux') { + const format = + sampleFormat === SampleFormat16Bit + ? 'S16_LE' + : sampleFormat === SampleFormat32Bit + ? 'S32_LE' + : 'S16_LE'; + + this.process = spawn( + 'arecord', + [ + '-f', + format, + '-r', + String(sampleRate), + '-c', + String(channelCount), + '-t', + 'raw', + '-q', + '-', + ], + { + stdio: ['ignore', 'pipe', 'ignore'], + }, + ); + } else if (platform === 'win32') { + const format = + sampleFormat === SampleFormat16Bit + ? 's16le' + : sampleFormat === SampleFormat32Bit + ? 's32le' + : sampleFormat === SampleFormatFloat32 + ? 'f32le' + : 's16le'; + + this.process = spawn( + 'ffmpeg', + [ + '-f', + 'dshow', + '-i', + 'audio="Microphone (Realtek Audio)"', + '-ar', + String(sampleRate), + '-ac', + String(channelCount), + '-f', + format, + '-', + ], + { + stdio: ['ignore', 'pipe', 'ignore'], + }, + ); + } + + if (this.process && this.process.stdout) { + this.process.stdout.on('data', (chunk: Buffer) => { + const timestamp = (Date.now() - this.startTime) / 1000; + (chunk as any).timestamp = timestamp; + this.totalBytesRead += chunk.length; + + if (!this.push(chunk)) { + this.process?.stdout?.pause(); + } + }); + + this.process.stderr?.on('data', (_data) => { + // Ignore stderr output + }); + + this.process.on('error', (err) => { + if (this.options.closeOnError) { + this.destroy(err); + } else { + this.emit('error', err); + } + }); + + this.process.on('exit', (code, _signal) => { + if (code !== 0 && code !== null) { + const err = new Error(`Audio input process exited with code ${code}`); + if (this.options.closeOnError) { + this.destroy(err); + } else { + this.emit('error', err); + } + } + this.push(null); + }); + } + } catch (err) { + if (this.options.closeOnError) { + this.destroy(err as Error); + } else { + this.emit('error', err); + } + } + } + + _read() { + if (this.process?.stdout) { + this.process.stdout.resume(); + } + if (!this.isStarted) { + this.start(); + } + } + + _destroy(err: Error | null, callback: (err: Error | null) => void) { + if (this.process) { + this.process.kill('SIGTERM'); + this.process = null; + } + callback(err); + } + + quit(callback?: () => void) { + this.destroy(); + if (callback) callback(); + } + + abort(callback?: () => void) { + this.destroy(); + if (callback) callback(); + } +} + +class AudioOutputStream extends Writable { + private process: ChildProcess | null = null; + private options: AudioOptions; + private isStarted = false; + private totalBytesWritten = 0; + + constructor(options: AudioOptions) { + super({ + highWaterMark: options.highwaterMark || 16384, + objectMode: false, + decodeStrings: false, + }); + this.options = { + sampleRate: 44100, + channelCount: 2, + sampleFormat: SampleFormat16Bit, + deviceId: -1, + closeOnError: true, + ...options, + }; + } + + start() { + if (this.isStarted) return; + this.isStarted = true; + this.startPlayback(); + } + + private startPlayback() { + const { sampleRate, channelCount, sampleFormat } = this.options; + const { bitDepth, encoding } = getSampleFormatArgs(sampleFormat!); + const platform = os.platform(); + + try { + if (platform === 'darwin') { + this.process = spawn( + 'sox', + [ + '-r', + String(sampleRate), + '-c', + String(channelCount), + '-b', + bitDepth, + '-e', + encoding, + '-t', + 'raw', + '-', + '-d', + ], + { + stdio: ['pipe', 'ignore', 'ignore'], + }, + ); + } else if (platform === 'linux') { + const format = + sampleFormat === SampleFormat16Bit + ? 'S16_LE' + : sampleFormat === SampleFormat32Bit + ? 'S32_LE' + : 'S16_LE'; + + this.process = spawn( + 'aplay', + ['-f', format, '-r', String(sampleRate), '-c', String(channelCount), '-t', 'raw', '-q'], + { + stdio: ['pipe', 'ignore', 'ignore'], + }, + ); + } else if (platform === 'win32') { + const format = + sampleFormat === SampleFormat16Bit + ? 's16le' + : sampleFormat === SampleFormat32Bit + ? 's32le' + : sampleFormat === SampleFormatFloat32 + ? 'f32le' + : 's16le'; + + this.process = spawn( + 'ffmpeg', + [ + '-f', + format, + '-ar', + String(sampleRate), + '-ac', + String(channelCount), + '-i', + '-', + '-f', + 'dsound', + 'default', + ], + { + stdio: ['pipe', 'ignore', 'ignore'], + }, + ); + } + + if (this.process) { + this.process.on('error', (err) => { + if (this.options.closeOnError) { + this.destroy(err); + } else { + this.emit('error', err); + } + }); + + this.process.on('exit', (code) => { + if (code !== 0 && code !== null) { + const err = new Error(`Audio output process exited with code ${code}`); + if (this.options.closeOnError) { + this.destroy(err); + } else { + this.emit('error', err); + } + } + }); + } + } catch (err) { + if (this.options.closeOnError) { + this.destroy(err as Error); + } else { + this.emit('error', err); + } + } + } + + _write(chunk: any, encoding: string, callback: (error?: Error | null) => void) { + if (!this.isStarted) { + this.start(); + } + + if (this.process && this.process.stdin) { + this.totalBytesWritten += chunk.length; + this.process.stdin.write(chunk, callback); + } else { + callback(new Error('Audio output process not initialized')); + } + } + + _destroy(err: Error | null, callback: (err: Error | null) => void) { + if (this.process) { + if (this.process.stdin) { + this.process.stdin.end(); + } + this.process.kill('SIGTERM'); + this.process = null; + } + callback(err); + } + + _final(callback: (error?: Error | null) => void) { + if (this.process && this.process.stdin) { + this.process.stdin.end(); + } + callback(); + } + + quit(callback?: () => void) { + this.end(); + if (callback) callback(); + } + + abort(callback?: () => void) { + this.destroy(); + if (callback) callback(); + } +} + +class AudioDuplexStream extends Duplex { + private inputStream: AudioInputStream; + private outputStream: AudioOutputStream; + + constructor(options: AudioIOOptions) { + const inOpts = options.inOptions || {}; + const outOpts = options.outOptions || {}; + + super({ + allowHalfOpen: false, + readableHighWaterMark: inOpts.highwaterMark || 16384, + writableHighWaterMark: outOpts.highwaterMark || 16384, + objectMode: false, + decodeStrings: false, + }); + + this.inputStream = new AudioInputStream(inOpts); + this.outputStream = new AudioOutputStream(outOpts); + + this.inputStream.on('data', (chunk) => { + if (!this.push(chunk)) { + this.inputStream.pause(); + } + }); + + this.inputStream.on('end', () => { + this.push(null); + }); + + this.inputStream.on('error', (err) => { + this.destroy(err); + }); + + this.outputStream.on('error', (err) => { + this.destroy(err); + }); + } + + start() { + this.inputStream.start(); + this.outputStream.start(); + } + + _read() { + this.inputStream.resume(); + } + + _write(chunk: any, encoding: string, callback: (error?: Error | null) => void) { + this.outputStream.write(chunk, encoding as BufferEncoding, callback); + } + + _destroy(err: Error | null, callback: (err: Error | null) => void) { + this.inputStream.destroy(); + this.outputStream.destroy(); + callback(err); + } + + quit(callback?: () => void) { + this.inputStream.quit(); + this.outputStream.quit(); + if (callback) callback(); + } + + abort(callback?: () => void) { + this.inputStream.abort(); + this.outputStream.abort(); + if (callback) callback(); + } +} + +export class AudioIO extends EventEmitter { + private stream: Readable | Writable | Duplex; + private options: AudioIOOptions; + + constructor(options: AudioIOOptions) { + super(); + this.options = options; + + if (options.inOptions && options.outOptions) { + this.stream = new AudioDuplexStream(options); + } else if (options.inOptions) { + this.stream = new AudioInputStream(options.inOptions); + } else if (options.outOptions) { + this.stream = new AudioOutputStream(options.outOptions); + } else { + throw new Error('AudioIO requires either inOptions or outOptions'); + } + + this.stream.on('error', (err) => { + this.emit('error', err); + }); + + this.stream.on('close', () => { + this.emit('close'); + this.emit('closed'); + }); + + this.stream.on('finish', () => { + this.quit(); + this.emit('finish'); + this.emit('finished'); + }); + } + + start() { + if ('start' in this.stream) { + (this.stream as any).start(); + } + return this.stream; + } + + quit(callback?: () => void) { + if ('quit' in this.stream) { + (this.stream as any).quit(callback); + } else { + this.stream.destroy(); + if (callback) callback(); + } + } + + abort(callback?: () => void) { + if ('abort' in this.stream) { + (this.stream as any).abort(callback); + } else { + this.stream.destroy(); + if (callback) callback(); + } + } + + pipe(destination: T, options?: { end?: boolean }): T { + return this.stream.pipe(destination, options); + } + + unpipe(destination?: NodeJS.WritableStream): this { + (this.stream as Readable).unpipe(destination); + return this; + } + + write( + chunk: any, + encoding?: BufferEncoding | ((error?: Error | null) => void), + callback?: (error?: Error | null) => void, + ): boolean { + if (this.stream instanceof Writable || this.stream instanceof Duplex) { + if (typeof encoding === 'function') { + return this.stream.write(chunk, encoding); + } else if (encoding) { + return this.stream.write(chunk, encoding, callback); + } else { + return this.stream.write(chunk, callback); + } + } + return false; + } + + end(chunk?: any, encoding?: BufferEncoding | (() => void), callback?: () => void): void { + if (this.stream instanceof Writable || this.stream instanceof Duplex) { + if (typeof encoding === 'function') { + this.stream.end(chunk, encoding); + } else if (encoding) { + this.stream.end(chunk, encoding, callback); + } else if (chunk) { + this.stream.end(chunk, callback); + } else { + this.stream.end(); + } + } + } + + on(event: string | symbol, listener: (...args: any[]) => void): this { + if (event === 'data' && (this.stream instanceof Readable || this.stream instanceof Duplex)) { + this.stream.on('data', listener); + } else { + super.on(event, listener); + } + return this; + } + + once(event: string | symbol, listener: (...args: any[]) => void): this { + if (event === 'data' && (this.stream instanceof Readable || this.stream instanceof Duplex)) { + this.stream.once('data', listener); + } else { + super.once(event, listener); + } + return this; + } +} + +export function getDevices(): Array { + const platform = os.platform(); + const devices = []; + + if (platform === 'darwin') { + try { + const _result = spawn('system_profiler', ['SPAudioDataType']); + devices.push({ + id: 0, + name: 'Built-in Microphone', + maxInputChannels: 2, + maxOutputChannels: 0, + defaultSampleRate: 44100, + defaultLowInputLatency: 0.002, + defaultLowOutputLatency: 0.01, + defaultHighInputLatency: 0.012, + defaultHighOutputLatency: 0.1, + hostAPIName: 'Core Audio', + }); + devices.push({ + id: 1, + name: 'Built-in Output', + maxInputChannels: 0, + maxOutputChannels: 2, + defaultSampleRate: 44100, + defaultLowInputLatency: 0.01, + defaultLowOutputLatency: 0.002, + defaultHighInputLatency: 0.1, + defaultHighOutputLatency: 0.012, + hostAPIName: 'Core Audio', + }); + } catch (e) { + // Fall through to defaults + } + } + + if (devices.length === 0) { + devices.push({ + id: -1, + name: 'Default Input Device', + maxInputChannels: 2, + maxOutputChannels: 0, + defaultSampleRate: 44100, + defaultLowInputLatency: 0.01, + defaultLowOutputLatency: 0.01, + defaultHighInputLatency: 0.1, + defaultHighOutputLatency: 0.1, + hostAPIName: 'Default', + }); + devices.push({ + id: -1, + name: 'Default Output Device', + maxInputChannels: 0, + maxOutputChannels: 2, + defaultSampleRate: 44100, + defaultLowInputLatency: 0.01, + defaultLowOutputLatency: 0.01, + defaultHighInputLatency: 0.1, + defaultHighOutputLatency: 0.1, + hostAPIName: 'Default', + }); + } + + return devices; +} + +export function getHostAPIs(): any { + const platform = os.platform(); + let hostAPIName = 'Default'; + + if (platform === 'darwin') { + hostAPIName = 'Core Audio'; + } else if (platform === 'win32') { + hostAPIName = 'MME'; + } else if (platform === 'linux') { + hostAPIName = 'ALSA'; + } + + return { + defaultHostAPI: 0, + HostAPIs: [ + { + id: 0, + name: hostAPIName, + type: hostAPIName, + deviceCount: 2, + defaultInput: 0, + defaultOutput: 1, + }, + ], + }; +} diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index d981973f..3f658b2d 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -29,7 +29,6 @@ export default defineAgent({ }); const vad = ctx.proc.userData.vad! as silero.VAD; - const session = new voice.AgentSession({ vad, stt: new deepgram.STT(), diff --git a/turbo.json b/turbo.json index 147df0d2..2a46b4c5 100644 --- a/turbo.json +++ b/turbo.json @@ -33,7 +33,8 @@ "GOOGLE_GENAI_API_KEY", "GOOGLE_GENAI_USE_VERTEXAI", "GOOGLE_CLOUD_PROJECT", - "GOOGLE_CLOUD_LOCATION" + "GOOGLE_CLOUD_LOCATION", + "LIVEKIT_ENABLE_CONSOLE_MODE" ], "pipeline": { "build": {