71 changes: 67 additions & 4 deletions aisuite-js/README.md
@@ -13,6 +13,7 @@ npm package - `npm i aisuite`
- **Streaming**: Real-time streaming responses with consistent API
- **Type Safety**: Full TypeScript support with comprehensive type definitions
- **Error Handling**: Unified error handling across providers
- **Speech-to-Text**: Automatic Speech Recognition (ASR) support with multiple providers (OpenAI Whisper, Deepgram)

## Installation

@@ -26,8 +27,11 @@ npm install aisuite
import { Client } from 'aisuite';

const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY,
},
anthropic: { apiKey: process.env.ANTHROPIC_API_KEY },
deepgram: { apiKey: process.env.DEEPGRAM_API_KEY },
});

// Use any provider with identical interface
@@ -143,6 +147,41 @@ try {
}
```

### Speech-to-Text Transcription

```typescript
// Initialize the client with ASR-capable providers (OpenAI and Deepgram)
const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY,
},
deepgram: { apiKey: process.env.DEEPGRAM_API_KEY }
});

// Using Deepgram
const deepgramResponse = await client.audio.transcriptions.create({
model: 'deepgram:nova-2',
file: audioBuffer, // Buffer containing audio data
language: 'en-US',
timestamps: true,
word_confidence: true,
speaker_labels: true,
});

// Using OpenAI Whisper
const openaiResponse = await client.audio.transcriptions.create({
model: 'openai:whisper-1',
file: audioBuffer,
language: 'en',
response_format: 'verbose_json',
temperature: 0,
timestamps: true,
});

console.log('Transcribed Text:', openaiResponse.text);
console.log('Words with timestamps:', openaiResponse.words);
```

### Error Handling

```typescript
@@ -173,11 +212,15 @@ const client = new Client({
openai?: {
apiKey: string;
baseURL?: string;
organization?: string;
},
anthropic?: {
apiKey: string;
baseURL?: string;
},
deepgram?: {
apiKey: string;
baseURL?: string;
}
});
```
@@ -199,21 +242,41 @@ interface ChatCompletionRequest {
}
```

### Transcription Request

All ASR providers accept a common transcription request format; provider-specific options can be passed through as additional fields:

```typescript
interface TranscriptionRequest {
model: string; // "provider:model" format
file: Buffer; // Audio file as Buffer
language?: string; // Language code (e.g., "en", "en-US")
timestamps?: boolean; // Include word-level timestamps
[key: string]: any; // Additional provider-specific parameters:
// For OpenAI: See https://platform.openai.com/docs/api-reference/audio/createTranscription
// For Deepgram: See https://developers.deepgram.com/reference/speech-to-text-api/listen
}
```
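
For example, a provider-specific option can be sent alongside the standard fields. A minimal sketch, assuming unrecognized fields are forwarded unchanged to the underlying provider (`smart_format` here is a Deepgram option, not part of the common interface):

```typescript
// Hypothetical sketch: standard fields plus a Deepgram-specific passthrough option.
const response = await client.audio.transcriptions.create({
  model: 'deepgram:nova-2',
  file: audioBuffer,    // Buffer with the audio data
  language: 'en-US',
  timestamps: true,
  smart_format: true,   // Deepgram-specific; assumed to be passed through as-is
});
console.log(response.text);
```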

### Helper Methods

```typescript
// List all configured providers (including ASR)
client.listProviders(); // ['openai', 'anthropic']
client.listASRProviders(); // ['deepgram', 'openai']

// Check if a provider is configured
client.isProviderConfigured('openai'); // true
client.isASRProviderConfigured('deepgram'); // true
```
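
A small sketch of how these helpers can guard a transcription call (client and `audioBuffer` as in the examples above):

```typescript
// Only attempt transcription when the ASR provider is actually configured.
if (client.isASRProviderConfigured('deepgram')) {
  const result = await client.audio.transcriptions.create({
    model: 'deepgram:nova-2',
    file: audioBuffer,
  });
  console.log(result.text);
} else {
  console.warn('Deepgram is not configured; skipping transcription.');
}
```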

## Current Limitations

- Only OpenAI and Anthropic providers are currently supported for chat (Gemini, Mistral, and Bedrock coming soon)
- Tool calling requires handling tool responses manually
- Streaming tool calls require manual accumulation of arguments
- ASR support is limited to OpenAI Whisper (requires explicit audio configuration) and Deepgram
- Some ASR features require passing provider-specific parameters (see the provider documentation linked above)

## Development

73 changes: 73 additions & 0 deletions aisuite-js/examples/deepgram.ts
@@ -0,0 +1,73 @@
import { Client } from "../src";
import * as fs from "fs";
import * as path from "path";

async function main() {
// Initialize the client with Deepgram configuration
// Using Deepgram SDK v4.11.2 with the new createClient API
const client = new Client({
deepgram: {
apiKey: process.env.DEEPGRAM_API_KEY || "your-deepgram-api-key",
},
});

console.log("Available ASR providers:", client.listASRProviders());

// Example: Transcribe an audio file
try {
// Path to a test audio file (replace with your own recording)
const testAudioPath = path.join("test-audio.wav");

// Check that the test file exists; if not, print a hint and exit
if (!fs.existsSync(testAudioPath)) {
console.log(
"Test audio file not found. Please provide a valid audio file for transcription."
);
console.log("Expected path:", testAudioPath);
return;
}

// Read the file as a buffer
const audioBuffer = fs.readFileSync(testAudioPath);

// Create the transcription request with the audio buffer
const result = await client.audio.transcriptions.create({
model: "deepgram:general",
file: audioBuffer,
language: "en-US",
timestamps: true,
word_confidence: true,
speaker_labels: true,
});

console.log("Transcription Result:");
console.log("Text:", result.text);
console.log("Language:", result.language);
console.log("Confidence:", result.confidence);

if (result.words && result.words.length > 0) {
console.log("\nWords with timestamps:");
result.words.slice(0, 5).forEach((word, index) => {
console.log(
`${index + 1}. "${word.text}" (${word.start}s - ${
word.end
}s, confidence: ${word.confidence})`
);
});
}

if (result.segments && result.segments.length > 0) {
console.log("\nSegments:");
result.segments.forEach((segment, index) => {
console.log(
`${index + 1}. [${segment.start}s - ${segment.end}s] ${segment.text}`
);
});
}
} catch (error) {
console.error("Error during transcription:", error);
}
}

main().catch(console.error);

68 changes: 68 additions & 0 deletions aisuite-js/examples/openai-asr.ts
@@ -0,0 +1,68 @@
import { Client } from "../src";
import * as fs from "fs";
import * as path from "path";

async function main() {
// Initialize the client with OpenAI configuration
const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY!,
},
});

console.log("Available ASR providers:", client.listASRProviders());

// Example: Transcribe an audio file
try {
// Path to your audio file
const testAudioPath = path.join("test-audio.wav");

// Check if test file exists
if (!fs.existsSync(testAudioPath)) {
console.log(
"Test audio file not found. Please provide a valid audio file for transcription."
);
console.log("Expected path:", testAudioPath);
return;
}

const audioBuffer = fs.readFileSync(testAudioPath);

// Transcribe using OpenAI Whisper model
const result = await client.audio.transcriptions.create({
model: "openai:whisper-1",
file: audioBuffer,
language: "en",
response_format: "verbose_json",
temperature: 0,
timestamps: true,
});

console.log("Transcription Result:");
console.log("Text:", result.text);
console.log("Language:", result.language);
console.log("Confidence:", result.confidence);

if (result.words && result.words.length > 0) {
console.log("\nWords with timestamps:");
result.words.slice(0, 5).forEach((word, index) => {
console.log(
`${index + 1}. "${word.text}" (${word.start}s - ${word.end}s, confidence: ${word.confidence})`
);
});
}

if (result.segments && result.segments.length > 0) {
console.log("\nSegments:");
result.segments.slice(0, 3).forEach((segment, index) => {
console.log(
`${index + 1}. "${segment.text}" (${segment.start}s - ${segment.end}s)`
);
});
}
} catch (error) {
console.error("Error:", error);
}
}

main().catch(console.error);
3 changes: 3 additions & 0 deletions aisuite-js/package.json
@@ -13,12 +13,15 @@
"example:streaming": "tsx examples/streaming.ts",
"example:mistral": "tsx examples/mistral.ts",
"example:groq": "tsx examples/groq.ts",
"example:deepgram": "tsx examples/deepgram.ts",
"example:openai-asr": "tsx examples/openai-asr.ts",
"lint": "eslint src/**/*.ts",
"prepublishOnly": "npm run build",
"dev": "tsc --watch"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.56.0",
"@deepgram/sdk": "^4.11.2",
"@mistralai/mistralai": "^0.1.3",
"groq-sdk": "^0.29.0",
"openai": "^4.0.0"
53 changes: 53 additions & 0 deletions aisuite-js/src/asr-providers/deepgram/adapters.ts
@@ -0,0 +1,53 @@
import { TranscriptionResult, Word, Segment } from "../../types";

export function adaptResponse(response: any): TranscriptionResult {
const words: Word[] = [];
const segments: Segment[] = [];

// Handle Deepgram response structure
if (response.results?.channels?.[0]?.alternatives?.[0]) {
const alternative = response.results.channels[0].alternatives[0];

// Extract words with timestamps and confidence
if (alternative.words) {
alternative.words.forEach((word: any) => {
words.push({
text: word.word,
start: word.start,
end: word.end,
confidence: word.confidence,
speaker: word.speaker?.toString(),
});
});
}

// Extract utterances/segments
if (response.results.utterances) {
response.results.utterances.forEach((utterance: any) => {
segments.push({
text: utterance.transcript,
start: utterance.start,
end: utterance.end,
speaker: utterance.speaker?.toString(),
});
});
}

return {
text: alternative.transcript,
language: response.metadata?.language || "unknown",
confidence: alternative.confidence,
words,
segments,
};
}

// Fallback for unexpected response structure
return {
text: response.transcript || "",
language: "unknown",
confidence: undefined,
words: [],
segments: [],
};
}
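
For reference, a minimal usage sketch of `adaptResponse` with a Deepgram-shaped payload; the field names mirror what the adapter reads above, and the sample values are purely illustrative:

```typescript
import { adaptResponse } from "./adapters";

// Illustrative Deepgram-style response containing only the fields the adapter reads.
const raw = {
  metadata: { language: "en" },
  results: {
    channels: [
      {
        alternatives: [
          {
            transcript: "hello world",
            confidence: 0.98,
            words: [
              { word: "hello", start: 0.0, end: 0.42, confidence: 0.99, speaker: 0 },
              { word: "world", start: 0.48, end: 0.9, confidence: 0.97, speaker: 0 },
            ],
          },
        ],
      },
    ],
    utterances: [{ transcript: "hello world", start: 0.0, end: 0.9, speaker: 0 }],
  },
};

const normalized = adaptResponse(raw);
// normalized.text === "hello world"
// normalized.words[0].speaker === "0" (speaker ids are converted to strings)
// normalized.segments[0] covers the single utterance above
```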
2 changes: 2 additions & 0 deletions aisuite-js/src/asr-providers/deepgram/index.ts
@@ -0,0 +1,2 @@
export { DeepgramASRProvider } from "./provider";
export type { DeepgramConfig } from "./types";