Skip to content

Commit 24e9846

Browse files
Merge pull request #243 from andrewyng/rp15/asr-passthrough-api
Python ASR Changes - Allow parameter passthrough to provider SDKs. Add doc on API design choice. Updated tests and packages. Deepgram v5 support.
2 parents ddd5a18 + 38962ff commit 24e9846

File tree

15 files changed

+1887
-385
lines changed

15 files changed

+1887
-385
lines changed

aisuite/client.py

Lines changed: 68 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
from .provider import ProviderFactory
22
import os
33
from .utils.tools import Tools
4-
from typing import Union, BinaryIO, Optional, Any
4+
from typing import Union, BinaryIO, Optional, Any, Literal
55
from .framework.message import (
6-
TranscriptionOptions,
76
TranscriptionResponse,
87
)
8+
from .framework.asr_params import ParamValidator
99

1010

1111
class Client:
12-
def __init__(self, provider_configs: dict = {}):
12+
def __init__(
13+
self,
14+
provider_configs: dict = {},
15+
extra_param_mode: Literal["strict", "warn", "permissive"] = "warn",
16+
):
1317
"""
1418
Initialize the client with provider configurations.
1519
Use the ProviderFactory to create provider instances.
@@ -27,9 +31,15 @@ def __init__(self, provider_configs: dict = {}):
2731
"aws_region": "us-west-2"
2832
}
2933
}
34+
extra_param_mode (str): How to handle unknown ASR parameters.
35+
- "strict": Raise ValueError on unknown params (production)
36+
- "warn": Log warning on unknown params (default, development)
37+
- "permissive": Allow all params without validation (testing)
3038
"""
3139
self.providers = {}
3240
self.provider_configs = provider_configs
41+
self.extra_param_mode = extra_param_mode
42+
self.param_validator = ParamValidator(extra_param_mode)
3343
self._chat = None
3444
self._audio = None
3545

@@ -282,51 +292,73 @@ def create(
282292
*,
283293
model: str,
284294
file: Union[str, BinaryIO],
285-
options: Optional[TranscriptionOptions] = None,
286295
**kwargs,
287296
) -> TranscriptionResponse:
288297
"""
289-
Create a transcription using the specified model and file.
298+
Create audio transcription with parameter validation.
299+
300+
This method uses a pass-through approach with validation:
301+
- Common parameters (OpenAI-style) are auto-mapped to provider equivalents
302+
- Provider-specific parameters are passed through directly
303+
- Unknown parameters are handled based on extra_param_mode
290304
291305
Args:
292306
model: Provider and model in format 'provider:model' (e.g., 'openai:whisper-1')
293307
file: Audio file to transcribe (file path or file-like object)
294-
options: TranscriptionOptions instance with unified parameters (includes stream control)
295-
**kwargs: Additional parameters (used if options is None, assumed to be OpenAI format)
308+
**kwargs: Transcription parameters (provider-specific or common)
309+
Common parameters (portable across providers):
310+
- language: Language code (e.g., "en")
311+
- prompt: Context for the transcription
312+
- temperature: Sampling temperature (0-1, OpenAI only)
313+
Provider-specific parameters are passed through directly.
314+
See provider documentation for valid parameters.
296315
297316
Returns:
298-
TranscriptionResponse: Unified response (batch or streaming based on options.stream)
317+
TranscriptionResponse: Unified response (batch or streaming)
318+
319+
Raises:
320+
ValueError: If model format invalid, provider not supported,
321+
or unknown params in strict mode
322+
323+
Examples:
324+
# Portable code (OpenAI-style params)
325+
>>> result = client.audio.transcriptions.create(
326+
... model="openai:whisper-1",
327+
... file="audio.mp3",
328+
... language="en"
329+
... )
330+
331+
# Provider-specific features
332+
>>> result = client.audio.transcriptions.create(
333+
... model="deepgram:nova-2",
334+
... file="audio.mp3",
335+
... language="en", # Common param
336+
... punctuate=True, # Deepgram-specific
337+
... diarize=True # Deepgram-specific
338+
... )
299339
"""
300-
# Validate options and kwargs
301-
if options is not None:
302-
if not options.has_any_parameters():
303-
raise ValueError(
304-
"TranscriptionOptions provided but no parameters are set. "
305-
"Please set at least one parameter or pass None to use kwargs."
306-
)
307-
# TranscriptionOptions takes precedence, ignore kwargs
308-
if kwargs:
309-
import warnings
310-
311-
warnings.warn(
312-
"Both TranscriptionOptions and kwargs provided. Using TranscriptionOptions and ignoring kwargs.",
313-
UserWarning,
314-
)
315-
elif not kwargs:
316-
# Neither options nor kwargs provided
317-
raise ValueError(
318-
"Either TranscriptionOptions or kwargs must be provided for transcription parameters."
319-
)
320-
321-
# Check that correct format is used
340+
# Validate model format
322341
if ":" not in model:
323342
raise ValueError(
324343
f"Invalid model format. Expected 'provider:model', got '{model}'"
325344
)
326345

327-
# Extract the provider key from the model identifier
346+
# Extract provider and model name
328347
provider_key, model_name = model.split(":", 1)
329348

349+
# Validate provider is supported
350+
supported_providers = ProviderFactory.get_supported_providers()
351+
if provider_key not in supported_providers:
352+
raise ValueError(
353+
f"Invalid provider key '{provider_key}'. "
354+
f"Supported providers: {supported_providers}"
355+
)
356+
357+
# Validate and map parameters
358+
validated_params = self.client.param_validator.validate_and_map(
359+
provider_key, kwargs
360+
)
361+
330362
# Initialize provider if not already initialized
331363
if provider_key not in self.client.providers:
332364
config = self.client.provider_configs.get(provider_key, {})
@@ -348,33 +380,29 @@ def create(
348380
)
349381

350382
# Determine if streaming is requested
351-
should_stream = False # Default to batch processing
352-
if options and options.stream is not None:
353-
should_stream = options.stream
354-
elif kwargs.get("stream"):
355-
should_stream = kwargs.get("stream", False)
383+
should_stream = validated_params.get("stream", False)
356384

357-
# Delegate the transcription to the correct provider's implementation
385+
# Delegate to provider implementation
358386
try:
359387
if should_stream:
360388
# Check if provider supports output streaming
361389
if hasattr(provider.audio, "transcriptions") and hasattr(
362390
provider.audio.transcriptions, "create_stream_output"
363391
):
364392
return provider.audio.transcriptions.create_stream_output(
365-
model_name, file, options=options, **kwargs
393+
model_name, file, **validated_params
366394
)
367395
else:
368396
raise ValueError(
369-
f"Provider '{provider_key}' does not support output streaming transcription."
397+
f"Provider '{provider_key}' does not support streaming transcription."
370398
)
371399
else:
372400
# Non-streaming (batch) transcription
373401
if hasattr(provider.audio, "transcriptions") and hasattr(
374402
provider.audio.transcriptions, "create"
375403
):
376404
return provider.audio.transcriptions.create(
377-
model_name, file, options=options, **kwargs
405+
model_name, file, **validated_params
378406
)
379407
else:
380408
raise ValueError(
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# ASR - API Parameter Design Philosophy
2+
3+
## Design Goal: Portable Code with Provider Flexibility
4+
5+
The ASR parameter system is designed around a core principle: **developers should write portable code that works across providers, while retaining the ability to use provider-specific features when needed**. This document explains the rationale behind our parameter classification and validation approach.
6+
7+
---
8+
9+
## Mandatory Parameters and Common Mappings
10+
11+
### The Foundation: Minimal Requirements
12+
13+
Every transcription needs just two things:
14+
- **`model`**: Which model/provider to use
15+
- **`file`**: What audio to transcribe
16+
17+
By keeping mandatory parameters minimal, we maximize compatibility and reduce the barrier to getting started.
18+
19+
### Common Parameters: Write Once, Run Anywhere
20+
21+
Beyond the basics, there are concepts that exist across providers but use different names or formats. We handle three common parameters (`language`, `prompt`, and `temperature`) that auto-map to each provider's native API, as the example below shows.
22+
23+
**Example: Same code, different providers**
24+
25+
```python
26+
# Works with OpenAI
27+
result = client.audio.transcriptions.create(
28+
model="openai:whisper-1",
29+
file="meeting.mp3",
30+
language="en",
31+
prompt="discussion about API design"
32+
)
33+
34+
# Exact same code works with Deepgram
35+
result = client.audio.transcriptions.create(
36+
model="deepgram:nova-2",
37+
file="meeting.mp3",
38+
language="en",
39+
prompt="discussion about API design"
40+
)
41+
```
42+
43+
Behind the scenes:
44+
- **`language`** passes through as `language` for both OpenAI and Deepgram, but expands to `language_code: "en-US"` for Google
45+
- **`prompt`** passes as `prompt` to OpenAI, transforms to `keywords: ["discussion", "about", "API", "design"]` for Deepgram, and becomes `speech_contexts: [{"phrases": ["discussion about API design"]}]` for Google
46+
- **`temperature`** passes through to OpenAI (which supports it) and is silently ignored by Deepgram and Google (which don't)
47+
48+
**Why auto-mapping?** Developers shouldn't need to remember that Google uses `language_code` while others use `language`, or that Deepgram expects a list of keywords. The framework handles these provider quirks transparently, letting you write portable code.
49+
50+
---
51+
52+
## Provider-Specific Features: Pass-Through for Power Users
53+
54+
Each provider has unique features that give them competitive advantages. We don't limit you to the "lowest common denominator" - if you need provider-specific functionality, it's available:
55+
56+
**Deepgram's advanced features:**
57+
```python
58+
result = client.audio.transcriptions.create(
59+
model="deepgram:nova-2",
60+
file="meeting.mp3",
61+
language="en",
62+
punctuate=True, # Deepgram-specific
63+
diarize=True, # Deepgram-specific
64+
sentiment=True, # Deepgram-specific
65+
smart_format=True # Deepgram-specific
66+
)
67+
```
68+
69+
**Google's speech contexts:**
70+
```python
71+
result = client.audio.transcriptions.create(
72+
model="google:latest_long",
73+
file="meeting.mp3",
74+
language_code="en-US",
75+
enable_automatic_punctuation=True, # Google-specific
76+
max_alternatives=3, # Google-specific
77+
speech_contexts=[{"phrases": ["API", "SDK", "REST"]}] # Google-specific
78+
)
79+
```
80+
81+
These provider-specific parameters pass through directly to the provider's SDK. The framework validates them based on your configured mode (see next section), but doesn't block access to unique features.
82+
83+
---
84+
85+
## Progressive Validation: Safety When You Need It
86+
87+
The validation system supports three modes to match different development stages:
88+
89+
### Development Mode: `"warn"` (Default)
90+
```python
91+
client = Client(extra_param_mode="warn")
92+
```
93+
Unknown parameters trigger warnings, but execution continues. Perfect for exploration and prototyping. You see *"OpenAI doesn't support 'punctuate'"* but your code keeps running.
94+
95+
### Strict Mode: `"strict"`
96+
```python
97+
client = Client(extra_param_mode="strict")
98+
```
99+
Unknown parameters raise errors immediately. Use in production to catch typos, configuration mistakes, or provider API changes early. Ensures no silent failures.
100+
101+
### Permissive Mode: `"permissive"`
102+
```python
103+
client = Client(extra_param_mode="permissive")
104+
```
105+
All parameters pass through without validation. Use for beta features, experimental parameters, or when providers add new capabilities faster than framework updates.
106+
107+
**Progressive workflow:**
108+
1. **Develop** with `warn` - explore freely, see warnings
109+
2. **Refactor** - fix warnings to make code portable
110+
3. **Deploy** with `strict` - ensure production safety
111+
112+
---
113+
114+
## Developer Experience Benefits
115+
116+
### 1. Write Portable Code Naturally
117+
The same parameter names work across providers. Switch from OpenAI to Deepgram by changing one word: the model identifier.
118+
119+
### 2. Progressive Enhancement
120+
Start with portable common parameters. Add provider-specific features only where you need them. Your core logic remains portable even when using advanced features for specific providers.
121+
122+
### 3. Zero Framework Lock-in
123+
Apart from the three auto-mapped common parameters, parameter names come directly from provider APIs, not framework abstractions. If you need to remove the framework, you already know the native API - the provider-specific names are identical.
124+
125+
### 4. Validation That Adapts to You
126+
Choose your safety level based on context. Strict for production, warn for development, permissive for bleeding-edge features. The framework supports your workflow rather than constraining it.
127+
128+
### 5. No Documentation Friction
129+
Copy parameters from provider docs directly. No need to learn our abstraction layer or figure out mappings - we handle the common cases, you use native names for everything else.
130+
131+
---
132+
133+
## Alternative Design Considered
134+
135+
We considered creating a unified options object (`TranscriptionOptions`) that explicitly defines all parameters with framework-specific names. We chose pass-through instead because:
136+
137+
1. **Provider APIs evolve faster than frameworks** - New parameters appear frequently. Pass-through lets developers use them immediately (in permissive mode) without waiting for framework updates.
138+
139+
2. **Provider features don't map cleanly** - Deepgram's sentiment analysis, Google's complex speech contexts, OpenAI's timestamp granularities - each is unique. A unified object means either losing functionality or creating complex provider-specific abstractions.
140+
141+
3. **Direct API access reduces friction** - Developers already know their provider's API from official docs. They can use parameter names directly rather than learning another abstraction layer.
142+
143+
The pass-through approach with progressive validation provides the best of both worlds: portability for common cases, power for advanced features, and safety when you need it.
144+
145+
---
146+
147+
## Design Principles Summary
148+
149+
- **Mandatory Minimal**: Only `model` and `file` required
150+
- **Common Auto-Mapped**: Frequent cross-provider concepts map transparently
151+
- **Provider-Specific Pass-Through**: Unique features remain accessible
152+
- **Progressive Validation**: Three modes for different development stages
153+
- **Zero Abstraction Tax**: Use provider APIs directly with optional safety nets
154+
155+
This design prioritizes developer experience through portability without sacrificing power, validation without blocking experimentation, and simplicity without limiting functionality.

0 commit comments

Comments
 (0)