Skip to content

Commit ff01bf4

Browse files
committed
feat: add embedding response caching support
- Add FenicEmbeddingsResponse type
- Update CachedResponse to support both completions and embeddings
- Add ResponseType enum for type safety
- Implement embedding cache key computation
- Update SQLite schema to store embeddings
- Update ModelClient to handle embedding caching
- Add comprehensive tests for embedding caching
1 parent 79ea09b commit ff01bf4

File tree

5 files changed

+424
-73
lines changed

5 files changed

+424
-73
lines changed

src/fenic/_inference/cache/protocol.py

Lines changed: 108 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,125 @@
11
"""Protocol and types for LLM response caching."""
22

3-
from dataclasses import dataclass
3+
from dataclasses import dataclass, field
44
from datetime import datetime
5+
from enum import Enum
56
from typing import Dict, List, Optional, Protocol, Union
67

78
from fenic._inference.types import (
89
FenicCompletionsRequest,
910
FenicCompletionsResponse,
1011
FenicEmbeddingsRequest,
12+
FenicEmbeddingsResponse,
1113
ResponseUsage,
1214
)
1315

1416

17+
class ResponseType(str, Enum):
    """Kind of response stored in the cache.

    Because this derives from both ``str`` and ``Enum``, members compare
    equal to their string values, which keeps serialized cache rows simple.

    Attributes:
        COMPLETION: A text completion produced by a language model.
        EMBEDDING: An embedding vector produced by an embedding model.
    """

    COMPLETION = "completion"
    EMBEDDING = "embedding"
27+
28+
1529
@dataclass
1630
class CachedResponse:
1731
"""Cached LLM response with metadata.
1832
33+
Supports both completion and embedding responses. Either `completion` or
34+
`embedding` must be set, determined by `response_type`.
35+
1936
Attributes:
20-
completion: The completion text from the LLM.
37+
completion: The completion text from the LLM (for completion responses).
38+
embedding: The embedding vector (for embedding responses).
39+
response_type: Type of response (ResponseType enum).
2140
model: The model that generated this response.
2241
cached_at: Timestamp when this response was cached.
2342
prompt_tokens: Number of prompt tokens (if available).
2443
completion_tokens: Number of completion tokens (if available).
2544
total_tokens: Total number of tokens (if available).
2645
cached_tokens: Number of cached tokens (default: 0).
2746
thinking_tokens: Number of thinking tokens (default: 0).
28-
logprobs: Token log probabilities (if available).
47+
logprobs: Token log probabilities (if available, completion only).
2948
access_count: Number of times this cached response has been accessed.
3049
3150
Example:
32-
Creating a cached response:
51+
Creating a cached completion response:
3352
3453
```python
54+
from fenic._inference.cache.protocol import ResponseType
55+
3556
cached = CachedResponse(
3657
completion="Hello, world!",
58+
response_type=ResponseType.COMPLETION,
3759
model="gpt-4o-mini",
3860
cached_at=datetime.now(),
3961
prompt_tokens=10,
4062
completion_tokens=5,
4163
total_tokens=15,
4264
)
4365
```
66+
67+
Creating a cached embedding response:
68+
69+
```python
70+
from fenic._inference.cache.protocol import ResponseType
71+
72+
cached = CachedResponse(
73+
embedding=[0.1, 0.2, 0.3],
74+
response_type=ResponseType.EMBEDDING,
75+
model="text-embedding-3-small",
76+
cached_at=datetime.now(),
77+
prompt_tokens=10,
78+
total_tokens=10,
79+
)
80+
```
4481
"""
4582

46-
completion: str
47-
model: str
48-
cached_at: datetime
49-
prompt_tokens: Optional[int]
50-
completion_tokens: Optional[int]
51-
total_tokens: Optional[int]
83+
completion: Optional[str] = None
84+
embedding: Optional[List[float]] = None
85+
response_type: ResponseType = ResponseType.COMPLETION
86+
model: str = ""
87+
cached_at: datetime = field(default_factory=datetime.now)
88+
prompt_tokens: Optional[int] = None
89+
completion_tokens: Optional[int] = None
90+
total_tokens: Optional[int] = None
5291
cached_tokens: int = 0
5392
thinking_tokens: int = 0
5493
logprobs: Optional[list] = None
5594
access_count: int = 0
5695

57-
def to_fenic_response(self) -> FenicCompletionsResponse:
96+
def to_fenic_completion_response(self) -> FenicCompletionsResponse:
5897
"""Convert cached response to FenicCompletionsResponse.
5998
6099
Returns:
61100
FenicCompletionsResponse with cached data and usage information.
62101
102+
Raises:
103+
ValueError: If this is not a completion response.
104+
63105
Example:
64106
```python
107+
from fenic._inference.cache.protocol import ResponseType
108+
65109
cached = CachedResponse(
66110
completion="Hello!",
111+
response_type=ResponseType.COMPLETION,
67112
model="gpt-4o-mini",
68113
cached_at=datetime.now(),
69114
prompt_tokens=10,
70115
completion_tokens=5,
71116
total_tokens=15,
72117
)
73-
response = cached.to_fenic_response()
118+
response = cached.to_fenic_completion_response()
74119
```
75120
"""
121+
if self.response_type != ResponseType.COMPLETION or self.completion is None:
122+
raise ValueError("This cached response is not a completion response")
76123
usage = None
77124
if self.prompt_tokens is not None:
78125
usage = ResponseUsage(
@@ -89,6 +136,50 @@ def to_fenic_response(self) -> FenicCompletionsResponse:
89136
usage=usage,
90137
)
91138

139+
def to_fenic_embedding_response(self) -> List[float]:
    """Return the cached embedding vector.

    Returns:
        The embedding as a list of floats.

    Raises:
        ValueError: If this cached entry is not an embedding response
            (wrong `response_type` or no `embedding` payload).

    Example:
        ```python
        from fenic._inference.cache.protocol import ResponseType

        cached = CachedResponse(
            embedding=[0.1, 0.2, 0.3],
            response_type=ResponseType.EMBEDDING,
            model="text-embedding-3-small",
            cached_at=datetime.now(),
            prompt_tokens=10,
            total_tokens=10,
        )
        embedding = cached.to_fenic_embedding_response()
        ```
    """
    # Positive guard: both the declared type and the payload must agree.
    has_embedding = self.embedding is not None
    if self.response_type == ResponseType.EMBEDDING and has_embedding:
        return self.embedding
    raise ValueError("This cached response is not an embedding response")
166+
167+
def to_fenic_response(self) -> Union[FenicCompletionsResponse, List[float]]:
    """Convert this cached entry to the matching Fenic response type.

    Dispatches on `response_type`: embedding entries yield the raw vector,
    anything else is treated as a completion.

    Returns:
        FenicCompletionsResponse for completion responses, or List[float]
        for embedding responses.

    Example:
        ```python
        cached = CachedResponse(...)
        response = cached.to_fenic_response()
        ```
    """
    # Select the bound converter once, then invoke it.
    converter = (
        self.to_fenic_embedding_response
        if self.response_type == ResponseType.EMBEDDING
        else self.to_fenic_completion_response
    )
    return converter()
182+
92183

93184
@dataclass
94185
class CacheStats:
@@ -199,14 +290,14 @@ def get_batch(self, cache_keys: List[str]) -> Dict[str, Optional[CachedResponse]
199290
def set(
200291
self,
201292
cache_key: str,
202-
response: FenicCompletionsResponse,
293+
response: Union[FenicCompletionsResponse, FenicEmbeddingsResponse],
203294
model: str,
204295
) -> bool:
205296
"""Store response in cache.
206297
207298
Args:
208299
cache_key: Unique key for the response.
209-
response: The response to cache.
300+
response: The response to cache (completion or embedding).
210301
model: The model that generated this response.
211302
212303
Returns:
@@ -219,12 +310,13 @@ def set(
219310
...
220311

221312
def set_batch(
222-
self, entries: List[tuple[str, FenicCompletionsResponse, str]]
313+
self, entries: List[tuple[str, Union[FenicCompletionsResponse, FenicEmbeddingsResponse], str]]
223314
) -> int:
224315
"""Store multiple responses in cache.
225316
226317
Args:
227-
entries: List of (cache_key, response, model) tuples.
318+
entries: List of (cache_key, response, model) tuples. Responses can be
319+
either FenicCompletionsResponse or FenicEmbeddingsResponse.
228320
229321
Returns:
230322
Count of successfully stored entries.

0 commit comments

Comments
 (0)