11"""Protocol and types for LLM response caching."""
22
3- from dataclasses import dataclass
3+ from dataclasses import dataclass , field
44from datetime import datetime
5+ from enum import Enum
56from typing import Dict , List , Optional , Protocol , Union
67
78from fenic ._inference .types import (
89 FenicCompletionsRequest ,
910 FenicCompletionsResponse ,
1011 FenicEmbeddingsRequest ,
12+ FenicEmbeddingsResponse ,
1113 ResponseUsage ,
1214)
1315
1416
class ResponseType(str, Enum):
    """Discriminator for the kind of payload held by a cache entry.

    Subclassing ``str`` keeps the members JSON-serializable and directly
    comparable to their plain-string values.

    Attributes:
        COMPLETION: A completion response from a language model.
        EMBEDDING: An embedding response from an embedding model.
    """

    COMPLETION = "completion"
    EMBEDDING = "embedding"
27+
28+
1529@dataclass
1630class CachedResponse :
1731 """Cached LLM response with metadata.
1832
33+ Supports both completion and embedding responses. Either `completion` or
34+ `embedding` must be set, determined by `response_type`.
35+
1936 Attributes:
20- completion: The completion text from the LLM.
37+ completion: The completion text from the LLM (for completion responses).
38+ embedding: The embedding vector (for embedding responses).
39+ response_type: Type of response (ResponseType enum).
2140 model: The model that generated this response.
2241 cached_at: Timestamp when this response was cached.
2342 prompt_tokens: Number of prompt tokens (if available).
2443 completion_tokens: Number of completion tokens (if available).
2544 total_tokens: Total number of tokens (if available).
2645 cached_tokens: Number of cached tokens (default: 0).
2746 thinking_tokens: Number of thinking tokens (default: 0).
28- logprobs: Token log probabilities (if available).
47+ logprobs: Token log probabilities (if available, completion only ).
2948 access_count: Number of times this cached response has been accessed.
3049
3150 Example:
32- Creating a cached response:
51+ Creating a cached completion response:
3352
3453 ```python
54+ from fenic._inference.cache.protocol import ResponseType
55+
3556 cached = CachedResponse(
3657 completion="Hello, world!",
58+ response_type=ResponseType.COMPLETION,
3759 model="gpt-4o-mini",
3860 cached_at=datetime.now(),
3961 prompt_tokens=10,
4062 completion_tokens=5,
4163 total_tokens=15,
4264 )
4365 ```
66+
67+ Creating a cached embedding response:
68+
69+ ```python
70+ from fenic._inference.cache.protocol import ResponseType
71+
72+ cached = CachedResponse(
73+ embedding=[0.1, 0.2, 0.3],
74+ response_type=ResponseType.EMBEDDING,
75+ model="text-embedding-3-small",
76+ cached_at=datetime.now(),
77+ prompt_tokens=10,
78+ total_tokens=10,
79+ )
80+ ```
4481 """
4582
46- completion : str
47- model : str
48- cached_at : datetime
49- prompt_tokens : Optional [int ]
50- completion_tokens : Optional [int ]
51- total_tokens : Optional [int ]
83+ completion : Optional [str ] = None
84+ embedding : Optional [List [float ]] = None
85+ response_type : ResponseType = ResponseType .COMPLETION
86+ model : str = ""
87+ cached_at : datetime = field (default_factory = datetime .now )
88+ prompt_tokens : Optional [int ] = None
89+ completion_tokens : Optional [int ] = None
90+ total_tokens : Optional [int ] = None
5291 cached_tokens : int = 0
5392 thinking_tokens : int = 0
5493 logprobs : Optional [list ] = None
5594 access_count : int = 0
5695
57- def to_fenic_response (self ) -> FenicCompletionsResponse :
96+ def to_fenic_completion_response (self ) -> FenicCompletionsResponse :
5897 """Convert cached response to FenicCompletionsResponse.
5998
6099 Returns:
61100 FenicCompletionsResponse with cached data and usage information.
62101
102+ Raises:
103+ ValueError: If this is not a completion response.
104+
63105 Example:
64106 ```python
107+ from fenic._inference.cache.protocol import ResponseType
108+
65109 cached = CachedResponse(
66110 completion="Hello!",
111+ response_type=ResponseType.COMPLETION,
67112 model="gpt-4o-mini",
68113 cached_at=datetime.now(),
69114 prompt_tokens=10,
70115 completion_tokens=5,
71116 total_tokens=15,
72117 )
73- response = cached.to_fenic_response ()
118+ response = cached.to_fenic_completion_response ()
74119 ```
75120 """
121+ if self .response_type != ResponseType .COMPLETION or self .completion is None :
122+ raise ValueError ("This cached response is not a completion response" )
76123 usage = None
77124 if self .prompt_tokens is not None :
78125 usage = ResponseUsage (
@@ -89,6 +136,50 @@ def to_fenic_response(self) -> FenicCompletionsResponse:
89136 usage = usage ,
90137 )
91138
139+ def to_fenic_embedding_response (self ) -> List [float ]:
140+ """Convert cached response to embedding list.
141+
142+ Returns:
143+ List of floats representing the embedding vector.
144+
145+ Raises:
146+ ValueError: If this is not an embedding response.
147+
148+ Example:
149+ ```python
150+ from fenic._inference.cache.protocol import ResponseType
151+
152+ cached = CachedResponse(
153+ embedding=[0.1, 0.2, 0.3],
154+ response_type=ResponseType.EMBEDDING,
155+ model="text-embedding-3-small",
156+ cached_at=datetime.now(),
157+ prompt_tokens=10,
158+ total_tokens=10,
159+ )
160+ embedding = cached.to_fenic_embedding_response()
161+ ```
162+ """
163+ if self .response_type != ResponseType .EMBEDDING or self .embedding is None :
164+ raise ValueError ("This cached response is not an embedding response" )
165+ return self .embedding
166+
167+ def to_fenic_response (self ) -> Union [FenicCompletionsResponse , List [float ]]:
168+ """Convert cached response to appropriate Fenic response type.
169+
170+ Returns:
171+ FenicCompletionsResponse for completion responses, or List[float] for embedding responses.
172+
173+ Example:
174+ ```python
175+ cached = CachedResponse(...)
176+ response = cached.to_fenic_response()
177+ ```
178+ """
179+ if self .response_type == ResponseType .EMBEDDING :
180+ return self .to_fenic_embedding_response ()
181+ return self .to_fenic_completion_response ()
182+
92183
93184@dataclass
94185class CacheStats :
@@ -199,14 +290,14 @@ def get_batch(self, cache_keys: List[str]) -> Dict[str, Optional[CachedResponse]
199290 def set (
200291 self ,
201292 cache_key : str ,
202- response : FenicCompletionsResponse ,
293+ response : Union [ FenicCompletionsResponse , FenicEmbeddingsResponse ] ,
203294 model : str ,
204295 ) -> bool :
205296 """Store response in cache.
206297
207298 Args:
208299 cache_key: Unique key for the response.
209- response: The response to cache.
300+ response: The response to cache (completion or embedding) .
210301 model: The model that generated this response.
211302
212303 Returns:
@@ -219,12 +310,13 @@ def set(
219310 ...
220311
221312 def set_batch (
222- self , entries : List [tuple [str , FenicCompletionsResponse , str ]]
313+ self , entries : List [tuple [str , Union [ FenicCompletionsResponse , FenicEmbeddingsResponse ] , str ]]
223314 ) -> int :
224315 """Store multiple responses in cache.
225316
226317 Args:
227- entries: List of (cache_key, response, model) tuples.
318+ entries: List of (cache_key, response, model) tuples. Responses can be
319+ either FenicCompletionsResponse or FenicEmbeddingsResponse.
228320
229321 Returns:
230322 Count of successfully stored entries.
0 commit comments