 
 from fenic._inference.common_openai.openai_utils import convert_messages
 from fenic._inference.common_openai.utils import handle_openai_compatible_response
+from fenic._inference.google.gemini_token_counter import GeminiLocalTokenCounter
 from fenic._inference.model_client import (
     FatalException,
     ModelClient,
 )
 from fenic._inference.openrouter.openrouter_provider import OpenRouterModelProvider
 from fenic._inference.rate_limit_strategy import (
+    MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST,
     AdaptiveBackoffRateLimitStrategy,
     RateLimitStrategy,
     TokenEstimate,
@@ -87,6 +89,11 @@ def __init__(
         self._aio_client = OpenRouterModelProvider().aio_client
         self._metrics = LMMetrics()
 
+        self._google_token_counter = None
+        provider_and_model = model.split("/")
+        if provider_and_model[0] == "google":
+            self._google_token_counter = GeminiLocalTokenCounter(model_name=provider_and_model[1])
+
     async def make_single_request(
         self, request: FenicCompletionsRequest
     ) -> Union[None, FenicCompletionsResponse, TransientException, FatalException]:
@@ -238,8 +245,8 @@ def estimate_tokens_for_request(
         self, request: FenicCompletionsRequest
     ) -> TokenEstimate:
         return TokenEstimate(
-            input_tokens=self.token_counter.count_tokens(request.messages),
-            output_tokens=self.token_counter.count_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request),
+            input_tokens=self._estimate_input_tokens(request),
+            output_tokens=self._estimate_output_tokens(request),
         )
 
     def reset_metrics(self):
@@ -248,15 +255,52 @@ def reset_metrics(self):
     def get_metrics(self) -> LMMetrics:
         return self._metrics
 
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of output tokens for a request."""
+        base_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            if self._google_token_counter:
+                base_tokens += self._google_token_counter.count_file_output_tokens(messages=request.messages)
+            else:
+                base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
+        return base_tokens + self._get_expected_additional_reasoning_tokens(request)
+
     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
         """Get the upper limit of output tokens for a request.
 
-        If max_completion_tokens is not set, don't apply a limit and return None.
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).
 
         Include the thinking token budget with a safety margin."""
-        if request.max_completion_tokens is None:
-            return None
-        return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)
+        max_output_tokens = request.max_completion_tokens
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # Guardrail to ensure the model uses a sane amount of output tokens.
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
+        return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)
+
+    def _estimate_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens for a request."""
+        if self._google_token_counter:
+            input_tokens = self._google_token_counter.count_tokens(request.messages, ignore_file=True)
+        else:
+            input_tokens = self.token_counter.count_tokens(request.messages)
+        if request.messages.user_file:
+            input_tokens += self._estimate_file_input_tokens(request)
+        return input_tokens
+
+    def _estimate_file_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens from a file in a request."""
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        if profile_config.parsing_engine and profile_config.parsing_engine == "native":
+            if self._google_token_counter:
+                return self._google_token_counter.count_file_input_tokens(messages=request.messages)
+            else:
+                return self.token_counter.count_file_input_tokens(messages=request.messages)
+        # OpenRouter's engine tool processes the file first and passes annotated text to the model.
+        # We can estimate by extracting the text and tokenizing it (which is what count_file_output_tokens does)
+        return self.token_counter.count_file_output_tokens(messages=request.messages)
 
     # This is a slightly less conservative estimate than the OpenRouter documentation on how reasoning_effort is used to
     # generate a reasoning.max_tokens for models that only support reasoning.max_tokens.
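
A minimal standalone sketch of the two output-token paths this commit distinguishes, for reference. The function names and flat signatures below are hypothetical simplifications, not fenic's actual classes; only the 8192 guardrail value is taken from the docstring in the diff above.

# Sketch (hypothetical helpers): estimate vs. hard limit for a file-parsing request.
MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST = 8192

def estimate_output_tokens(max_completion_tokens, has_file, file_text_tokens, reasoning_tokens):
    # Rate-limit estimate: with no caller-supplied cap and a file attached,
    # use the tokenized file text as a proxy for the expected output size.
    base = max_completion_tokens or 0
    if max_completion_tokens is None and has_file:
        base += file_text_tokens
    return base + reasoning_tokens

def max_output_token_request_limit(max_completion_tokens, has_file, reasoning_tokens):
    # Hard request limit: the file may be empty or contain only scanned images,
    # so fall back to a fixed guardrail instead of a content-based estimate.
    limit = max_completion_tokens
    if max_completion_tokens is None and has_file:
        limit = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
    return limit + reasoning_tokens

# A PDF-parse request with no explicit cap, ~3,500 tokens of extracted text,
# and a 1,024-token reasoning budget:
print(estimate_output_tokens(None, True, 3500, 1024))    # 4524
print(max_output_token_request_limit(None, True, 1024))  # 9216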