11"""Client for making batch requests to OpenRouter's chat completions API."""
22
3+ import importlib .util
34import logging
45import math
56from json .decoder import JSONDecodeError
@@ -65,16 +66,28 @@ def __init__(
         profiles: Optional[dict[str, object]] = None,
         default_profile_name: Optional[str] = None,
     ):
+        # Choose a token counter based on the model's provider.
+        token_counter = None
+        provider_and_model = model.split("/")
+        if provider_and_model[0] == "google" and importlib.util.find_spec("google.genai") is not None:
+            # If fenic is built with the google module, use the GeminiLocalTokenCounter.
+            # Otherwise, fall back to the TiktokenTokenCounter.
+            from fenic._inference.google.gemini_token_counter import (
+                GeminiLocalTokenCounter,
+            )
+            token_counter = GeminiLocalTokenCounter(model_name=provider_and_model[1])
+        else:
+            token_counter = TiktokenTokenCounter(
+                model_name=provider_and_model[1], fallback_encoding="o200k_base"
+            )
         super().__init__(
             model=model,
             model_provider=ModelProvider.OPENROUTER,
             model_provider_class=OpenRouterModelProvider(),
             rate_limit_strategy=rate_limit_strategy,
             queue_size=queue_size,
             max_backoffs=max_backoffs,
-            token_counter=TiktokenTokenCounter(
-                model_name=model, fallback_encoding="o200k_base"
-            ),
+            token_counter=token_counter,
         )
         self._model_parameters = model_catalog.get_completion_model_parameters(
             ModelProvider.OPENROUTER, model
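
For reference, the provider-based selection added above boils down to the following sketch (illustrative only, not part of the commit; the helper name _choose_token_counter and the example model strings are hypothetical, the GeminiLocalTokenCounter import path is taken from the diff, and TiktokenTokenCounter is assumed to already be imported elsewhere in this module):

import importlib.util

def _choose_token_counter(model: str):
    # e.g. "google/gemini-2.5-pro" -> ["google", "gemini-2.5-pro"]
    provider_and_model = model.split("/")
    if provider_and_model[0] == "google" and importlib.util.find_spec("google.genai") is not None:
        # google.genai is only importable when fenic is installed with the google extra.
        from fenic._inference.google.gemini_token_counter import GeminiLocalTokenCounter
        return GeminiLocalTokenCounter(model_name=provider_and_model[1])
    # Any other provider, or a google model without the google extra installed.
    return TiktokenTokenCounter(model_name=provider_and_model[1], fallback_encoding="o200k_base")
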
@@ -87,17 +100,22 @@ def __init__(
         self._aio_client = OpenRouterModelProvider().aio_client
         self._metrics = LMMetrics()
 
+
+
     async def make_single_request(
         self, request: FenicCompletionsRequest
     ) -> Union[None, FenicCompletionsResponse, TransientException, FatalException]:
         profile = self._profile_manager.get_profile_by_name(request.model_profile)
         common_params = {
             "model": self.model,
             "messages": convert_messages(request.messages),
-            "max_completion_tokens": self._get_max_output_token_request_limit(request),
             "n": 1,
         }
 
+        max_completion_tokens = self._get_max_output_token_request_limit(request)
+        if max_completion_tokens is not None:
+            common_params["max_completion_tokens"] = max_completion_tokens
+
         if request.top_logprobs:
             common_params.update(
                 {"logprobs": True, "top_logprobs": request.top_logprobs}
@@ -238,8 +256,8 @@ def estimate_tokens_for_request(
         self, request: FenicCompletionsRequest
     ) -> TokenEstimate:
         return TokenEstimate(
-            input_tokens=self.token_counter.count_tokens(request.messages),
-            output_tokens=self.token_counter.count_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request),
+            input_tokens=self._estimate_input_tokens(request),
+            output_tokens=self._estimate_output_tokens(request),
         )
 
     def reset_metrics(self):
@@ -248,16 +266,39 @@ def reset_metrics(self):
     def get_metrics(self) -> LMMetrics:
         return self._metrics
 
-    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
-        """Get the upper limit of output tokens for a request.
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of output tokens for a request."""
+        base_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
+        return base_tokens + self._get_expected_additional_reasoning_tokens(request)
 
-        If max_completion_tokens is not set, don't apply a limit and return None.
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
+        """Return the maximum output token limit for a request.
 
-        Include the thinking token budget with a safety margin."""
+        Returns None if max_completion_tokens is not provided (no limit should be set).
+        If max_completion_tokens is provided, includes the thinking token budget with a safety margin."""
         if request.max_completion_tokens is None:
             return None
         return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)
 
+    def _estimate_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens for a request."""
+        input_tokens = self.token_counter.count_tokens(request.messages, ignore_file=True)
+        if request.messages.user_file:
+            input_tokens += self._estimate_file_input_tokens(request)
+        return input_tokens
+
+    def _estimate_file_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens from a file in a request."""
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        if profile_config.parsing_engine and profile_config.parsing_engine == "native":
+            return self.token_counter.count_file_input_tokens(messages=request.messages)
+        # OpenRouter's engine tool processes the file first and passes annotated text to the model.
+        # We can estimate by extracting the text and tokenizing it (which is what count_file_output_tokens does).
+        return self.token_counter.count_file_output_tokens(messages=request.messages)
+
     # This is a slightly less conservative estimate than the OpenRouter documentation on how reasoning_effort is used to
     # generate a reasoning.max_tokens for models that only support reasoning.max_tokens.
     # These percentages are slightly lower, since our use-cases generally require fewer reasoning tokens.
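
To make the interplay of the new helpers concrete, here is a hypothetical walk-through (invented token counts; the names mirror the diff) for a request that attaches a file and leaves max_completion_tokens unset:

# No cap is sent to OpenRouter, because _get_max_output_token_request_limit returns None.
request_limit = None
# _estimate_output_tokens falls back to the file-derived text tokens plus the expected reasoning margin.
file_output_tokens = 1200   # stand-in for count_file_output_tokens(messages=request.messages)
reasoning_tokens = 400      # stand-in for _get_expected_additional_reasoning_tokens(request)
estimated_output_tokens = file_output_tokens + reasoning_tokens  # 1600, reported in TokenEstimate

With max_completion_tokens=500 instead, both the request limit and the output estimate start from 500, and only the reasoning margin is added on top.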