
Commit 6c4fcee

feat: Add pdf_parsing to openrouter
1 parent: f2170a5

File tree: 11 files changed (+139 −45 lines)

.github/workflows/test.yaml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ jobs:
     env:
       GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
     strategy:
       matrix: ${{ fromJson(needs.matrix.outputs.matrix) }}
     steps:

src/fenic/_inference/google/gemini_token_counter.py

Lines changed: 4 additions & 4 deletions
@@ -40,7 +40,7 @@ def __init__(self, model_name: str, fallback_encoding: str = "gemini-2.5-flash")
         except ValueError:
             self.google_tokenizer = LocalTokenizer(model_name=fallback_encoding)
 
-    def count_tokens(self, messages: Tokenizable) -> int:
+    def count_tokens(self, messages: Tokenizable, ignore_file: bool = False) -> int:
         """Count tokens for a string, message list, or `LMRequestMessages`.
 
         Args:
@@ -53,7 +53,7 @@ def count_tokens(self, messages: Tokenizable) -> int:
         if isinstance(messages, str):
             return self._count_text_tokens(messages)
         elif isinstance(messages, LMRequestMessages):
-            return self._count_request_tokens(messages)
+            return self._count_request_tokens(messages, ignore_file)
 
     def count_file_input_tokens(self, messages: LMRequestMessages) -> int:
         # Gemini 2.0 charges 258 tokens per page for all PDF inputs. For more detail, see https://gemini-api.apidog.io/doc-965859#technical-details
@@ -68,7 +68,7 @@ def count_file_output_tokens(self, messages: LMRequestMessages) -> int:
         # In our estimates we add buffer, both for markdown structure and in case we ask the model to describe images.
         return self.google_tokenizer.count_tokens(text).total_tokens
 
-    def _count_request_tokens(self, messages: LMRequestMessages) -> int:
+    def _count_request_tokens(self, messages: LMRequestMessages, ignore_file: bool = False) -> int:
         """Count tokens for an `LMRequestMessages` object."""
         contents = convert_text_messages(messages)
         tokens = 0
@@ -79,7 +79,7 @@ def _count_request_tokens(self, messages: LMRequestMessages) -> int:
         ).total_tokens
         tokens += count_tokens
 
-        if messages.user_file:
+        if messages.user_file and not ignore_file:
             tokens += self.count_file_input_tokens(messages)
         return tokens
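For reference, the per-page accounting mentioned in count_file_input_tokens above can be sketched as follows (an illustration only, not code from this commit; the helper name and page_count argument are hypothetical):

PDF_TOKENS_PER_PAGE = 258  # flat per-page charge for Gemini 2.0 PDF inputs, per the comment above

def estimate_gemini_pdf_input_tokens(page_count: int) -> int:
    # e.g. a 10-page PDF would be estimated at 2,580 input tokens
    return page_count * PDF_TOKENS_PER_PAGE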

src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py

Lines changed: 51 additions & 10 deletions
@@ -1,5 +1,6 @@
 """Client for making batch requests to OpenRouter's chat completions API."""
 
+import importlib.util
 import logging
 import math
 from json.decoder import JSONDecodeError
@@ -65,16 +66,28 @@ def __init__(
         profiles: Optional[dict[str, object]] = None,
         default_profile_name: Optional[str] = None,
     ):
+        # Choose token counter based on the model's provider
+        token_counter = None
+        provider_and_model = model.split("/")
+        if provider_and_model[0] == "google" and importlib.util.find_spec("google.genai") is not None:
+            # If fenic is built with google module, use the GeminiLocalTokenCounter.
+            # Otherwise, fall back to the TiktokenTokenCounter.
+            from fenic._inference.google.gemini_token_counter import (
+                GeminiLocalTokenCounter,
+            )
+            token_counter = GeminiLocalTokenCounter(model_name=provider_and_model[1])
+        else:
+            token_counter = TiktokenTokenCounter(
+                model_name=provider_and_model[1], fallback_encoding="o200k_base"
+            )
         super().__init__(
             model=model,
             model_provider=ModelProvider.OPENROUTER,
             model_provider_class=OpenRouterModelProvider(),
             rate_limit_strategy=rate_limit_strategy,
            queue_size=queue_size,
             max_backoffs=max_backoffs,
-            token_counter=TiktokenTokenCounter(
-                model_name=model, fallback_encoding="o200k_base"
-            ),
+            token_counter=token_counter,
         )
         self._model_parameters = model_catalog.get_completion_model_parameters(
             ModelProvider.OPENROUTER, model
@@ -87,17 +100,22 @@ def __init__(
         self._aio_client = OpenRouterModelProvider().aio_client
         self._metrics = LMMetrics()
 
+
+
     async def make_single_request(
         self, request: FenicCompletionsRequest
     ) -> Union[None, FenicCompletionsResponse, TransientException, FatalException]:
         profile = self._profile_manager.get_profile_by_name(request.model_profile)
         common_params = {
             "model": self.model,
             "messages": convert_messages(request.messages),
-            "max_completion_tokens": self._get_max_output_token_request_limit(request),
             "n": 1,
         }
 
+        max_completion_tokens = self._get_max_output_token_request_limit(request)
+        if max_completion_tokens is not None:
+            common_params["max_completion_tokens"] = max_completion_tokens
+
         if request.top_logprobs:
             common_params.update(
                 {"logprobs": True, "top_logprobs": request.top_logprobs}
@@ -238,8 +256,8 @@ def estimate_tokens_for_request(
         self, request: FenicCompletionsRequest
     ) -> TokenEstimate:
         return TokenEstimate(
-            input_tokens=self.token_counter.count_tokens(request.messages),
-            output_tokens=self.token_counter.count_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request),
+            input_tokens=self._estimate_input_tokens(request),
+            output_tokens=self._estimate_output_tokens(request),
         )
 
     def reset_metrics(self):
@@ -248,16 +266,39 @@ def reset_metrics(self):
     def get_metrics(self) -> LMMetrics:
         return self._metrics
 
-    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
-        """Get the upper limit of output tokens for a request.
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of output tokens for a request."""
+        base_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
+        return base_tokens + self._get_expected_additional_reasoning_tokens(request)
 
-        If max_completion_tokens is not set, don't apply a limit and return None.
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
+        """Return the maximum output token limit for a request.
 
-        Include the thinking token budget with a safety margin."""
+        Returns None if max_completion_tokens is not provided (no limit should be set).
+        If max_completion_tokens is provided, includes the thinking token budget with a safety margin."""
         if request.max_completion_tokens is None:
             return None
         return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)
 
+    def _estimate_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens for a request."""
+        input_tokens = self.token_counter.count_tokens(request.messages, ignore_file=True)
+        if request.messages.user_file:
+            input_tokens += self._estimate_file_input_tokens(request)
+        return input_tokens
+
+    def _estimate_file_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens from a file in a request."""
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        if profile_config.parsing_engine and profile_config.parsing_engine == "native":
+            return self.token_counter.count_file_input_tokens(messages=request.messages)
+        # OpenRouter's engine tool processes the file first and passes annotated text to the model.
+        # We can estimate by extracting the text and tokenizing it (which is what count_file_output_tokens does)
+        return self.token_counter.count_file_output_tokens(messages=request.messages)
+
     # This is a slightly less conservative estimate than the OpenRouter documentation on how reasoning_effort is used to
     # generate a reasoning.max_tokens for models that only support reasoning.max_tokens.
     # These percentages are slightly lower, since our use-cases generally require fewer reasoning tokens.
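To summarize the constructor change above: the provider prefix of the OpenRouter model id now selects the token counter. A minimal sketch of that routing (simplified from the diff; the helper function is hypothetical, and google.genai may or may not be installed):

import importlib.util

from fenic._inference.token_counter import TiktokenTokenCounter


def choose_token_counter(model: str):
    provider, model_name = model.split("/", maxsplit=1)
    if provider == "google" and importlib.util.find_spec("google.genai") is not None:
        # Gemini models use the local Gemini tokenizer when the google extra is installed.
        from fenic._inference.google.gemini_token_counter import GeminiLocalTokenCounter
        return GeminiLocalTokenCounter(model_name=model_name)
    # Everything else falls back to tiktoken with the o200k_base encoding.
    return TiktokenTokenCounter(model_name=model_name, fallback_encoding="o200k_base")

For example, "google/gemini-2.5-flash" routes to GeminiLocalTokenCounter, while "openai/gpt-4o-mini" routes to TiktokenTokenCounter.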

src/fenic/_inference/openrouter/openrouter_profile_manager.py

Lines changed: 18 additions & 0 deletions
@@ -20,6 +20,7 @@
     ResolvedOpenRouterProviderRouting,
 )
 from fenic.core.types.provider_routing import StructuredOutputStrategy
+from fenic.core.types.semantic import ParsingEngine
 
 
 @dataclass
@@ -32,6 +33,7 @@ class OpenRouterCompletionProfileConfiguration(BaseProfileConfiguration):
     models: Optional[list[str]] = None
     provider: Optional[ResolvedOpenRouterProviderRouting] = None
     structured_output_strategy: Optional[StructuredOutputStrategy] = None
+    parsing_engine: Optional[ParsingEngine] = None
 
     @property
     def extra_body(self) -> dict[str, Any]:
@@ -70,6 +72,14 @@ def extra_body(self) -> dict[str, Any]:
         if reasoning_obj:
             reasoning_obj["exclude"] = True
             params["reasoning"] = reasoning_obj
+        params["plugins"] = [
+            {
+                "id": "file-parser",
+                "pdf": {
+                    "engine": self.parsing_engine or "native"
+                }
+            }
+        ]
         return params
 
 
@@ -116,13 +126,21 @@ def _process_profile(
             and profile.reasoning_max_tokens is None
         ):
             profile.reasoning_effort = "low"
+
+        pdf_page_processing_cost = None
+        if profile.parsing_engine and profile.parsing_engine == "mistral-ocr":
+            pdf_page_processing_cost = 2/1000
+        elif profile.parsing_engine and profile.parsing_engine == "pdf-text":
+            pdf_page_processing_cost = 0
 
         return OpenRouterCompletionProfileConfiguration(
             models=profile.models,
             provider=profile.provider,
             reasoning_effort=profile.reasoning_effort,
             reasoning_max_tokens=profile.reasoning_max_tokens,
             structured_output_strategy=profile.structured_output_strategy,
+            parsing_engine=profile.parsing_engine,
+            pdf_page_processing_cost=pdf_page_processing_cost,
         )
 
     def get_default_profile(self) -> OpenRouterCompletionProfileConfiguration:
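For illustration, the extra_body change above attaches OpenRouter's file-parser plugin to the request; with parsing_engine="mistral-ocr" the resulting fragment looks like this (a restatement of the diff, not additional code from the commit):

plugins = [
    {
        "id": "file-parser",
        "pdf": {"engine": "mistral-ocr"},  # falls back to "native" when no engine is configured
    }
]

_process_profile also records a per-page PDF processing cost: 2/1000 USD per page for "mistral-ocr" ($2 per 1,000 pages), 0 for "pdf-text", and None when no engine is set.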

src/fenic/_inference/openrouter/openrouter_provider.py

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ def _translate_model(
             supports_reasoning=supports_reasoning,
             supports_custom_temperature=supports_custom_temperature,
             supports_verbosity=supports_verbosity,
+            supports_pdf_parsing=True,  # Even if the model doesn't support pdf file processing, OpenRouter can use its separate processing engines
             supported_parameters=supported_params,
         )
src/fenic/_inference/token_counter.py

Lines changed: 5 additions & 5 deletions
@@ -10,7 +10,7 @@
 Tokenizable = Union[str | LMRequestMessages]
 
 class TokenCounter(Protocol):
-    def count_tokens(self, messages: Tokenizable) -> int: ...
+    def count_tokens(self, messages: Tokenizable, ignore_file: bool = False) -> int: ...
     def count_file_input_tokens(self, messages: LMRequestMessages) -> int: ...
     def count_file_output_tokens(self, messages: LMRequestMessages) -> int: ...
 
@@ -22,11 +22,11 @@ def __init__(self, model_name: str, fallback_encoding: str = "o200k_base"):
         except KeyError:
             self.tokenizer = tiktoken.get_encoding(fallback_encoding)
 
-    def count_tokens(self, messages: Tokenizable) -> int:
+    def count_tokens(self, messages: Tokenizable, ignore_file: bool = False) -> int:
         if isinstance(messages, str):
             return len(self.tokenizer.encode(messages))
         elif isinstance(messages, LMRequestMessages):
-            return self._count_message_tokens(messages)
+            return self._count_message_tokens(messages, ignore_file)
         else:
             raise TypeError(f"Expected str or LMRequestMessages, got {type(messages)}")
 
@@ -55,7 +55,7 @@ def count_file_output_tokens(self, messages: LMRequestMessages) -> int:
         else:
             raise InternalError(f"File{messages.user_file.path}'s extension is not supported for llm completions.")
 
-    def _count_message_tokens(self, messages: LMRequestMessages) -> int:
+    def _count_message_tokens(self, messages: LMRequestMessages, ignore_file: bool = False) -> int:
         num_tokens = 0
         message_count = 2 # system message and user parent message
         num_tokens += self.count_tokens(messages.system)
@@ -66,7 +66,7 @@ def _count_message_tokens(self, messages: LMRequestMessages) -> int:
             num_tokens += self.count_tokens(example.user)
             num_tokens += self.count_tokens(example.assistant)
             message_count += 2
-        if messages.user_file:
+        if messages.user_file and not ignore_file:
             num_tokens += self.count_file_input_tokens(messages)
             message_count += 1
         num_tokens += message_count * PREFIX_TOKENS_PER_MESSAGE
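Together with the client changes above, the new ignore_file flag lets callers price the prompt and an attached file separately. A minimal sketch mirroring the OpenRouter client's _estimate_input_tokens (the messages argument is assumed to be an LMRequestMessages already built elsewhere):

from fenic._inference.token_counter import TokenCounter


def estimate_input_tokens(counter: TokenCounter, messages) -> int:
    # Count prompt tokens without the attached file, then add the file's
    # input-token estimate only when a file is present.
    prompt_tokens = counter.count_tokens(messages, ignore_file=True)
    if messages.user_file:
        prompt_tokens += counter.count_file_input_tokens(messages)
    return prompt_tokens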

src/fenic/api/session/config.py

Lines changed: 8 additions & 0 deletions
@@ -644,6 +644,7 @@ class Profile(BaseModel):
             ge=1024,
         )
 
+ParsingEngine = Literal["mistral-ocr", "pdf-text", "native"]
 
 class OpenRouterLanguageModel(BaseModel):
     """Configuration for OpenRouter language models.
@@ -788,6 +789,8 @@ class Profile(BaseModel):
             If the model does support reasoning, but not `reasoning_max_tokens`, a `reasoning_effort_ will be automatically
             calculated based on `reasoning_max_tokens` as a percentage of the model's maximum output size
             ([OpenRouter Documentation](https://openrouter.ai/docs/use-cases/reasoning-tokens#max-tokens-for-reasoning))
+            parsing_engine: The parsing engine to use for processing PDF files. By default, the model's native parsing engine will be used. If the model doesn't support PDF processing and the parsing engine is not provided, an error will be raised. Note: 'mistral-ocr' incurs additional costs.
+                ([OpenRouter Documentation](https://openrouter.ai/docs/features/multimodal/pdfs))
         """
         model_config = ConfigDict(extra="forbid")
 
@@ -807,6 +810,10 @@ class Profile(BaseModel):
         provider: Optional[OpenRouterLanguageModel.Provider] = Field(
             default=None, description="Provider routing configuration"
         )
+        parsing_engine: Optional[ParsingEngine] = Field(
+            default=None,
+            description="The parsing engine to use for processing PDF files. By default, the model's native parsing engine will be used."
+        )
 
 
 CohereEmbeddingTaskType = Literal[
@@ -1341,6 +1348,7 @@ def resolve_model(model: ModelConfig) -> ResolvedModelConfig:
                 reasoning_effort=profile.reasoning_effort,
                 reasoning_max_tokens=profile.reasoning_max_tokens,
                 models=profile.models,
+                parsing_engine=profile.parsing_engine,
                 provider=(
                     ResolvedOpenRouterProviderRouting(
                         **(profile.provider.model_dump())
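A hedged usage sketch of the new field: a profile that routes PDFs through mistral-ocr. Only parsing_engine comes from this commit; the nested Profile access and the omitted session wiring are assumptions about fenic's existing config API:

from fenic.api.session.config import OpenRouterLanguageModel

# Hypothetical profile; "pdf-text" or "native" are the other accepted values.
pdf_profile = OpenRouterLanguageModel.Profile(parsing_engine="mistral-ocr")

Per the docstring above, omitting parsing_engine for a model without native PDF support raises an error, and "mistral-ocr" incurs the additional per-page cost noted in the profile manager.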

src/fenic/core/_inference/model_catalog.py

Lines changed: 1 addition & 3 deletions
@@ -32,7 +32,6 @@ def __init__(
         self.cached_input_token_write_cost = cached_input_token_write_cost
         self.output_token_cost = output_token_cost
 
-
 class CompletionModelParameters:
     """Parameters for completion models including costs and context window size.
 
@@ -42,6 +41,7 @@ class CompletionModelParameters:
         cached_input_token_write_cost: Cost per cached input token write in USD
         output_token_cost: Cost per output token in USD
         context_window_length: Maximum number of tokens in the context window
+        tiered_token_costs: Tiered token costs for the model.
         max_output_tokens: Maximum number of tokens the model can generate in a single request.
         max_temperature: Maximum temperature for the model.
         supports_profiles: Whether the model supports parameter profiles.
@@ -897,7 +897,6 @@ def _initialize_google_gla_models(self):
                 context_window_length=1_048_576,
                 max_output_tokens=8_192,
                 max_temperature=2.0,
-                supports_profiles=False,
                 supports_pdf_parsing=True,
             ),
             snapshots=["gemini-2.0-flash-lite-001"],
@@ -914,7 +913,6 @@
                 context_window_length=1_048_576,
                 max_output_tokens=8_192,
                 max_temperature=2.0,
-                supports_profiles=False,
                 supports_pdf_parsing=True,
             ),
             snapshots=["gemini-2.0-flash-001", "gemini-2.0-flash-exp"],

src/fenic/core/_resolved_session_config.py

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,7 @@
     ProviderSort,
     StructuredOutputStrategy,
 )
+from fenic.core.types.semantic import ParsingEngine
 
 ReasoningEffort = Literal["minimal", "low", "medium", "high"]
 Verbosity = Literal["low", "medium", "high"]
@@ -79,6 +80,7 @@ class ResolvedOpenRouterModelProfile:
     models: Optional[list[str]] = None
     provider: Optional[ResolvedOpenRouterProviderRouting] = None
     structured_output_strategy: Optional[StructuredOutputStrategy] = None
+    parsing_engine: Optional[ParsingEngine] = None
 
 
 @dataclass
@@ -128,6 +130,7 @@ class ResolvedOpenRouterModelConfig:
     profiles: Optional[dict[str, ResolvedOpenRouterModelProfile]] = None
     model_provider: ModelProvider = ModelProvider.OPENROUTER
     default_profile: Optional[str] = None
+    parsing_engine: Optional[ParsingEngine] = None
 
 
 ResolvedModelConfig = Union[

src/fenic/core/types/semantic.py

Lines changed: 2 additions & 1 deletion
@@ -1,12 +1,13 @@
 """Types used to configure model selection for semantic functions."""
 from __future__ import annotations
 
-from typing import Optional, Union
+from typing import Literal, Optional, Union
 
 from pydantic import BaseModel
 
 from fenic.core._logical_plan.resolved_types import ResolvedModelAlias
 
+ParsingEngine = Literal["mistral-ocr", "pdf-text", "native"]
 
 class ModelAlias(BaseModel):
     """A combination of a model name and a required profile for that model.
