
Commit b7be090

feat: Add pdf_parsing to openrouter
1 parent: e3f58cd

11 files changed: +143 additions, -40 deletions

.github/workflows/test.yaml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ jobs:
     env:
       GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
     strategy:
       matrix: ${{ fromJson(needs.matrix.outputs.matrix) }}
     steps:

src/fenic/_inference/google/gemini_token_counter.py

Lines changed: 4 additions & 4 deletions
@@ -40,7 +40,7 @@ def __init__(self, model_name: str, fallback_encoding: str = "gemini-2.5-flash")
         except ValueError:
             self.google_tokenizer = LocalTokenizer(model_name=fallback_encoding)

-    def count_tokens(self, messages: Tokenizable) -> int:
+    def count_tokens(self, messages: Tokenizable, ignore_file: bool = False) -> int:
         """Count tokens for a string, message list, or `LMRequestMessages`.

         Args:
@@ -53,7 +53,7 @@ def count_tokens(self, messages: Tokenizable) -> int:
         if isinstance(messages, str):
             return self._count_text_tokens(messages)
         elif isinstance(messages, LMRequestMessages):
-            return self._count_request_tokens(messages)
+            return self._count_request_tokens(messages, ignore_file)

     def count_file_input_tokens(self, messages: LMRequestMessages) -> int:
         # Gemini 2.0 charges 258 tokens per page for all PDF inputs. For more detail, see https://gemini-api.apidog.io/doc-965859#technical-details
@@ -68,7 +68,7 @@ def count_file_output_tokens(self, messages: LMRequestMessages) -> int:
         # In our estimates we add buffer, both for markdown structure and in case we ask the model to describe images.
         return self.google_tokenizer.count_tokens(text).total_tokens

-    def _count_request_tokens(self, messages: LMRequestMessages) -> int:
+    def _count_request_tokens(self, messages: LMRequestMessages, ignore_file: bool = False) -> int:
         """Count tokens for an `LMRequestMessages` object."""
         contents = convert_text_messages(messages)
         tokens = 0
@@ -79,7 +79,7 @@ def _count_request_tokens(self, messages: LMRequestMessages) -> int:
             ).total_tokens
             tokens += count_tokens

-        if messages.user_file:
+        if messages.user_file and not ignore_file:
             tokens += self.count_file_input_tokens(messages)
         return tokens
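A quick way to read this change: the new `ignore_file` flag lets callers exclude the per-page PDF estimate referenced in `count_file_input_tokens` (258 tokens per page for Gemini 2.0) from the prompt-text count. A minimal sketch, assuming a 25-page PDF is attached to an `LMRequestMessages` instance named `messages` (the message object and page count are hypothetical):

from fenic._inference.google.gemini_token_counter import GeminiLocalTokenCounter

counter = GeminiLocalTokenCounter(model_name="gemini-2.5-flash")
prompt_tokens = counter.count_tokens(messages, ignore_file=True)  # text-only portion of the request
file_tokens = counter.count_file_input_tokens(messages)           # ~258 tokens/page, e.g. 25 * 258 = 6450
total_tokens = counter.count_tokens(messages)                      # equals prompt_tokens + file_tokens for this counter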

src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py

Lines changed: 48 additions & 7 deletions
@@ -10,6 +10,7 @@

 from fenic._inference.common_openai.openai_utils import convert_messages
 from fenic._inference.common_openai.utils import handle_openai_compatible_response
+from fenic._inference.google.gemini_token_counter import GeminiLocalTokenCounter
 from fenic._inference.model_client import (
     FatalException,
     ModelClient,
@@ -87,17 +88,25 @@ def __init__(
         self._aio_client = OpenRouterModelProvider().aio_client
         self._metrics = LMMetrics()

+        self._google_token_counter = None
+        provider_and_model = model.split("/")
+        if provider_and_model[0] == "google":
+            self._google_token_counter = GeminiLocalTokenCounter(model_name=provider_and_model[1])
+
     async def make_single_request(
         self, request: FenicCompletionsRequest
     ) -> Union[None, FenicCompletionsResponse, TransientException, FatalException]:
         profile = self._profile_manager.get_profile_by_name(request.model_profile)
         common_params = {
             "model": self.model,
             "messages": convert_messages(request.messages),
-            "max_completion_tokens": self._get_max_output_token_request_limit(request),
             "n": 1,
         }

+        max_completion_tokens = self._get_max_output_token_request_limit(request)
+        if max_completion_tokens is not None:
+            common_params["max_completion_tokens"] = max_completion_tokens
+
         if request.top_logprobs:
             common_params.update(
                 {"logprobs": True, "top_logprobs": request.top_logprobs}
@@ -238,8 +247,8 @@ def estimate_tokens_for_request(
         self, request: FenicCompletionsRequest
     ) -> TokenEstimate:
         return TokenEstimate(
-            input_tokens=self.token_counter.count_tokens(request.messages),
-            output_tokens=self.token_counter.count_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request),
+            input_tokens=self._estimate_input_tokens(request),
+            output_tokens=self._estimate_output_tokens(request),
         )

     def reset_metrics(self):
@@ -248,16 +257,48 @@ def reset_metrics(self):
     def get_metrics(self) -> LMMetrics:
         return self._metrics

-    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
-        """Get the upper limit of output tokens for a request.
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of output tokens for a request."""
+        base_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            if self._google_token_counter:
+                base_tokens += self._google_token_counter.count_file_output_tokens(messages=request.messages)
+            else:
+                base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
+        return base_tokens + self._get_expected_additional_reasoning_tokens(request)

-        If max_completion_tokens is not set, don't apply a limit and return None.
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
+        """Get the upper limit of output tokens for a request.

-        Include the thinking token budget with a safety margin."""
+        Returns None if max_completion_tokens is not provided (no limit should be set).
+        If max_completion_tokens is provided, includes the thinking token budget with a safety margin."""
         if request.max_completion_tokens is None:
             return None
         return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)

+    def _estimate_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens for a request."""
+        if self._google_token_counter:
+            input_tokens = self._google_token_counter.count_tokens(request.messages, ignore_file=True)
+        else:
+            input_tokens = self.token_counter.count_tokens(request.messages)
+        if request.messages.user_file:
+            input_tokens += self._estimate_file_input_tokens(request)
+        return input_tokens
+
+    def _estimate_file_input_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate the number of input tokens from a file in a request."""
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        if profile_config.parsing_engine and profile_config.parsing_engine == "native":
+            if self._google_token_counter:
+                return self._google_token_counter.count_file_input_tokens(messages=request.messages)
+            else:
+                return self.token_counter.count_file_input_tokens(messages=request.messages)
+        # OpenRouter's engine tool processes the file first and passes annotated text to the model.
+        # We can estimate by extracting the text and tokenizing it (which is what count_file_output_tokens does)
+        return self.token_counter.count_file_output_tokens(messages=request.messages)
+
 # This is a slightly less conservative estimate than the OpenRouter documentation on how reasoning_effort is used to
 # generate a reasoning.max_tokens for models that only support reasoning.max_tokens.
 # These percentages are slightly lower, since our use-cases generally require fewer reasoning tokens.
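Putting the pieces together, input estimation now branches on both the model family and the profile's parsing engine. A condensed sketch of the routing implemented by the methods above (illustrative only; `client` and `request` stand in for the batch client and a FenicCompletionsRequest):

def sketch_input_estimate(client, request):
    # Prompt text first, excluding any attached file.
    if client._google_token_counter:
        tokens = client._google_token_counter.count_tokens(request.messages, ignore_file=True)
    else:
        tokens = client.token_counter.count_tokens(request.messages)

    if request.messages.user_file:
        profile = client._profile_manager.get_profile_by_name(request.model_profile)
        if profile.parsing_engine == "native":
            # The model ingests the PDF directly: per-page estimate for Gemini, extension-based otherwise.
            counter = client._google_token_counter or client.token_counter
            tokens += counter.count_file_input_tokens(messages=request.messages)
        else:
            # "mistral-ocr" / "pdf-text" (or unset): OpenRouter parses the PDF and forwards annotated
            # text, so tokenizing the extracted text is the closer estimate.
            tokens += client.token_counter.count_file_output_tokens(messages=request.messages)
    return tokens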

src/fenic/_inference/openrouter/openrouter_profile_manager.py

Lines changed: 18 additions & 0 deletions
@@ -20,6 +20,7 @@
     ResolvedOpenRouterProviderRouting,
 )
 from fenic.core.types.provider_routing import StructuredOutputStrategy
+from fenic.core.types.semantic import ParsingEngine


 @dataclass
@@ -32,6 +33,7 @@ class OpenRouterCompletionProfileConfiguration(BaseProfileConfiguration):
     models: Optional[list[str]] = None
     provider: Optional[ResolvedOpenRouterProviderRouting] = None
     structured_output_strategy: Optional[StructuredOutputStrategy] = None
+    parsing_engine: Optional[ParsingEngine] = None

     @property
     def extra_body(self) -> dict[str, Any]:
@@ -70,6 +72,14 @@ def extra_body(self) -> dict[str, Any]:
             if reasoning_obj:
                 reasoning_obj["exclude"] = True
             params["reasoning"] = reasoning_obj
+        params["plugins"] = [
+            {
+                "id": "file-parser",
+                "pdf": {
+                    "engine": self.parsing_engine or "native"
+                }
+            }
+        ]
         return params
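With this change, every OpenRouter request built from a profile carries a file-parser plugin. For a profile with `parsing_engine="mistral-ocr"`, the relevant fragment of the resulting `extra_body` looks roughly like this (illustrative; the `reasoning` entry is omitted):

{
    "plugins": [
        {
            "id": "file-parser",
            "pdf": {"engine": "mistral-ocr"}
        }
    ]
}

When `parsing_engine` is left unset, the plugin is still attached with "engine": "native", which asks OpenRouter to hand the PDF to the model's own file processing.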

@@ -116,13 +126,21 @@ def _process_profile(
             and profile.reasoning_max_tokens is None
         ):
             profile.reasoning_effort = "low"
+
+        pdf_page_processing_cost = None
+        if profile.parsing_engine and profile.parsing_engine == "mistral-ocr":
+            pdf_page_processing_cost = 2 / 1000
+        elif profile.parsing_engine and profile.parsing_engine == "pdf-text":
+            pdf_page_processing_cost = 0

         return OpenRouterCompletionProfileConfiguration(
             models=profile.models,
             provider=profile.provider,
             reasoning_effort=profile.reasoning_effort,
             reasoning_max_tokens=profile.reasoning_max_tokens,
             structured_output_strategy=profile.structured_output_strategy,
+            parsing_engine=profile.parsing_engine,
+            pdf_page_processing_cost=pdf_page_processing_cost,
         )

     def get_default_profile(self) -> OpenRouterCompletionProfileConfiguration:
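The 2 / 1000 figure corresponds to OpenRouter's documented mistral-ocr pricing of $2 per 1,000 parsed pages, while pdf-text and native parsing add no per-page charge. A back-of-the-envelope check with a hypothetical document size:

pages = 50
parsing_cost_usd = pages * (2 / 1000)  # 50 pages * $0.002/page = $0.10 on top of token costs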

src/fenic/_inference/openrouter/openrouter_provider.py

Lines changed: 9 additions & 0 deletions
@@ -57,6 +57,14 @@ def client(self):
             base_url=OPENROUTER_BASE_URL,
         )

+    @cached_property
+    def chat_url(self) -> str:
+        return OPENROUTER_BASE_URL + "/chat/completions"
+
+    @cached_property
+    def headers(self) -> Dict[str, str]:
+        return self._headers
+
     @cached_property
     def aio_client(self):
         """Return an Async OpenAI SDK client configured for OpenRouter."""
@@ -123,6 +131,7 @@ def _translate_model(
             supports_reasoning=supports_reasoning,
             supports_custom_temperature=supports_custom_temperature,
             supports_verbosity=supports_verbosity,
+            supports_pdf_parsing=True,  # Even if the model doesn't support pdf file processing, OpenRouter can use its separate processing engines
             supported_parameters=supported_params,
         )

src/fenic/_inference/token_counter.py

Lines changed: 5 additions & 5 deletions
@@ -10,7 +10,7 @@
 Tokenizable = Union[str | LMRequestMessages]

 class TokenCounter(Protocol):
-    def count_tokens(self, messages: Tokenizable) -> int: ...
+    def count_tokens(self, messages: Tokenizable, ignore_file: bool = False) -> int: ...
     def count_file_input_tokens(self, messages: LMRequestMessages) -> int: ...
     def count_file_output_tokens(self, messages: LMRequestMessages) -> int: ...

@@ -22,11 +22,11 @@ def __init__(self, model_name: str, fallback_encoding: str = "o200k_base"):
         except KeyError:
             self.tokenizer = tiktoken.get_encoding(fallback_encoding)

-    def count_tokens(self, messages: Tokenizable) -> int:
+    def count_tokens(self, messages: Tokenizable, ignore_file: bool = False) -> int:
         if isinstance(messages, str):
             return len(self.tokenizer.encode(messages))
         elif isinstance(messages, LMRequestMessages):
-            return self._count_message_tokens(messages)
+            return self._count_message_tokens(messages, ignore_file)
         else:
             raise TypeError(f"Expected str or LMRequestMessages, got {type(messages)}")

@@ -55,7 +55,7 @@ def count_file_output_tokens(self, messages: LMRequestMessages) -> int:
         else:
             raise InternalError(f"File{messages.user_file.path}'s extension is not supported for llm completions.")

-    def _count_message_tokens(self, messages: LMRequestMessages) -> int:
+    def _count_message_tokens(self, messages: LMRequestMessages, ignore_file: bool = False) -> int:
         num_tokens = 0
         message_count = 2  # system message and user parent message
         num_tokens += self.count_tokens(messages.system)
@@ -66,7 +66,7 @@ def _count_message_tokens(self, messages: LMRequestMessages) -> int:
             num_tokens += self.count_tokens(example.user)
             num_tokens += self.count_tokens(example.assistant)
             message_count += 2
-        if messages.user_file:
+        if messages.user_file and not ignore_file:
             num_tokens += self.count_file_input_tokens(messages)
             message_count += 1
         num_tokens += message_count * PREFIX_TOKENS_PER_MESSAGE
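Because both counters implement the widened `TokenCounter` protocol, callers can split an estimate into prompt and file components without caring which implementation they hold. A property sketch, assuming `messages` is an `LMRequestMessages` with `user_file` set:

def split_estimate(counter: TokenCounter, messages: LMRequestMessages) -> tuple[int, int]:
    prompt_only = counter.count_tokens(messages, ignore_file=True)
    file_part = counter.count_file_input_tokens(messages) if messages.user_file else 0
    # counter.count_tokens(messages) ~= prompt_only + file_part
    # (the tiktoken-based counter also adds one per-message prefix when the file message is counted)
    return prompt_only, file_part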

src/fenic/api/session/config.py

Lines changed: 8 additions & 0 deletions
@@ -644,6 +644,7 @@ class Profile(BaseModel):
         ge=1024,
     )

+ParsingEngine = Literal["mistral-ocr", "pdf-text", "native"]

 class OpenRouterLanguageModel(BaseModel):
     """Configuration for OpenRouter language models.
@@ -788,6 +789,8 @@ class Profile(BaseModel):
             If the model does support reasoning, but not `reasoning_max_tokens`, a `reasoning_effort` will be automatically
             calculated based on `reasoning_max_tokens` as a percentage of the model's maximum output size
             ([OpenRouter Documentation](https://openrouter.ai/docs/use-cases/reasoning-tokens#max-tokens-for-reasoning))
+            parsing_engine: The parsing engine to use for processing PDF files. By default, the model's native parsing engine will be used. If the model doesn't support PDF processing and the parsing engine is not provided, an error will be raised. Note: 'mistral-ocr' incurs additional costs.
+            ([OpenRouter Documentation](https://openrouter.ai/docs/features/multimodal/pdfs))
         """
         model_config = ConfigDict(extra="forbid")

@@ -807,6 +810,10 @@ class Profile(BaseModel):
         provider: Optional[OpenRouterLanguageModel.Provider] = Field(
             default=None, description="Provider routing configuration"
         )
+        parsing_engine: Optional[ParsingEngine] = Field(
+            default=None,
+            description="The parsing engine to use for processing PDF files. By default, the model's native parsing engine will be used."
+        )


 CohereEmbeddingTaskType = Literal[
@@ -1341,6 +1348,7 @@ def resolve_model(model: ModelConfig) -> ResolvedModelConfig:
                     reasoning_effort=profile.reasoning_effort,
                     reasoning_max_tokens=profile.reasoning_max_tokens,
                     models=profile.models,
+                    parsing_engine=profile.parsing_engine,
                     provider=(
                         ResolvedOpenRouterProviderRouting(
                             **(profile.provider.model_dump())
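End to end, enabling OpenRouter PDF parsing is a one-line profile setting. A configuration sketch: only `parsing_engine` and the `Profile` class come from this diff; surrounding field names such as `model_name`, `profiles`, and `default_profile` are assumptions inferred from the resolved config and may differ slightly in the released API.

from fenic.api.session.config import OpenRouterLanguageModel

model = OpenRouterLanguageModel(
    model_name="google/gemini-2.5-flash",
    profiles={
        "ocr": OpenRouterLanguageModel.Profile(parsing_engine="mistral-ocr"),    # highest fidelity, billed per page
        "text_only": OpenRouterLanguageModel.Profile(parsing_engine="pdf-text"), # free text extraction, no OCR
    },
    default_profile="ocr",
)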

src/fenic/core/_inference/model_catalog.py

Lines changed: 4 additions & 3 deletions
@@ -32,7 +32,6 @@ def __init__(
         self.cached_input_token_write_cost = cached_input_token_write_cost
         self.output_token_cost = output_token_cost

-
 class CompletionModelParameters:
     """Parameters for completion models including costs and context window size.

@@ -42,6 +41,8 @@ class CompletionModelParameters:
         cached_input_token_write_cost: Cost per cached input token write in USD
         output_token_cost: Cost per output token in USD
         context_window_length: Maximum number of tokens in the context window
+        tiered_token_costs: Tiered token costs for the model.
+        pdf_processing_engine_cost_per_page: Cost per page for PDF processing - for model configs that use a separate processing engine for PDF parsing.
         max_output_tokens: Maximum number of tokens the model can generate in a single request.
         max_temperature: Maximum temperature for the model.
         supports_profiles: Whether the model supports parameter profiles.
@@ -62,6 +63,7 @@ def __init__(
         cached_input_token_write_cost: float = 0.0,
         cached_input_token_read_cost: float = 0.0,
         tiered_token_costs: Optional[Dict[int, TieredTokenCost]] = None,
+        pdf_processing_engine_cost_per_page: Optional[float] = None,
         supports_profiles=True,
         supports_reasoning=False,
         supports_minimal_reasoning=False,
@@ -78,6 +80,7 @@ def __init__(
         self.context_window_length = context_window_length
         self.has_tiered_input_token_costs = tiered_token_costs is not None
         self.tiered_input_token_costs = tiered_token_costs
+        self.pdf_processing_engine_cost_per_page = pdf_processing_engine_cost_per_page
         self.max_output_tokens = max_output_tokens
         self.max_temperature = max_temperature
         self.supports_profiles = supports_profiles
@@ -897,7 +900,6 @@ def _initialize_google_gla_models(self):
                 context_window_length=1_048_576,
                 max_output_tokens=8_192,
                 max_temperature=2.0,
-                supports_profiles=False,
                 supports_pdf_parsing=True,
             ),
             snapshots=["gemini-2.0-flash-lite-001"],
@@ -914,7 +916,6 @@ def _initialize_google_gla_models(self):
                 context_window_length=1_048_576,
                 max_output_tokens=8_192,
                 max_temperature=2.0,
-                supports_profiles=False,
                 supports_pdf_parsing=True,
             ),
             snapshots=["gemini-2.0-flash-001", "gemini-2.0-flash-exp"],

src/fenic/core/_resolved_session_config.py

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,7 @@
     ProviderSort,
     StructuredOutputStrategy,
 )
+from fenic.core.types.semantic import ParsingEngine

 ReasoningEffort = Literal["minimal", "low", "medium", "high"]
 Verbosity = Literal["low", "medium", "high"]
@@ -79,6 +80,7 @@ class ResolvedOpenRouterModelProfile:
     models: Optional[list[str]] = None
     provider: Optional[ResolvedOpenRouterProviderRouting] = None
     structured_output_strategy: Optional[StructuredOutputStrategy] = None
+    parsing_engine: Optional[ParsingEngine] = None


 @dataclass
@@ -128,6 +130,7 @@ class ResolvedOpenRouterModelConfig:
     profiles: Optional[dict[str, ResolvedOpenRouterModelProfile]] = None
     model_provider: ModelProvider = ModelProvider.OPENROUTER
     default_profile: Optional[str] = None
+    parsing_engine: Optional[ParsingEngine] = None


 ResolvedModelConfig = Union[

src/fenic/core/types/semantic.py

Lines changed: 2 additions & 1 deletion
@@ -1,12 +1,13 @@
 """Types used to configure model selection for semantic functions."""
 from __future__ import annotations

-from typing import Optional, Union
+from typing import Literal, Optional, Union

 from pydantic import BaseModel

 from fenic.core._logical_plan.resolved_types import ResolvedModelAlias

+ParsingEngine = Literal["mistral-ocr", "pdf-text", "native"]

 class ModelAlias(BaseModel):
     """A combination of a model name and a required profile for that model.
