diff --git a/src/fenic/_backends/local/semantic_operators/parse_pdf.py b/src/fenic/_backends/local/semantic_operators/parse_pdf.py
index 92ace8991..0442b95a0 100644
--- a/src/fenic/_backends/local/semantic_operators/parse_pdf.py
+++ b/src/fenic/_backends/local/semantic_operators/parse_pdf.py
@@ -48,11 +48,13 @@ def __init__(
         page_separator: Optional[str] = None,
         describe_images: bool = False,
         model_alias: Optional[ResolvedModelAlias] = None,
+        max_output_tokens: Optional[int] = None,
     ):
         self.page_separator = page_separator
         self.describe_images = describe_images
         self.model = model
         self.model_alias = model_alias
+        self.max_output_tokens = max_output_tokens
 
         DocFolderLoader.check_file_extensions(input.to_list(), "pdf")
 
@@ -62,7 +64,7 @@ def __init__(
                 model=model,
                 operator_name="semantic.parse_pdf",
                 inference_config=InferenceConfiguration(
-                    max_output_tokens=None,
+                    max_output_tokens=max_output_tokens,
                     temperature=1.0,  # Use a higher temperature so gemini flash models can handle complex table formatting.  For more info see the conversation here: https://discuss.ai.google.dev/t/gemini-2-0-flash-has-a-weird-bug/65119/26
                     model_profile=model_alias.profile if model_alias else None,
                 ),
diff --git a/src/fenic/_backends/local/transpiler/expr_converter.py b/src/fenic/_backends/local/transpiler/expr_converter.py
index 05f8f4294..30af8bb78 100644
--- a/src/fenic/_backends/local/transpiler/expr_converter.py
+++ b/src/fenic/_backends/local/transpiler/expr_converter.py
@@ -712,8 +712,9 @@ def parse_pdf_fn(batch: pl.Series) -> pl.Series:
                 page_separator=logical.page_separator,
                 describe_images=logical.describe_images,
                 model_alias=logical.model_alias,
+                max_output_tokens=logical.max_output_tokens,
             ).execute()
-        
+
         return self._convert_expr(logical.expr).map_batches(
             parse_pdf_fn, return_dtype=pl.Utf8
         )
diff --git a/src/fenic/_inference/anthropic/anthropic_batch_chat_completions_client.py b/src/fenic/_inference/anthropic/anthropic_batch_chat_completions_client.py
index 9137f0eb0..594b2eef6 100644
--- a/src/fenic/_inference/anthropic/anthropic_batch_chat_completions_client.py
+++ b/src/fenic/_inference/anthropic/anthropic_batch_chat_completions_client.py
@@ -275,7 +275,7 @@ def _estimate_structured_output_overhead(self, response_format) -> int:
         """
         return self.estimate_response_format_tokens(response_format)
 
-    def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> int:
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
         """Get maximum output tokens including thinking budget.
 
         Args:
@@ -329,7 +329,7 @@ def estimate_tokens_for_request(self, request: FenicCompletionsRequest):
         input_tokens += self._count_auxiliary_input_tokens(request)
         
         # Estimate output tokens
-        output_tokens = self._get_max_output_tokens(request)
+        output_tokens = self._get_max_output_token_request_limit(request)
         
         return TokenEstimate(
             input_tokens=input_tokens,
diff --git a/src/fenic/_inference/cohere/cohere_batch_embeddings_client.py b/src/fenic/_inference/cohere/cohere_batch_embeddings_client.py
index 723fd73d0..977fdb28d 100644
--- a/src/fenic/_inference/cohere/cohere_batch_embeddings_client.py
+++ b/src/fenic/_inference/cohere/cohere_batch_embeddings_client.py
@@ -171,7 +171,7 @@ def estimate_tokens_for_request(self, request: FenicEmbeddingsRequest) -> TokenE
             output_tokens=0
         )
 
-    def _get_max_output_tokens(self, request: FenicEmbeddingsRequest) -> int:
+    def _get_max_output_token_request_limit(self, request: FenicEmbeddingsRequest) -> int:
         """Get maximum output tokens (always 0 for embeddings).
         
         Returns:
diff --git a/src/fenic/_inference/common_openai/openai_chat_completions_core.py b/src/fenic/_inference/common_openai/openai_chat_completions_core.py
index e907f9e0c..393fbd671 100644
--- a/src/fenic/_inference/common_openai/openai_chat_completions_core.py
+++ b/src/fenic/_inference/common_openai/openai_chat_completions_core.py
@@ -90,9 +90,13 @@ async def make_single_request(
             common_params: dict[str, Any] = {
                 "model": self._model,
                 "messages": convert_messages(request.messages),
-                "max_completion_tokens": request.max_completion_tokens + profile_configuration.expected_additional_reasoning_tokens,
                 "n": 1,
             }
+
+            max_completion_tokens = self.get_max_output_token_request_limit(request, profile_configuration)
+            if max_completion_tokens is not None:
+                common_params["max_completion_tokens"] = max_completion_tokens
+
             if request.temperature:
                 common_params.update({"temperature": request.temperature})
 
@@ -213,3 +217,13 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
             A unique key for the request
         """
         return generate_completion_request_key(request)
+
+    def get_max_output_token_request_limit(self, request: FenicCompletionsRequest, profile_config:OpenAICompletionProfileConfiguration) -> Optional[int]:
+        """Compute the output-token cap to send with a request.
+
+        Yields None when the request has no max_completion_tokens, so the caller sends no cap.
+        Otherwise the cap is max_completion_tokens plus the profile's expected_additional_reasoning_tokens.
+        """
+        requested_tokens = request.max_completion_tokens
+        # No explicit request limit means no cap at all, so the profile is not consulted.
+        return None if requested_tokens is None else requested_tokens + profile_config.expected_additional_reasoning_tokens
diff --git a/src/fenic/_inference/google/gemini_batch_embeddings_client.py b/src/fenic/_inference/google/gemini_batch_embeddings_client.py
index 6ab27a063..0be07bd18 100644
--- a/src/fenic/_inference/google/gemini_batch_embeddings_client.py
+++ b/src/fenic/_inference/google/gemini_batch_embeddings_client.py
@@ -121,7 +121,7 @@ def estimate_tokens_for_request(self, request: FenicEmbeddingsRequest) -> TokenE
             input_tokens=self.token_counter.count_tokens(request.doc), output_tokens=0
         )
 
-    def _get_max_output_tokens(self, request: FenicEmbeddingsRequest) -> int:
+    def _get_max_output_token_request_limit(self, request: FenicEmbeddingsRequest) -> int:
         return 0
 
     def reset_metrics(self):
diff --git a/src/fenic/_inference/google/gemini_native_chat_completions_client.py b/src/fenic/_inference/google/gemini_native_chat_completions_client.py
index af8eb39fc..5748ecb49 100644
--- a/src/fenic/_inference/google/gemini_native_chat_completions_client.py
+++ b/src/fenic/_inference/google/gemini_native_chat_completions_client.py
@@ -132,56 +132,6 @@ def count_tokens(self, messages: Tokenizable) -> int:  # type: ignore[override]
         # Re-expose for mypy – same implementation as parent.
         return super().count_tokens(messages)
 
-    def _estimate_structured_output_overhead(self, response_format: ResolvedResponseFormat) -> int:
-        """Use Google-specific response schema token estimation.
-
-        Args:
-            response_format: Pydantic model class defining the response format
-
-        Returns:
-            Estimated token overhead for structured output
-        """
-        return self._estimate_response_schema_tokens(response_format)
-
-    def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> Optional[int]:
-        """Get maximum output tokens including thinking budget.
-
-        If max_completion_tokens is not set, return None.
-
-        Conservative estimate that includes both completion tokens and
-        thinking token budget with a safety margin.
-
-        Args:
-            request: The completion request
-
-        Returns:
-            Maximum output tokens (completion + thinking budget with safety margin)
-        """
-        if request.max_completion_tokens is None:
-            return None
-        profile_config = self._profile_manager.get_profile_by_name(
-            request.model_profile
-        )
-        return request.max_completion_tokens + int(
-            1.5 * profile_config.thinking_token_budget
-        )
-
-    @cache  # noqa: B019 – builtin cache OK here.
-    def _estimate_response_schema_tokens(self, response_format: ResolvedResponseFormat) -> int:
-        """Estimate token count for a response format schema.
-
-        Uses Google's tokenizer to count tokens in a JSON schema representation
-        of the response format. Results are cached for performance.
-
-        Args:
-            response_format: Pydantic model class defining the response format
-
-        Returns:
-            Estimated token count for the response format
-        """
-        schema_str = response_format.schema_fingerprint
-        return self._token_counter.count_tokens(schema_str)
-
     def get_request_key(self, request: FenicCompletionsRequest) -> str:
         """Generate a unique key for the request.
 
@@ -196,19 +146,17 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
     def estimate_tokens_for_request(self, request: FenicCompletionsRequest):
         """Estimate the number of tokens for a request.
 
+        Output tokens are max_completion_tokens when set; otherwise they are estimated from the attached file's text (0 if there is no file). Expected reasoning tokens are added in both cases.
+
         Args:
             request: The request to estimate tokens for
 
         Returns:
             TokenEstimate: The estimated token usage
         """
-
-        # Count input tokens
         input_tokens = self.count_tokens(request.messages)
         input_tokens += self._count_auxiliary_input_tokens(request)
-
-        output_tokens = self._get_max_output_tokens(request) or self._model_parameters.max_output_tokens
-
+        output_tokens = self._estimate_output_tokens(request)
         return TokenEstimate(input_tokens=input_tokens, output_tokens=output_tokens)
 
     async def make_single_request(
@@ -228,16 +176,17 @@ async def make_single_request(
         """
 
         profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
-        max_output_tokens = self._get_max_output_tokens(request)
-
         generation_config: GenerateContentConfigDict = {
             "temperature": request.temperature,
             "response_logprobs": request.top_logprobs is not None,
             "logprobs": request.top_logprobs,
             "system_instruction": request.messages.system,
         }
+
+        max_output_tokens = self._get_max_output_token_request_limit(request)
         if max_output_tokens is not None:
             generation_config["max_output_tokens"] = max_output_tokens
+
         generation_config.update(profile_config.additional_generation_config)
         if request.structured_output is not None:
             generation_config.update(
@@ -355,3 +304,54 @@ async def make_single_request(
         finally:
             if file_obj:
                 await delete_file(self._client, file_obj.name)
+
+    @cache  # noqa: B019 – builtin cache OK here.
+    def _estimate_response_schema_tokens(self, response_format: ResolvedResponseFormat) -> int:
+        """Estimate the token count of a response format's schema.
+
+        Counts the tokens of `response_format.schema_fingerprint` with this client's
+        `_token_counter`. Results are memoized by the `cache` decorator.
+
+        Args:
+            response_format: Resolved response format whose `schema_fingerprint` is tokenized
+
+        Returns:
+            Token count reported by the token counter for the schema fingerprint
+        """
+        schema_str = response_format.schema_fingerprint
+        return self._token_counter.count_tokens(schema_str)
+
+    def _estimate_structured_output_overhead(self, response_format: ResolvedResponseFormat) -> int:
+        """Estimate structured-output token overhead by delegating to `_estimate_response_schema_tokens`.
+
+        Args:
+            response_format: Resolved response format passed through unchanged
+
+        Returns:
+            The value `_estimate_response_schema_tokens` returns for this response format
+        """
+        return self._estimate_response_schema_tokens(response_format)
+
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate output tokens: the requested cap (or a file-based estimate when none is set) plus reasoning tokens."""
+        # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+        uncapped_file_request = request.max_completion_tokens is None and request.messages.user_file
+        if uncapped_file_request:
+            return self.token_counter.count_file_output_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request)
+        return (request.max_completion_tokens or 0) + self._get_expected_additional_reasoning_tokens(request)
+
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
+        """Compute the output-token cap to place on a request.
+
+        Without max_completion_tokens there is no cap and None is returned. With it, the cap is
+        max_completion_tokens plus the value of _get_expected_additional_reasoning_tokens."""
+        completion_cap = request.max_completion_tokens
+        # Skip the reasoning lookup entirely when the request is uncapped.
+        return None if completion_cap is None else completion_cap + self._get_expected_additional_reasoning_tokens(request)
+
+    def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Return the profile's thinking token budget scaled by 1.5 and truncated to an int."""
+        # The 1.5 multiplier is the safety margin on top of the configured budget.
+        thinking_budget = self._profile_manager.get_profile_by_name(request.model_profile).thinking_token_budget
+        padded_budget = 1.5 * thinking_budget
+        return int(padded_budget)
diff --git a/src/fenic/_inference/language_model.py b/src/fenic/_inference/language_model.py
index fe90614ed..7d49b3af4 100644
--- a/src/fenic/_inference/language_model.py
+++ b/src/fenic/_inference/language_model.py
@@ -22,7 +22,7 @@
 
 @dataclass
 class InferenceConfiguration:
-    # If max_output_tokens is not provided, do not include it in the request.
+    # If max_output_tokens is not provided, no output limit is set on the request; clients estimate output tokens separately for rate limiting.
     max_output_tokens: Optional[int]
     temperature: float
     top_logprobs: Optional[int] = None
diff --git a/src/fenic/_inference/model_client.py b/src/fenic/_inference/model_client.py
index 74fbd9c54..f65835ef1 100644
--- a/src/fenic/_inference/model_client.py
+++ b/src/fenic/_inference/model_client.py
@@ -245,8 +245,8 @@ def _estimate_structured_output_overhead(self, response_format: ResolvedResponse
 
 
     @abstractmethod
-    def _get_max_output_tokens(self, request: RequestT) -> int:
-        """Get conservative output token estimate. Override in subclasses for provider-specific logic."""
+    def _get_max_output_token_request_limit(self, request: RequestT) -> int | None:
+        """Get the upper limit of output tokens to set on a request, or None when no limit should be set."""
         pass
 
     #
diff --git a/src/fenic/_inference/openai/openai_batch_chat_completions_client.py b/src/fenic/_inference/openai/openai_batch_chat_completions_client.py
index b4807c1a3..9a6684d25 100644
--- a/src/fenic/_inference/openai/openai_batch_chat_completions_client.py
+++ b/src/fenic/_inference/openai/openai_batch_chat_completions_client.py
@@ -65,6 +65,7 @@ def __init__(
             profile_configurations=profiles,
             default_profile_name=default_profile_name,
         )
+
         self._core = OpenAIChatCompletionsCore(
             model=model,
             model_provider=ModelProvider.OPENAI,
@@ -108,7 +109,7 @@ def estimate_tokens_for_request(self, request: FenicCompletionsRequest) -> Token
         """
         return TokenEstimate(
             input_tokens=self.token_counter.count_tokens(request.messages),
-            output_tokens=self._get_max_output_tokens(request)
+            output_tokens=self._estimate_output_tokens(request)
         )
 
     def reset_metrics(self):
@@ -123,10 +124,21 @@ def get_metrics(self) -> LMMetrics:
         """
         return self._core.get_metrics()
 
-    def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> int:
-        """Conservative estimate: max_completion_tokens + reasoning effort-based thinking tokens."""
-        base_tokens = request.max_completion_tokens
-
-        # Get profile-specific reasoning effort
+    def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
+        """Estimate output tokens: max_completion_tokens when set, else a file-based estimate for file requests (0 otherwise), plus the profile's expected reasoning tokens."""
+        base_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
         profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
         return base_tokens + profile_config.expected_additional_reasoning_tokens
+
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int | None:
+        """Return the maximum output token limit for a request.
+
+        Delegates to the shared OpenAI core, passing the profile resolved from the request's model_profile.
+        Per the core's contract, the result is None when max_completion_tokens is unset (no limit is applied);
+        otherwise it is max_completion_tokens plus the profile's expected additional reasoning tokens.
+        """
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        return self._core.get_max_output_token_request_limit(request, profile_config)
diff --git a/src/fenic/_inference/openai/openai_batch_embeddings_client.py b/src/fenic/_inference/openai/openai_batch_embeddings_client.py
index d0134eefd..397a95c33 100644
--- a/src/fenic/_inference/openai/openai_batch_embeddings_client.py
+++ b/src/fenic/_inference/openai/openai_batch_embeddings_client.py
@@ -107,7 +107,7 @@ def get_metrics(self) -> RMMetrics:
         """
         return self._core.get_metrics()
 
-    def _get_max_output_tokens(self, request: RequestT) -> int:
+    def _get_max_output_token_request_limit(self, request: RequestT) -> int:
         return 0
 
     async def validate_api_key(self):
diff --git a/src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py b/src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py
index 69330b1b1..e7c4bde95 100644
--- a/src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py
+++ b/src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py
@@ -94,7 +94,7 @@ async def make_single_request(
         common_params = {
                 "model": self.model,
                 "messages": convert_messages(request.messages),
-                "max_completion_tokens": self._get_max_output_tokens(request),
+                "max_completion_tokens": self._get_max_output_token_request_limit(request),
                 "n": 1,
             }
 
@@ -239,7 +239,7 @@ def estimate_tokens_for_request(
     ) -> TokenEstimate:
         return TokenEstimate(
             input_tokens=self.token_counter.count_tokens(request.messages),
-            output_tokens=self._get_max_output_tokens(request),
+            output_tokens=self.token_counter.count_tokens(request.messages) + self._get_expected_additional_reasoning_tokens(request),
         )
 
     def reset_metrics(self):
@@ -248,7 +248,14 @@ def reset_metrics(self):
     def get_metrics(self) -> LMMetrics:
         return self._metrics
 
-    def _get_max_output_tokens(self, request: FenicCompletionsRequest) -> int:
+    def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int | None:
+        """Get the upper limit of output tokens for a request.
+
+        If max_completion_tokens is not set, don't apply a limit and return None.
+
+        Otherwise add the expected additional reasoning tokens to max_completion_tokens."""
+        if request.max_completion_tokens is None:
+            return None
         return request.max_completion_tokens + self._get_expected_additional_reasoning_tokens(request)
 
     # This is a slightly less conservative estimate than the OpenRouter documentation on how reasoning_effort is used to
diff --git a/src/fenic/_inference/rate_limit_strategy.py b/src/fenic/_inference/rate_limit_strategy.py
index 4968e405d..b313092eb 100644
--- a/src/fenic/_inference/rate_limit_strategy.py
+++ b/src/fenic/_inference/rate_limit_strategy.py
@@ -10,7 +10,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 @dataclass
 class TokenEstimate:
     input_tokens: int = 0
diff --git a/src/fenic/_inference/token_counter.py b/src/fenic/_inference/token_counter.py
index 0c26c6abd..41bad488b 100644
--- a/src/fenic/_inference/token_counter.py
+++ b/src/fenic/_inference/token_counter.py
@@ -2,14 +2,17 @@
 
 import tiktoken
 
-from fenic._constants import PREFIX_TOKENS_PER_MESSAGE, TOKENS_PER_NAME
-from fenic._inference.common_openai.openai_utils import convert_messages
+from fenic._constants import PREFIX_TOKENS_PER_MESSAGE
+from fenic._inference.request_utils import get_pdf_page_count, get_pdf_text
 from fenic._inference.types import LMRequestMessages
+from fenic.core.error import InternalError
 
 Tokenizable = Union[str | LMRequestMessages]
 
 class TokenCounter(Protocol):
     def count_tokens(self, messages: Tokenizable) -> int: ...
+    def count_file_input_tokens(self, messages: LMRequestMessages) -> int: ...
+    def count_file_output_tokens(self, messages: LMRequestMessages) -> int: ...
 
 class TiktokenTokenCounter(TokenCounter):
 
@@ -23,25 +26,50 @@ def count_tokens(self, messages: Tokenizable) -> int:
         if isinstance(messages, str):
             return len(self.tokenizer.encode(messages))
         elif isinstance(messages, LMRequestMessages):
-            return self._count_message_tokens(convert_messages(messages))
+            return self._count_message_tokens(messages)
         else:
             raise TypeError(f"Expected str or LMRequestMessages, got {type(messages)}")
 
-    def _count_message_tokens(self, messages: list[dict[str, str]]) -> int:
-        num_tokens = 0
-        for message in messages:
-            if "content" in message and isinstance(message["content"], list):
-                num_tokens += self._count_message_tokens(messages=message["content"])
-                continue
-            if "type" in message and message["type"] == "file":
-                # providers count file tokens differently, so we leave that up to the client
-                continue
-            num_tokens += PREFIX_TOKENS_PER_MESSAGE  # Every message starts with <im_start>{role/name}\n{content}<im_end>\n
-            for key, value in message.items():
-                num_tokens += len(self.tokenizer.encode(value))
-                if key == "name":
-                    num_tokens -= TOKENS_PER_NAME  # Subtract one token if the 'name' field is present
+    def count_file_input_tokens(self, messages: LMRequestMessages) -> int:
+        """Estimate input tokens for the attached file: extracted text tokens plus 1024 per PDF page; raises InternalError for non-PDF files."""
+        file_type = messages.user_file.path.split(".")[-1].lower()  # match the extension case-insensitively
+        if file_type == "pdf":
+            text = get_pdf_text(messages.user_file)
+            page_count = get_pdf_page_count(messages.user_file)
+            text_tokens = self.count_tokens(text)
+            # OpenAI documentation states that they convert PDF pages into images and ingest both text and image into their VLM.
+            # Based on experimentation, OpenAI seems to count no more than 1024 tokens per page.
+            image_tokens = page_count * 1024
+            return text_tokens + image_tokens
+        else:
+            raise InternalError(f"File {messages.user_file.path}'s extension is not supported for llm completions.")
 
-        num_tokens += 2  # Every assistant reply is primed with <im_start>assistant
+    def count_file_output_tokens(self, messages: LMRequestMessages) -> int:
+        """Estimate output tokens for the attached file as the token count of its extracted text; raises InternalError for non-PDF files."""
+        file_type = messages.user_file.path.split(".")[-1].lower()  # match the extension case-insensitively
+        if file_type == "pdf":
+            # TODO: we do this twice, once for estimating input and once for estimating output.  We can cache the text in the LMFile object.
+            text = get_pdf_text(messages.user_file)
+            # Note: image-description tokens aren't counted here (describing images defaults to False); our estimates add buffer for markdown structure and image descriptions.
+            return self.count_tokens(text)
+        else:
+            raise InternalError(f"File {messages.user_file.path}'s extension is not supported for llm completions.")
 
+    def _count_message_tokens(self, messages: LMRequestMessages) -> int:
+        """Estimate input tokens for a request: system, user, few-shot example and file tokens, plus PREFIX_TOKENS_PER_MESSAGE per message and 2 reply-priming tokens."""
+        num_tokens = 0
+        message_count = 2 # system message and user parent message
+        num_tokens += self.count_tokens(messages.system)
+        if messages.user:
+            num_tokens += self.count_tokens(messages.user)
+            message_count += 1
+        for example in messages.examples:
+            num_tokens += self.count_tokens(example.user)
+            num_tokens += self.count_tokens(example.assistant)
+            message_count += 2
+        if messages.user_file:
+            num_tokens += self.count_file_input_tokens(messages)
+            message_count += 1
+        num_tokens += message_count * PREFIX_TOKENS_PER_MESSAGE
+        num_tokens += 2  # Every assistant reply is primed with <im_start>assistant
         return num_tokens
diff --git a/src/fenic/api/functions/semantic.py b/src/fenic/api/functions/semantic.py
index 7622ebc6a..4c8dcbc6b 100644
--- a/src/fenic/api/functions/semantic.py
+++ b/src/fenic/api/functions/semantic.py
@@ -596,6 +596,7 @@ def parse_pdf(
     model_alias: Optional[Union[str, ModelAlias]] = None,
 	page_separator: Optional[str] = None,
 	describe_images: bool = False,  # for images that aren't tables
+	max_output_tokens: Optional[int] = None,
 ) -> Column:
     r"""Parses a column of PDF paths into markdown.
 
@@ -607,6 +608,7 @@ def parse_pdf(
         model_alias: Optional alias for the language model to use for the parsing. If None, will use the language model configured as the default.
         page_separator: Optional page separator to use for the parsing.  If the separator includes the {page} placeholder, the model will replace it with the current page number.
         describe_images:  Flag to describe images in the PDF. If True, the prompt will ask the model to include a description of the image in the markdown output.  If False, the prompt asks the model to ignore images that aren't tables or charts.
+        max_output_tokens: Optional maximum number of output tokens per ~3 pages of PDF (does not include reasoning tokens). If None, don't constrain the model's output.
 
     Note:
         For Gemini models, this function uses the google file API, uploading PDF files to Google's file store and deleting them after each request.
@@ -640,5 +642,6 @@ def parse_pdf(
             model_alias=resolved_model_alias,
             page_separator=page_separator,
             describe_images=describe_images,
+            max_output_tokens=max_output_tokens,
         )
     )
\ No newline at end of file
diff --git a/src/fenic/core/_inference/model_catalog.py b/src/fenic/core/_inference/model_catalog.py
index c5813ab55..67fa8b73a 100644
--- a/src/fenic/core/_inference/model_catalog.py
+++ b/src/fenic/core/_inference/model_catalog.py
@@ -504,6 +504,7 @@ def _initialize_openai_models(self):
                 max_output_tokens=16_384,
                 max_temperature=2,
                 supports_profiles=False,
+                supports_pdf_parsing=True,
             ),
             snapshots=["gpt-4o-mini-2024-07-18"],
         )
@@ -519,6 +520,7 @@ def _initialize_openai_models(self):
                 max_output_tokens=16_384,
                 max_temperature=2,
                 supports_profiles=False,
+                supports_pdf_parsing=True,
             ),
             snapshots=["gpt-4o-2024-05-13", "gpt-4o-2024-08-06", "gpt-4o-2024-11-20"],
         )
@@ -595,6 +597,7 @@ def _initialize_openai_models(self):
                 max_temperature=2.0,
                 supports_reasoning=True,
                 supports_custom_temperature=False,
+                supports_pdf_parsing=True,
             ),
         )
 
@@ -624,6 +627,7 @@ def _initialize_openai_models(self):
                 max_output_tokens=100_000,
                 max_temperature=2.0,
                 supports_reasoning=True,
+                supports_pdf_parsing=True,
             ),
         )
 
@@ -641,6 +645,7 @@ def _initialize_openai_models(self):
                 supports_minimal_reasoning=True,
                 supports_custom_temperature=False,
                 supports_verbosity=True,
+                supports_pdf_parsing=True,
             ),
             snapshots=["gpt-5-2025-08-07"],
         )
@@ -658,6 +663,7 @@ def _initialize_openai_models(self):
                 supports_minimal_reasoning=True,
                 supports_custom_temperature=False,
                 supports_verbosity=True,
+                supports_pdf_parsing=True,
             ),
             snapshots=["gpt-5-mini-2025-08-07"],
         )
@@ -675,6 +681,7 @@ def _initialize_openai_models(self):
                 supports_minimal_reasoning=True,
                 supports_verbosity=True,
                 supports_custom_temperature=False,
+                supports_pdf_parsing=True,
             ),
             snapshots=["gpt-5-nano-2025-08-07"],
         )
diff --git a/src/fenic/core/_logical_plan/expressions/semantic.py b/src/fenic/core/_logical_plan/expressions/semantic.py
index 635e9a952..260e6b2d0 100644
--- a/src/fenic/core/_logical_plan/expressions/semantic.py
+++ b/src/fenic/core/_logical_plan/expressions/semantic.py
@@ -631,11 +631,13 @@ def __init__(
         model_alias: Optional[ResolvedModelAlias] = None,
         page_separator: Optional[str] = None,
         describe_images: bool = False,
+        max_output_tokens: Optional[int] = None,
     ):
         self.expr = expr
         self.model_alias = model_alias
         self.page_separator = page_separator
         self.describe_images = describe_images
+        self.max_output_tokens = max_output_tokens
 
         # Initialize validator for composition-based type validation
         self._validator = SignatureValidator(self.function_name)
@@ -665,4 +667,5 @@ def __str__(self) -> str:
     def _eq_specific(self, other: SemanticParsePDFExpr) -> bool:
         return (self.model_alias == other.model_alias
                 and self.page_separator == other.page_separator
-                and self.describe_images == other.describe_images)
\ No newline at end of file
+                and self.describe_images == other.describe_images
+                and self.max_output_tokens == other.max_output_tokens)
\ No newline at end of file
diff --git a/tests/_backends/local/functions/test_semantic_parse_pdf.py b/tests/_backends/local/functions/test_semantic_parse_pdf.py
index af0c97144..316ccc390 100644
--- a/tests/_backends/local/functions/test_semantic_parse_pdf.py
+++ b/tests/_backends/local/functions/test_semantic_parse_pdf.py
@@ -19,12 +19,13 @@
     "Content about war and peace, examining the complex relationship between conflict and harmony throughout human history. This analysis covers major historical conflicts, their causes and consequences, as well as the various peace movements and diplomatic efforts that have shaped our world. The discussion includes philosophical perspectives on violence, justice, and the pursuit of lasting peace.",
 ]
 
+# Keep the more expensive models (o3, gemini-2.5-pro) commented out so they are not exercised by default.
 vlms_to_test = [
-    # TODO: add openai tests when adding openai parse_pdf support
-    #(OpenAILanguageModel, "gpt-5-nano"),
-    #(OpenAILanguageModel, "gpt-4o-mini"),
-    #(OpenAILanguageModel, "o3-mini"),
-    (GoogleDeveloperLanguageModel, "gemini-2.5-pro"),
+    (OpenAILanguageModel, "gpt-5-nano"),
+    (OpenAILanguageModel, "gpt-4o-mini"),
+    #(OpenAILanguageModel, "o3"),
+    (OpenAILanguageModel, "o4-mini"),
+    #(GoogleDeveloperLanguageModel, "gemini-2.5-pro"),
     (GoogleDeveloperLanguageModel, "gemini-2.0-flash-lite"),
     (GoogleDeveloperLanguageModel, "gemini-2.5-flash-lite"),
 ]
@@ -184,7 +185,7 @@ def _setup_session_with_vlm(test_model_class: BaseModel, model_name: str):
         else:
             profile = test_model_class.Profile(reasoning_effort="low")
     config = SessionConfig(
-        app_name="test_app_google",
+        app_name="test_app_parse_pdf",
         semantic=SemanticConfig(
             language_models={"vlm": test_model_class(
                 model_name=model_name,
diff --git a/tests/_inference/test_openai_token_counter.py b/tests/_inference/test_openai_token_counter.py
new file mode 100644
index 000000000..1d9e9488b
--- /dev/null
+++ b/tests/_inference/test_openai_token_counter.py
@@ -0,0 +1,75 @@
+import os
+
+import fitz
+
+from fenic._inference.token_counter import TiktokenTokenCounter
+from fenic._inference.types import FewShotExample, LMRequestFile, LMRequestMessages
+from tests.conftest import _save_pdf_file
+
+
+def test_local_token_counter_counts_tokens():
+    model = "gpt-4o-mini"
+    counter = TiktokenTokenCounter(model_name=model)
+    assert counter.count_tokens("This is a longer string of text with characters: 那只敏捷的棕色狐狸跳过了懒惰的狗") == 28
+
+    model = "gpt-4o"
+    pro_counter = TiktokenTokenCounter(model_name=model)
+    assert pro_counter.count_tokens("This is a longer string of text with characters: 那只敏捷的棕色狐狸跳过了懒惰的狗") == 28
+
+def test_local_token_counter_falls_back_to_o200k_base():
+    model = "gpt-242342"  # non-existent model
+    counter = TiktokenTokenCounter(model_name=model)
+    assert counter.count_tokens("This is a longer string of text with characters: 那只敏捷的棕色狐狸跳过了懒惰的狗") == 28
+
+def test_openai_tokenizer_counts_tokens_for_message_list():
+    model = "gpt-4o-mini"
+
+    counter = TiktokenTokenCounter(model_name=model)
+    messages = LMRequestMessages(
+        system="You are a helpful assistant.",
+        examples=[FewShotExample(user="ping", assistant="pong")],
+        user="Summarize: The quick brown fox jumps over the lazy dog.",
+    )
+    # Note: The count is tokenizer-specific, so it may differ from what the Gemini counter reports.
+    # This test pins the exact token count produced by the tiktoken-based counter for this message list.
+    token_count = counter.count_tokens(messages)
+    assert token_count == 44  # Actual token count for OpenAI tokenizer
+
+def test_openai_tokenizer_counts_tokens_for_pdfs(temp_dir_just_one_file):
+    model = "gpt-4o-mini"
+    pdf_path1 = os.path.join(temp_dir_just_one_file, "test_pdf_one_page.pdf")
+    pdf_path2 = os.path.join(temp_dir_just_one_file, "test_pdf_three_pages.pdf")
+    _save_pdf_file(pdf_path1, page_count=1, text_content="The quick brown fox jumps over the lazy dog.")
+    _save_pdf_file(pdf_path2, page_count=3, text_content="The quick brown fox jumps over the lazy dog.")
+    counter = TiktokenTokenCounter(model_name=model)
+    messages = LMRequestMessages(
+        system="You are a helpful assistant.",
+        examples=[],
+        user_file=LMRequestFile(path=pdf_path1, page_range=(0, 1)),
+    )
+    # OpenAI uses different tokenization for PDFs, including image tokens
+    # Just verify it returns a positive number and processes the PDF
+    token_count = counter.count_tokens(messages)
+    assert token_count > 0
+
+    messages = LMRequestMessages(
+        system="You are a helpful assistant.",
+        examples=[],
+        user_file=LMRequestFile(path=pdf_path2, page_range=(0, 3)),
+    )
+    # Verify that 3-page PDF has more tokens than 1-page PDF
+    token_count_3pages = counter.count_tokens(messages)
+    assert token_count_3pages > token_count
+
+    # verify token estimation for chunked PDF works
+    pdf_2 = fitz.open(pdf_path2)
+    pdf_2_chunk = fitz.open()
+    pdf_2_chunk.insert_pdf(pdf_2, from_page=1, to_page=1)
+    messages = LMRequestMessages(
+        system="You are a helpful assistant.",
+        examples=[],
+        user_file=LMRequestFile(path=pdf_path2, pdf_chunk_bytes=pdf_2_chunk.tobytes(), page_range=(1, 2)),
+    )
+    # Verify that a 1-page chunk has the same number of tokens as a 1-page PDF
+    token_count_1_page_chunk = counter.count_tokens(messages)
+    assert token_count_1_page_chunk == token_count
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 5d9840cc6..5728646e3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -297,7 +297,7 @@ def configure_language_model(model_provider: ModelProvider, model_name: str) ->
     )
     # these limits are purposely low so we don't consume our entire project limit while running multiple tests in multiple branches
     if model_provider == ModelProvider.OPENAI:
-        if model_parameters.supports_reasoning and model_parameters.supports_verbosity:
+        if model_parameters.supports_minimal_reasoning and model_parameters.supports_verbosity:
             language_model = OpenAILanguageModel(
                 model_name=model_name,
                 rpm=500,