Commit eef48d2

feat: tweak pdf parser for corner cases and add 120s demo
1 parent 3ed6592 commit eef48d2

File tree

9 files changed: 910 additions & 30 deletions

examples/fenic_in_120_seconds/18_pdf_processing.ipynb

Lines changed: 739 additions & 0 deletions
Large diffs are not rendered by default.

examples/pdf_processing/pdf_processing.ipynb

Whitespace-only changes.
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
"""Document metadata extraction example using fenic semantic operations.

This example demonstrates how to extract structured metadata from unstructured document text
using fenic's Pydantic model integration for schema definitions.
"""
import os
import shutil
from typing import List, Optional

import huggingface_hub as hf
from pydantic import BaseModel, Field

import fenic as fc

data_dir = "examples_data"


def main(config: Optional[fc.SessionConfig] = None):
    """Extract metadata from document excerpts using semantic operations."""
    # Configure session with semantic capabilities
    config = config or fc.SessionConfig(
        app_name="pdf_processing",
        semantic=fc.SemanticConfig(
            language_models={
                "parse_model": fc.GoogleDeveloperLanguageModel(
                    model_name="gemini-2.5-flash-lite",
                    rpm=500,
                    tpm=1_000_000,
                ),
                "cheap_model": fc.OpenAILanguageModel(
                    model_name="gpt-5-nano",
                    rpm=500,
                    tpm=200_000,
                ),
            },
            default_language_model="cheap_model",
        ),
    )

    # Create session
    session = fc.Session.get_or_create(config)

    # Setup: download some sample PDFs.
    # Note: this will be unnecessary once fenic's PDF loader can read directly from Hugging Face.
    repo_id = "typedef-ai/pdf_data"
    os.makedirs(data_dir, exist_ok=True)

    files = hf.list_repo_files(repo_id=repo_id, repo_type="dataset")
    for file in files:
        if file.startswith("whitepapers/"):
            hf.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=file, local_dir=data_dir)

    # Read PDF metadata and keep documents that are long enough and not encrypted
    pdf_filtered_df = session.read.pdf_metadata(f"{data_dir}/whitepapers/*.pdf").filter(
        (fc.col("page_count") > 3) & ~fc.col("is_encrypted"))

    # Use the Gemini parsing model to transcribe the PDF content into raw markdown
    pdf_to_md_content = pdf_filtered_df.with_column(
        "markdown_content",
        fc.semantic.parse_pdf(
            fc.col("file_path"), model_alias="parse_model")).cache()

    # fenic's powerful markdown processing can impose structure on the content.
    # Here we split each document into sections and generate a table of contents.
    pdf_sections_df = pdf_to_md_content.select(
        fc.when(
            fc.col("title").is_not_null(),
            fc.col("title")
        ).otherwise(
            fc.text.split_part(fc.col("file_path"), "/", -1)
        ).alias("name"),
        "markdown_content",
        # Extract sections up to level 3 headers, returning an array of section objects
        fc.markdown.extract_header_chunks(fc.col("markdown_content"), header_level=3).alias("sections"),
        fc.markdown.generate_toc(fc.col("markdown_content")).alias("toc")
    )

    pdf_sections_df.show()

    # Use the processed markdown to extract content information with a minimum of tokens
    class PDFContentCategorization(BaseModel):
        """Pydantic model for PDF content categorization."""
        summary: str = Field(description="Brief one sentence summary of the PDF given its table of contents")
        sections_about_model_training: List[str] = Field(description="List of headings that are specifically about model training")
        products_mentioned: List[str] = Field(description="All product names mentioned in the PDF table of contents")

    pdf_filtered_details = pdf_sections_df.with_column(
        "content_categorization",
        fc.semantic.extract("toc", PDFContentCategorization, model_alias="cheap_model")
    ).cache()

    # Save tokens by summarizing each whitepaper from its table of contents alone
    print("=" * 70)
    print("SUMMARY OF WHITEPAPERS")
    print("=" * 70)
    for row in pdf_filtered_details.to_pylist():
        print(f"Whitepaper: {row['name']}")
        print(f"Summary: {row['content_categorization.summary']}")
        print(f"Products mentioned: {row['content_categorization.products_mentioned']}")

    # Take a closer look at the sections that detail model training
    model_training_sections_df = pdf_filtered_details.explode("sections").filter(
        fc.col("content_categorization.sections_about_model_training").is_not_null() &
        fc.array_contains(fc.col("content_categorization.sections_about_model_training"), fc.col("sections.heading"))
    )

    print("=" * 70)
    print("FILTER DATA ABOUT MODEL TRAINING")
    print("=" * 70)
    print(f"Found {model_training_sections_df.count()} sections about model training:")
    model_training_sections_df.show()

    # Clean up the downloaded PDFs
    shutil.rmtree(data_dir)

    # Clean up the session
    session.stop()


if __name__ == "__main__":
    main()
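The main() entry point takes an optional fc.SessionConfig, so the demo can be pointed at other providers without touching the pipeline. A minimal sketch of such an override (the model names and rate limits here are illustrative, not recommendations; the "parse_model" and "cheap_model" aliases must remain defined because the pipeline references them by name):

custom_config = fc.SessionConfig(
    app_name="pdf_processing",
    semantic=fc.SemanticConfig(
        language_models={
            # Any supported providers work, as long as both aliases exist.
            "parse_model": fc.GoogleDeveloperLanguageModel(
                model_name="gemini-2.5-flash-lite", rpm=100, tpm=500_000),
            "cheap_model": fc.OpenAILanguageModel(
                model_name="gpt-5-nano", rpm=100, tpm=100_000),
        },
        default_language_model="cheap_model",
    ),
)

main(custom_config)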

src/fenic/_backends/local/semantic_operators/parse_pdf.py

Lines changed: 3 additions & 2 deletions
@@ -24,8 +24,9 @@ class ParsePDF(BaseSingleColumnFilePathOperator[str, str]):
     """Operator for parsing PDF files using language models with PDF parsing capabilities."""
     SYSTEM_PROMPT = jinja2.Template(dedent("""\
         Transcribe the main content of this PDF document to clean, well-formatted markdown.
-        - Output should be raw markdown, don't surround in code fences or backticks.
-        - Preserve the structure, formatting, headings, lists, and any tables to the best of your ability
+        - Output should be raw markdown, don't surround the whole output in code fences or backticks.
+        - For each topic, create a markdown heading. For key terms, use bold text.
+        - Preserve the structure, formatting, headings, lists, table of contents, and any tables using markdown syntax.
         - Format tables as github markdown tables, however:
             - for table headings, immediately add ' |' after the table heading
         {% if multiple_pages %}
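To preview the exact prompt text the model receives, the template can be rendered on its own. A minimal sketch that copies only the prompt lines visible in this hunk (the {% if multiple_pages %} branch is truncated in the diff, so it is omitted here):

from textwrap import dedent

import jinja2

prompt = jinja2.Template(dedent("""\
    Transcribe the main content of this PDF document to clean, well-formatted markdown.
    - Output should be raw markdown, don't surround the whole output in code fences or backticks.
    - For each topic, create a markdown heading. For key terms, use bold text.
    - Preserve the structure, formatting, headings, lists, table of contents, and any tables using markdown syntax.
    - Format tables as github markdown tables, however:
        - for table headings, immediately add ' |' after the table heading"""))

print(prompt.render())  # single-page case; the real template also takes multiple_pages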

src/fenic/_inference/common_openai/openai_chat_completions_core.py

Lines changed: 17 additions & 2 deletions
@@ -22,6 +22,7 @@
     FatalException,
     TransientException,
 )
+from fenic._inference.rate_limit_strategy import MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
 from fenic._inference.request_utils import generate_completion_request_key
 from fenic._inference.token_counter import TokenCounter
 from fenic._inference.types import (

@@ -90,10 +91,9 @@ async def make_single_request(
         common_params: dict[str, Any] = {
             "model": self._model,
             "messages": convert_messages(request.messages),
+            "max_completion_tokens": self.get_max_output_token_request_limit(request, profile_configuration),
             "n": 1,
         }
-        if request.max_completion_tokens:
-            common_params.update({"max_completion_tokens": request.max_completion_tokens + profile_configuration.expected_additional_reasoning_tokens})
         if request.temperature:
             common_params.update({"temperature": request.temperature})

@@ -214,3 +214,18 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
             A unique key for the request
         """
         return generate_completion_request_key(request)
+
+    def get_max_output_token_request_limit(self, request: FenicCompletionsRequest, profile_config: OpenAICompletionProfileConfiguration) -> int:
+        """Return the maximum output token limit for a request.
+
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).
+
+        Include the thinking token budget with a safety margin.
+        """
+        max_output_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # Guardrail to ensure the model uses a sane amount of output tokens.
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
+        return max_output_tokens + profile_config.expected_additional_reasoning_tokens
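The same cap arithmetic now applies across the OpenAI, Gemini, and OpenRouter clients. A standalone sketch of that logic, using stub types rather than fenic's real request and profile classes:

from dataclasses import dataclass
from typing import Optional

MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST = 8192  # mirrors rate_limit_strategy.py

@dataclass
class StubRequest:
    max_completion_tokens: Optional[int]
    has_user_file: bool  # stands in for request.messages.user_file

def max_output_token_limit(req: StubRequest, expected_reasoning_tokens: int) -> int:
    limit = req.max_completion_tokens or 0
    if req.max_completion_tokens is None and req.has_user_file:
        # PDF parse with no explicit cap: fall back to the fixed guardrail rather
        # than the old count_file_output_tokens(...) * 2 estimate, which breaks
        # down for empty or image-only (scanned) PDFs.
        limit = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
    return limit + expected_reasoning_tokens

assert max_output_token_limit(StubRequest(None, True), 1024) == 9216   # guardrail + reasoning budget
assert max_output_token_limit(StubRequest(512, True), 1024) == 1536    # explicit cap wins
assert max_output_token_limit(StubRequest(None, False), 1024) == 1024  # no file, no guardrail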

src/fenic/_inference/google/gemini_native_chat_completions_client.py

Lines changed: 6 additions & 6 deletions
@@ -27,6 +27,7 @@
     TransientException,
 )
 from fenic._inference.rate_limit_strategy import (
+    MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST,
     TokenEstimate,
     UnifiedTokenRateLimitStrategy,
 )

@@ -176,16 +177,14 @@ async def make_single_request(
         """
 
         profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
-        max_output_tokens = self._get_max_output_token_request_limit(request)
-
         generation_config: GenerateContentConfigDict = {
             "temperature": request.temperature,
             "response_logprobs": request.top_logprobs is not None,
             "logprobs": request.top_logprobs,
+            "max_output_tokens": self._get_max_output_token_request_limit(request),
             "system_instruction": request.messages.system,
         }
-        if max_output_tokens is not None:
-            generation_config["max_output_tokens"] = max_output_tokens
+
         generation_config.update(profile_config.additional_generation_config)
         if request.structured_output is not None:
             generation_config.update(

@@ -342,14 +341,15 @@ def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
         """Get the upper limit of output tokens for a request.

-        If max_completion_tokens is not set, don't apply a limit and return None.
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).

         Include the thinking token budget with a safety margin."""
         max_output_tokens = request.max_completion_tokens or 0
         if request.max_completion_tokens is None and request.messages.user_file:
             # Guardrail to ensure the model uses a sane amount of output tokens.
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
             # TODO(DY): the semantic operator should dictate how the file affects the token estimate
-            max_output_tokens = self.token_counter.count_file_output_tokens(request.messages) * 2
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
         return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)

     def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
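One behavioral change worth noting here: this client previously set max_output_tokens in the generation config only when a limit was computed, whereas the cap is now always present. A rough sketch of the resulting config shape, with illustrative values (not fenic code; google-genai's GenerateContentConfigDict accepts these keys):

from google.genai.types import GenerateContentConfigDict

generation_config: GenerateContentConfigDict = {
    "temperature": 0.0,
    "response_logprobs": False,
    "max_output_tokens": 8192 + 1024,  # guardrail plus expected reasoning budget
    "system_instruction": "Transcribe the main content of this PDF document to clean, well-formatted markdown.",
}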

src/fenic/_inference/openai/openai_batch_chat_completions_client.py

Lines changed: 8 additions & 11 deletions
@@ -130,18 +130,15 @@ def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
         if request.max_completion_tokens is None and request.messages.user_file:
             # TODO(DY): the semantic operator should dictate how the file affects the token estimate
             base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
-        return base_tokens + self._get_expected_additional_reasoning_tokens(request)
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        return base_tokens + profile_config.expected_additional_reasoning_tokens

     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
-        """Return the maximum output token limit for a request."""
-        max_output_tokens = request.max_completion_tokens or 0
-        if request.max_completion_tokens is None and request.messages.user_file:
-            # Guardrail to ensure the model uses a sane amount of output tokens.
-            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
-            max_output_tokens = self.token_counter.count_file_output_tokens(request.messages) * 2
-        return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)
+        """Return the maximum output token limit for a request.

-    def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
-        """Get the expected additional reasoning tokens for a request."""
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).
+
+        Include the thinking token budget with a safety margin.
+        """
         profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
-        return profile_config.expected_additional_reasoning_tokens
+        return self._core.get_max_output_token_request_limit(request, profile_config)

src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py

Lines changed: 7 additions & 8 deletions
@@ -21,6 +21,7 @@
 )
 from fenic._inference.openrouter.openrouter_provider import OpenRouterModelProvider
 from fenic._inference.rate_limit_strategy import (
+    MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST,
     AdaptiveBackoffRateLimitStrategy,
     RateLimitStrategy,
     TokenEstimate,

@@ -268,17 +269,15 @@ def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
         """Get the upper limit of output tokens for a request.

-        If max_completion_tokens is not set, don't apply a limit and return None.
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).

         Include the thinking token budget with a safety margin."""
-        if request.max_completion_tokens:
-            max_output_tokens = request.max_completion_tokens
-        elif request.messages.user_file:
+        max_output_tokens = request.max_completion_tokens
+        if request.max_completion_tokens is None and request.messages.user_file:
             # Guardrail to ensure the model uses a sane amount of output tokens.
-            if self._google_token_counter:
-                max_output_tokens = self._google_token_counter.count_file_output_tokens(messages=request.messages) * 2
-            else:
-                max_output_tokens = self.token_counter.count_file_output_tokens(messages=request.messages) * 2
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
         return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)

     def _estimate_input_tokens(self, request: FenicCompletionsRequest) -> int:

src/fenic/_inference/rate_limit_strategy.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@

 logger = logging.getLogger(__name__)

-
+MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST = 8192
 @dataclass
 class TokenEstimate:
     input_tokens: int = 0
