Commit eef48d2

feat: tweak pdf parser for corner cases and add 120s demo
1 parent 3ed6592 commit eef48d2

File tree

9 files changed: 910 additions & 30 deletions

examples/fenic_in_120_seconds/18_pdf_processing.ipynb

Lines changed: 739 additions & 0 deletions
Large diffs are not rendered by default.

examples/pdf_processing/pdf_processing.ipynb

Whitespace-only changes.
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
"""Document metadata extraction example using fenic semantic operations.

This example demonstrates how to extract structured metadata from unstructured document text
using fenic's Pydantic model integration for schema definitions.
"""
import os
import shutil
from typing import List, Optional

import huggingface_hub as hf
from pydantic import BaseModel, Field

import fenic as fc

data_dir = "examples_data"


def main(config: Optional[fc.SessionConfig] = None):
    """Extract metadata from document excerpts using semantic operations."""
    # Configure session with semantic capabilities
    config = config or fc.SessionConfig(
        app_name="pdf_processing",
        semantic=fc.SemanticConfig(
            language_models={
                "parse_model": fc.GoogleDeveloperLanguageModel(
                    model_name="gemini-2.5-flash-lite",
                    rpm=500,
                    tpm=1_000_000,
                ),
                "cheap_model": fc.OpenAILanguageModel(
                    model_name="gpt-5-nano",
                    rpm=500,
                    tpm=200_000,
                ),
            },
            default_language_model="cheap_model",
        ),
    )

    # Create session
    session = fc.Session.get_or_create(config)

    # Setup: download some sample PDFs.
    # Note: this will be unnecessary once fenic's PDF loader can read directly from Hugging Face.
    repo_id = "typedef-ai/pdf_data"
    os.makedirs(data_dir, exist_ok=True)

    files = hf.list_repo_files(repo_id=repo_id, repo_type="dataset")
    for file in files:
        if file.startswith("whitepapers/"):
            hf.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=file, local_dir=data_dir)

    # Read PDF metadata and keep documents that are long enough and not encrypted
    pdf_filtered_df = session.read.pdf_metadata(f"{data_dir}/whitepapers/*.pdf").filter(
        (fc.col("page_count") > 3) & ~fc.col("is_encrypted"))

    # Use the Gemini parsing model to transcribe the PDF content into raw markdown
    pdf_to_md_content = pdf_filtered_df.with_column(
        "markdown_content",
        fc.semantic.parse_pdf(
            fc.col("file_path"), model_alias="parse_model")).cache()

    # fenic's powerful markdown processing can impose structure on the content.
    # Here we split each document into sections and generate a table of contents.
    pdf_sections_df = pdf_to_md_content.select(
        fc.when(
            fc.col("title").is_not_null(),
            fc.col("title")
        ).otherwise(
            fc.text.split_part(fc.col("file_path"), "/", -1)
        ).alias("name"),
        "markdown_content",
        # Extract sections up to level 3 headers, returning an array of section objects
        fc.markdown.extract_header_chunks(fc.col("markdown_content"), header_level=3).alias("sections"),
        fc.markdown.generate_toc(fc.col("markdown_content")).alias("toc")
    )

    pdf_sections_df.show()

    # Use the processed markdown to extract content information with a minimum of tokens
    class PDFContentCategorization(BaseModel):
        """Pydantic model for PDF content categorization."""
        summary: str = Field(description="Brief one sentence summary of the PDF given its table of contents")
        sections_about_model_training: List[str] = Field(description="List of headings that are specifically about model training")
        products_mentioned: List[str] = Field(description="All product names mentioned in the PDF table of contents")

    pdf_filtered_details = pdf_sections_df.with_column(
        "content_categorization",
        fc.semantic.extract("toc", PDFContentCategorization, model_alias="cheap_model")
    ).cache()

    # Save tokens by summarizing each whitepaper from its table of contents alone
    print("=" * 70)
    print("SUMMARY OF WHITEPAPERS")
    print("=" * 70)
    for row in pdf_filtered_details.to_pylist():
        print(f"Whitepaper: {row['name']}")
        print(f"Summary: {row['content_categorization.summary']}")
        print(f"Products mentioned: {row['content_categorization.products_mentioned']}")

    # Take a closer look at the sections that detail model training
    model_training_sections_df = pdf_filtered_details.explode("sections").filter(
        fc.col("content_categorization.sections_about_model_training").is_not_null() &
        fc.array_contains(fc.col("content_categorization.sections_about_model_training"), fc.col("sections.heading"))
    )

    print("=" * 70)
    print("FILTER DATA ABOUT MODEL TRAINING")
    print("=" * 70)
    print(f"Found {model_training_sections_df.count()} sections about model training:")
    model_training_sections_df.show()

    # Clean up the downloaded PDFs
    shutil.rmtree(data_dir)

    # Clean up the session
    session.stop()


if __name__ == "__main__":
    main()
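The main() entry point takes an optional fc.SessionConfig, so the demo can be pointed at other providers without touching the pipeline. A minimal sketch of such an override (the model names and rate limits here are illustrative, not recommendations; the "parse_model" and "cheap_model" aliases must remain defined because the pipeline references them by name):

custom_config = fc.SessionConfig(
    app_name="pdf_processing",
    semantic=fc.SemanticConfig(
        language_models={
            # Any supported providers work, as long as both aliases exist.
            "parse_model": fc.GoogleDeveloperLanguageModel(
                model_name="gemini-2.5-flash-lite", rpm=100, tpm=500_000),
            "cheap_model": fc.OpenAILanguageModel(
                model_name="gpt-5-nano", rpm=100, tpm=100_000),
        },
        default_language_model="cheap_model",
    ),
)

main(custom_config)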

src/fenic/_backends/local/semantic_operators/parse_pdf.py

Lines changed: 3 additions & 2 deletions
@@ -24,8 +24,9 @@ class ParsePDF(BaseSingleColumnFilePathOperator[str, str]):
     """Operator for parsing PDF files using language models with PDF parsing capabilities."""
     SYSTEM_PROMPT = jinja2.Template(dedent("""\
         Transcribe the main content of this PDF document to clean, well-formatted markdown.
-        - Output should be raw markdown, don't surround in code fences or backticks.
-        - Preserve the structure, formatting, headings, lists, and any tables to the best of your ability
+        - Output should be raw markdown, don't surround the whole output in code fences or backticks.
+        - For each topic, create a markdown heading. For key terms, use bold text.
+        - Preserve the structure, formatting, headings, lists, table of contents, and any tables using markdown syntax.
         - Format tables as github markdown tables, however:
             - for table headings, immediately add ' |' after the table heading
         {% if multiple_pages %}
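To preview the exact prompt text the model receives, the template can be rendered on its own. A minimal sketch that copies only the prompt lines visible in this hunk (the {% if multiple_pages %} branch is truncated in the diff, so it is omitted here):

from textwrap import dedent

import jinja2

prompt = jinja2.Template(dedent("""\
    Transcribe the main content of this PDF document to clean, well-formatted markdown.
    - Output should be raw markdown, don't surround the whole output in code fences or backticks.
    - For each topic, create a markdown heading. For key terms, use bold text.
    - Preserve the structure, formatting, headings, lists, table of contents, and any tables using markdown syntax.
    - Format tables as github markdown tables, however:
        - for table headings, immediately add ' |' after the table heading"""))

print(prompt.render())  # single-page case; the real template also takes multiple_pages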

src/fenic/_inference/common_openai/openai_chat_completions_core.py

Lines changed: 17 additions & 2 deletions
@@ -22,6 +22,7 @@
     FatalException,
     TransientException,
 )
+from fenic._inference.rate_limit_strategy import MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
 from fenic._inference.request_utils import generate_completion_request_key
 from fenic._inference.token_counter import TokenCounter
 from fenic._inference.types import (

@@ -90,10 +91,9 @@ async def make_single_request(
         common_params: dict[str, Any] = {
             "model": self._model,
             "messages": convert_messages(request.messages),
+            "max_completion_tokens": self.get_max_output_token_request_limit(request, profile_configuration),
             "n": 1,
         }
-        if request.max_completion_tokens:
-            common_params.update({"max_completion_tokens": request.max_completion_tokens + profile_configuration.expected_additional_reasoning_tokens})
         if request.temperature:
             common_params.update({"temperature": request.temperature})

@@ -214,3 +214,18 @@ def get_request_key(self, request: FenicCompletionsRequest) -> str:
             A unique key for the request
         """
         return generate_completion_request_key(request)
+
+    def get_max_output_token_request_limit(self, request: FenicCompletionsRequest, profile_config: OpenAICompletionProfileConfiguration) -> int:
+        """Return the maximum output token limit for a request.
+
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).
+
+        Include the thinking token budget with a safety margin.
+        """
+        max_output_tokens = request.max_completion_tokens or 0
+        if request.max_completion_tokens is None and request.messages.user_file:
+            # Guardrail to ensure the model uses a sane amount of output tokens.
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
+        return max_output_tokens + profile_config.expected_additional_reasoning_tokens
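The same cap arithmetic now applies across the OpenAI, Gemini, and OpenRouter clients. A standalone sketch of that logic, using stub types rather than fenic's real request and profile classes:

from dataclasses import dataclass
from typing import Optional

MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST = 8192  # mirrors rate_limit_strategy.py

@dataclass
class StubRequest:
    max_completion_tokens: Optional[int]
    has_user_file: bool  # stands in for request.messages.user_file

def max_output_token_limit(req: StubRequest, expected_reasoning_tokens: int) -> int:
    limit = req.max_completion_tokens or 0
    if req.max_completion_tokens is None and req.has_user_file:
        # PDF parse with no explicit cap: fall back to the fixed guardrail rather
        # than the old count_file_output_tokens(...) * 2 estimate, which breaks
        # down for empty or image-only (scanned) PDFs.
        limit = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
    return limit + expected_reasoning_tokens

assert max_output_token_limit(StubRequest(None, True), 1024) == 9216   # guardrail + reasoning budget
assert max_output_token_limit(StubRequest(512, True), 1024) == 1536    # explicit cap wins
assert max_output_token_limit(StubRequest(None, False), 1024) == 1024  # no file, no guardrail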

src/fenic/_inference/google/gemini_native_chat_completions_client.py

Lines changed: 6 additions & 6 deletions
@@ -27,6 +27,7 @@
     TransientException,
 )
 from fenic._inference.rate_limit_strategy import (
+    MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST,
     TokenEstimate,
     UnifiedTokenRateLimitStrategy,
 )

@@ -176,16 +177,14 @@ async def make_single_request(
         """
 
         profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
-        max_output_tokens = self._get_max_output_token_request_limit(request)
-
         generation_config: GenerateContentConfigDict = {
             "temperature": request.temperature,
             "response_logprobs": request.top_logprobs is not None,
             "logprobs": request.top_logprobs,
+            "max_output_tokens": self._get_max_output_token_request_limit(request),
             "system_instruction": request.messages.system,
         }
-        if max_output_tokens is not None:
-            generation_config["max_output_tokens"] = max_output_tokens
+
         generation_config.update(profile_config.additional_generation_config)
         if request.structured_output is not None:
             generation_config.update(

@@ -342,14 +341,15 @@ def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> Optional[int]:
         """Get the upper limit of output tokens for a request.

-        If max_completion_tokens is not set, don't apply a limit and return None.
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).

         Include the thinking token budget with a safety margin."""
         max_output_tokens = request.max_completion_tokens or 0
         if request.max_completion_tokens is None and request.messages.user_file:
             # Guardrail to ensure the model uses a sane amount of output tokens.
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
             # TODO(DY): the semantic operator should dictate how the file affects the token estimate
-            max_output_tokens = self.token_counter.count_file_output_tokens(request.messages) * 2
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
         return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)

     def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
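One behavioral change worth noting here: this client previously set max_output_tokens in the generation config only when a limit was computed, whereas the cap is now always present. A rough sketch of the resulting config shape, with illustrative values (not fenic code; google-genai's GenerateContentConfigDict accepts these keys):

from google.genai.types import GenerateContentConfigDict

generation_config: GenerateContentConfigDict = {
    "temperature": 0.0,
    "response_logprobs": False,
    "max_output_tokens": 8192 + 1024,  # guardrail plus expected reasoning budget
    "system_instruction": "Transcribe the main content of this PDF document to clean, well-formatted markdown.",
}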

src/fenic/_inference/openai/openai_batch_chat_completions_client.py

Lines changed: 8 additions & 11 deletions
@@ -130,18 +130,15 @@ def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
         if request.max_completion_tokens is None and request.messages.user_file:
             # TODO(DY): the semantic operator should dictate how the file affects the token estimate
             base_tokens += self.token_counter.count_file_output_tokens(messages=request.messages)
-        return base_tokens + self._get_expected_additional_reasoning_tokens(request)
+        profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
+        return base_tokens + profile_config.expected_additional_reasoning_tokens

     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
-        """Return the maximum output token limit for a request."""
-        max_output_tokens = request.max_completion_tokens or 0
-        if request.max_completion_tokens is None and request.messages.user_file:
-            # Guardrail to ensure the model uses a sane amount of output tokens.
-            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
-            max_output_tokens = self.token_counter.count_file_output_tokens(request.messages) * 2
-        return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)
+        """Return the maximum output token limit for a request.

-    def _get_expected_additional_reasoning_tokens(self, request: FenicCompletionsRequest) -> int:
-        """Get the expected additional reasoning tokens for a request."""
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).
+
+        Include the thinking token budget with a safety margin.
+        """
         profile_config = self._profile_manager.get_profile_by_name(request.model_profile)
-        return profile_config.expected_additional_reasoning_tokens
+        return self._core.get_max_output_token_request_limit(request, profile_config)

src/fenic/_inference/openrouter/openrouter_batch_chat_completions_client.py

Lines changed: 7 additions & 8 deletions
@@ -21,6 +21,7 @@
 )
 from fenic._inference.openrouter.openrouter_provider import OpenRouterModelProvider
 from fenic._inference.rate_limit_strategy import (
+    MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST,
     AdaptiveBackoffRateLimitStrategy,
     RateLimitStrategy,
     TokenEstimate,

@@ -268,17 +269,15 @@ def _estimate_output_tokens(self, request: FenicCompletionsRequest) -> int:
     def _get_max_output_token_request_limit(self, request: FenicCompletionsRequest) -> int:
         """Get the upper limit of output tokens for a request.

-        If max_completion_tokens is not set, don't apply a limit and return None.
+        For file parsing requests, use a guardrail limit of 8192 tokens (the lowest output limit of a VLM model we support).

         Include the thinking token budget with a safety margin."""
-        if request.max_completion_tokens:
-            max_output_tokens = request.max_completion_tokens
-        elif request.messages.user_file:
+        max_output_tokens = request.max_completion_tokens
+        if request.max_completion_tokens is None and request.messages.user_file:
             # Guardrail to ensure the model uses a sane amount of output tokens.
-            if self._google_token_counter:
-                max_output_tokens = self._google_token_counter.count_file_output_tokens(messages=request.messages) * 2
-            else:
-                max_output_tokens = self.token_counter.count_file_output_tokens(messages=request.messages) * 2
+            # Note: we can't use our token estimation because the pdf could be empty, or have only images (scans).
+            # TODO(DY): the semantic operator should dictate how the file affects the token estimate
+            max_output_tokens = MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST
         return max_output_tokens + self._get_expected_additional_reasoning_tokens(request)

     def _estimate_input_tokens(self, request: FenicCompletionsRequest) -> int:

src/fenic/_inference/rate_limit_strategy.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@

 logger = logging.getLogger(__name__)

-
+MAX_OUTPUT_TOKENS_PER_PARSE_PDF_REQUEST = 8192
 @dataclass
 class TokenEstimate:
     input_tokens: int = 0
