11import logging
22from textwrap import dedent
3- from typing import List , Optional , Tuple
3+ from typing import List , Optional , Tuple , Union
44
55import fitz
66import jinja2
1010 BaseSingleColumnFilePathOperator ,
1111 CompletionOnlyRequestSender ,
1212)
13- from fenic ._backends .local .utils .doc_loader import DocFolderLoader
13+ from fenic ._backends .local .utils .doc_loader import (
14+ DocFolderLoader ,
15+ resolve_and_coalesce_pages ,
16+ validate_pages_argument ,
17+ )
1418from fenic ._inference .language_model import InferenceConfiguration , LanguageModel
1519from fenic ._inference .types import LMRequestFile , LMRequestMessages
1620from fenic .core ._logical_plan .resolved_types import ResolvedModelAlias
@@ -48,11 +52,13 @@ def __init__(
4852 page_separator : Optional [str ] = None ,
4953 describe_images : bool = False ,
5054 model_alias : Optional [ResolvedModelAlias ] = None ,
55+ pages : Optional [Union [pl .Series , int , List [Union [int , List [int ]]]]] = None ,
5156 ):
5257 self .page_separator = page_separator
5358 self .describe_images = describe_images
5459 self .model = model
5560 self .model_alias = model_alias
61+ self .pages = pages
5662
5763 DocFolderLoader .check_file_extensions (input .to_list (), "pdf" )
5864
@@ -105,12 +111,19 @@ def build_request_messages_batch(self) -> Tuple[List[Optional[LMRequestMessages]
105111 List of the each chunk size (page count) per PDF (page_counts_per_chunk_per_row)"""
106112 messages_batch = []
107113 page_counts_per_chunk_per_row = []
108- for path in self .input :
114+ for idx , path in enumerate ( self .input ) :
109115 if not path :
110116 messages_batch .append (None )
111117 page_counts_per_chunk_per_row .append ([1 ])
112118 else :
113- file_chunks = self ._get_file_chunks (path )
119+ # pages can be a literal int, list of ranges, or a logical expression that resolves to an int or list of ranges
120+ row_pages = self .pages .to_list ()[idx ] if isinstance (self .pages , pl .Series ) else self .pages
121+
122+ # Validate pages if it's not None (validation happens here for column values)
123+ if row_pages is not None :
124+ validate_pages_argument (row_pages )
125+
126+ file_chunks = self ._get_file_chunks (path , row_pages )
114127 page_counts_per_chunk = []
115128 for file in file_chunks :
116129 messages_batch .append (
@@ -120,57 +133,73 @@ def build_request_messages_batch(self) -> Tuple[List[Optional[LMRequestMessages]
120133 page_counts_per_chunk_per_row .append (page_counts_per_chunk )
121134 return messages_batch , page_counts_per_chunk_per_row
122135
def _get_file_chunks(self, file_path: str, pages: Optional[Union[int, List[Union[int, List[int]]]]] = None) -> List[LMRequestFile]:
    """Split the PDF at *file_path* into page chunks sized for the model.

    Every chunk respects two caps: the model's output-token budget (scaled by
    PDF_MARKDOWN_OUTPUT_TOKEN_MULTIPLIER) and the internal per-chunk page
    limit (PDF_MAX_PAGES_CHUNK).

    Args:
        file_path: Path to the PDF file.
        pages: Optional pages specification (1-indexed). If None, all pages
            are processed.

    Returns:
        List of LMRequestFile objects, one per chunk.
    """
    chunks: List[LMRequestFile] = []

    with fitz.open(file_path) as doc:
        total_pages = doc.page_count

        # Normalize the pages argument into inclusive, 0-indexed (start, end) ranges.
        if pages is None:
            ranges = [(0, total_pages - 1)]
        else:
            ranges = [
                (start, min(end, total_pages - 1))
                for start, end in resolve_and_coalesce_pages(pages, total_pages)
                if start < total_pages  # drop ranges entirely past the document
            ]

        token_budget = self.model.model_parameters.max_output_tokens

        def emit(start: int, end: int) -> None:
            # Materialize pages [start, end] (inclusive) as a standalone PDF chunk.
            with fitz.open() as piece:
                piece.insert_pdf(doc, from_page=start, to_page=end)
                chunks.append(LMRequestFile(path=file_path, pdf_chunk_bytes=piece.tobytes(), page_range=(start, end)))

        for range_start, range_end in ranges:
            # Running state for the chunk currently being accumulated.
            cur_start = range_start
            cur_tokens = 0
            cur_pages = 0

            for page_num in range(range_start, range_end + 1):
                page_tokens = self.model.count_tokens(doc[page_num].get_text("text"))

                # Close the running chunk once adding this page would blow either limit.
                over_tokens = cur_tokens > 0 and (cur_tokens + page_tokens) * PDF_MARKDOWN_OUTPUT_TOKEN_MULTIPLIER > token_budget
                over_pages = cur_pages >= PDF_MAX_PAGES_CHUNK
                if over_tokens or over_pages:
                    emit(cur_start, page_num - 1)
                    # This page starts the next chunk.
                    cur_start, cur_tokens, cur_pages = page_num, page_tokens, 1
                else:
                    cur_tokens += page_tokens
                    cur_pages += 1

            # Flush whatever remains of this range.
            if cur_start <= range_end:
                if cur_start == 0 and range_end == total_pages - 1 and len(ranges) == 1:
                    # Whole PDF fits in one chunk: skip the in-memory copy.
                    chunks.append(LMRequestFile(path=file_path, pdf_chunk_bytes=None, page_range=(0, total_pages - 1)))
                else:
                    emit(cur_start, range_end)

    return chunks
176205
0 commit comments