diff --git a/.gitignore b/.gitignore index 47d38baef..c98d040d3 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,5 @@ SDK/* log/* logs/ parts/* +test.ipynb json_results/* diff --git a/pageindex/config.yaml b/pageindex/config.yaml index fd73e3a2c..216959dbf 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,8 +1,16 @@ model: "gpt-4o-2024-11-20" +base_url: "" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "yes" if_add_doc_description: "no" -if_add_node_text: "no" \ No newline at end of file +if_add_node_text: "no" +pdf_parser: "docling" # Options: PyPDF2, PyMuPDF, docling +use_docling: True +use_gpu: False +do_ocr: True +do_table_structure: True +do_cell_matching: True +tokenizer_model: "o200k_base" \ No newline at end of file diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 882fb5dea..16ac9510d 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -5,6 +5,20 @@ import random import re from .utils import * +from schemas.schemas import ( + TitleAppearanceResponse, + TitleAppearanceInStartResponse, + TocDetectedResponse, + CompletedResponse, + PageIndexGivenResponse, + TocItem, + TocWithPage, + TableOfContents, + TocItemWithStart, + PhysicalIndexResponse, + TocItemList, + TocItemWithStartList +) import os from concurrent.futures import ThreadPoolExecutor, as_completed @@ -27,21 +41,10 @@ async def check_title_appearance(item, page_list, start_index=1, model=None): The given section title is {title}. The given page_text is {page_text}. - - Reply format: - {{ - - "thinking": - "answer": "yes or no" (yes if the section appears or starts in the page_text, no otherwise) - }} - Directly return the final JSON structure. Do not output anything else.""" - - response = await ChatGPT_API_async(model=model, prompt=prompt) - response = extract_json(response) - if 'answer' in response: - answer = response['answer'] - else: - answer = 'no' + """ + + response = await ChatGPT_API_async(model=model, prompt=prompt, response_format=TitleAppearanceResponse) + answer = response.answer return {'list_index': item['list_index'], 'answer': answer, 'title': title, 'page_number': page_number} @@ -56,19 +59,12 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N The given section title is {title}. The given page_text is {page_text}. - - reply format: - {{ - "thinking": - "start_begin": "yes or no" (yes if the section starts in the beginning of the page_text, no otherwise) - }} - Directly return the final JSON structure. Do not output anything else.""" - - response = await ChatGPT_API_async(model=model, prompt=prompt) - response = extract_json(response) + """ + + response = await ChatGPT_API_async(model=model, prompt=prompt, response_format=TitleAppearanceInStartResponse) if logger: - logger.info(f"Response: {response}") - return response.get("start_begin", "no") + logger.info(f"Response: {response.model_dump()}") + return response.start_begin async def check_title_appearance_in_start_concurrent(structure, page_list, model=None, logger=None): @@ -107,55 +103,43 @@ def toc_detector_single_page(content, model=None): Given text: {content} - return the following JSON format: - {{ - "thinking": - "toc_detected": "", - }} - - Directly return the final JSON structure. Do not output anything else. - Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.""" + Please note: abstract, summary, notation list, figure list, table list, etc. are not table of contents. + """ - response = ChatGPT_API(model=model, prompt=prompt) - # print('response', response) - json_content = extract_json(response) - return json_content['toc_detected'] + response = ChatGPT_API(model=model, prompt=prompt, response_format=TocDetectedResponse) + return response.toc_detected def check_if_toc_extraction_is_complete(content, toc, model=None): prompt = f""" - You are given a partial document and a table of contents. - Your job is to check if the table of contents is complete, which it contains all the main sections in the partial document. + You are given a partial document and a table of contents. + Your job is to check if the table of contents is complete, which means it contains all the main sections in the partial document. - Reply format: - {{ - "thinking": - "completed": "yes" or "no" - }} - Directly return the final JSON structure. Do not output anything else.""" + Document: + {content} - prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc - response = ChatGPT_API(model=model, prompt=prompt) - json_content = extract_json(response) - return json_content['completed'] + Table of contents: + {toc} + """ + + response = ChatGPT_API(model=model, prompt=prompt, response_format=CompletedResponse) + return response.completed def check_if_toc_transformation_is_complete(content, toc, model=None): prompt = f""" - You are given a raw table of contents and a table of contents. - Your job is to check if the table of contents is complete. + You are given a raw table of contents and a cleaned table of contents. + Your job is to check if the cleaned table of contents is complete. - Reply format: - {{ - "thinking": - "completed": "yes" or "no" - }} - Directly return the final JSON structure. Do not output anything else.""" + Raw Table of contents: + {content} - prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc - response = ChatGPT_API(model=model, prompt=prompt) - json_content = extract_json(response) - return json_content['completed'] + Cleaned Table of contents: + {toc} + """ + + response = ChatGPT_API(model=model, prompt=prompt, response_format=CompletedResponse) + return response.completed def extract_toc_content(content, model=None): prompt = f""" @@ -204,17 +188,10 @@ def detect_page_index(toc_content, model=None): Your job is to detect if there are page numbers/indices given within the table of contents. Given text: {toc_content} + """ - Reply format: - {{ - "thinking": - "page_index_given_in_toc": "" - }} - Directly return the final JSON structure. Do not output anything else.""" - - response = ChatGPT_API(model=model, prompt=prompt) - json_content = extract_json(response) - return json_content['page_index_given_in_toc'] + response = ChatGPT_API(model=model, prompt=prompt, response_format=PageIndexGivenResponse) + return response.page_index_given_in_toc def toc_extractor(page_list, toc_page_list, model): def transform_dots_to_colon(text): @@ -246,54 +223,31 @@ def toc_index_extractor(toc, content, model=None): The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc. - The response should be in the following JSON format: - [ - { - "structure": (string), - "title": , - "physical_index": "<physical_index_X>" (keep the format) - }, - ... - ] - Only add the physical_index to the sections that are in the provided pages. - If the section is not in the provided pages, do not add the physical_index to it. - Directly return the final JSON structure. Do not output anything else.""" + If the section is not in the provided pages, set physical_index to null. + """ prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content - response = ChatGPT_API(model=model, prompt=prompt) - json_content = extract_json(response) - return json_content + response = ChatGPT_API(model=model, prompt=prompt, response_format=TocItemList) + return [item.model_dump() for item in response.items] def toc_transformer(toc_content, model=None): print('start toc_transformer') init_prompt = """ - You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents. + You are given a table of contents. Your job is to transform the whole table of content into a JSON format. structure is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc. - The response should be in the following JSON format: - { - table_of_contents: [ - { - "structure": <structure index, "x.x.x" or None> (string), - "title": <title of the section>, - "page": <page number or None>, - }, - ... - ], - } You should transform the full table of contents in one go. - Directly return the final JSON structure, do not output anything else. """ + """ prompt = init_prompt + '\n Given table of contents\n:' + toc_content - last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) - if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model) + last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, response_format=TableOfContents) + if_complete = check_if_toc_transformation_is_complete(toc_content, str(last_complete.model_dump()), model) if if_complete == "yes" and finish_reason == "finished": - last_complete = extract_json(last_complete) - cleaned_response=convert_page_to_int(last_complete['table_of_contents']) + cleaned_response = convert_page_to_int([item.model_dump() for item in last_complete.table_of_contents]) return cleaned_response last_complete = get_json_content(last_complete) @@ -452,30 +406,19 @@ def page_list_to_group_text(page_contents, token_lengths, max_tokens=20000, over def add_page_number_to_toc(part, structure, model=None): fill_prompt_seq = """ - You are given an JSON structure of a document and a partial part of the document. Your task is to check if the title that is described in the structure is started in the partial given document. + You are given a JSON structure of a document and a partial part of the document. Your task is to check if the title that is described in the structure is started in the partial given document. The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X. - If the full target section starts in the partial given document, insert the given JSON structure with the "start": "yes", and "start_index": "<physical_index_X>". - - If the full target section does not start in the partial given document, insert "start": "no", "start_index": None. + If the full target section starts in the partial given document, set "start": "yes" and "physical_index": "<physical_index_X>". + If the full target section does not start in the partial given document, set "start": "no" and "physical_index": null. - The response should be in the following format. - [ - { - "structure": <structure index, "x.x.x" or None> (string), - "title": <title of the section>, - "start": "<yes or no>", - "physical_index": "<physical_index_X> (keep the format)" or None - }, - ... - ] The given structure contains the result of the previous part, you need to fill the result of the current part, do not change the previous result. - Directly return the final JSON structure. Do not output anything else.""" + """ prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n" - current_json_raw = ChatGPT_API(model=model, prompt=prompt) - json_result = extract_json(current_json_raw) + current_json_raw = ChatGPT_API(model=model, prompt=prompt, response_format=TocItemWithStartList) + json_result = [item.model_dump() for item in current_json_raw.items] for item in json_result: if 'start' in item: @@ -507,26 +450,17 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"): For the title, you need to extract the original title from the text, only fix the space inconsistency. - The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. \ + The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format. - The response should be in the following format. - [ - { - "structure": <structure index, "x.x.x"> (string), - "title": <title of the section, keep the original title>, - "physical_index": "<physical_index_X> (keep the format)" - }, - ... - ] - - Directly return the additional part of the final JSON structure. Do not output anything else.""" + Directly return the additional part of the final JSON structure. Do not output anything else. + """ prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2) - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, response_format=TocItemList) if finish_reason == 'finished': - return extract_json(response) + return [item.model_dump() for item in response.items] else: raise Exception(f'finish reason: {finish_reason}') @@ -540,38 +474,28 @@ def generate_toc_init(part, model=None): For the title, you need to extract the original title from the text, only fix the space inconsistency. - The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. + The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format. - The response should be in the following format. - [ - {{ - "structure": <structure index, "x.x.x"> (string), - "title": <title of the section, keep the original title>, - "physical_index": "<physical_index_X> (keep the format)" - }}, - - ], - - - Directly return the final JSON structure. Do not output anything else.""" + Directly return the final JSON structure. Do not output anything else. + """ prompt = prompt + '\nGiven text\n:' + part - response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt) + response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, response_format=TocItemList) if finish_reason == 'finished': - return extract_json(response) + return [item.model_dump() for item in response.items] else: raise Exception(f'finish reason: {finish_reason}') -def process_no_toc(page_list, start_index=1, model=None, logger=None): +def process_no_toc(page_list, start_index=1, model=None, tokenizer_model=None, logger=None): page_contents=[] token_lengths=[] for page_index in range(start_index, start_index+len(page_list)): page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n" page_contents.append(page_text) - token_lengths.append(count_tokens(page_text, model)) + token_lengths.append(count_tokens(page_text, tokenizer_model)) group_texts = page_list_to_group_text(page_contents, token_lengths) logger.info(f'len(group_texts): {len(group_texts)}') @@ -586,7 +510,7 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None): return toc_with_page_number -def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None): +def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None,tokenizer_model=None, logger=None): page_contents=[] token_lengths=[] toc_content = toc_transformer(toc_content, model) @@ -594,7 +518,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in for page_index in range(start_index, start_index+len(page_list)): page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n" page_contents.append(page_text) - token_lengths.append(count_tokens(page_text, model)) + token_lengths.append(count_tokens(page_text, tokenizer_model)) group_texts = page_list_to_group_text(page_contents, token_lengths) logger.info(f'len(group_texts): {len(group_texts)}') @@ -611,7 +535,7 @@ def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_in -def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=None, model=None, logger=None): +def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=None, model=None, logger=None): toc_with_page_number = toc_transformer(toc_content, model) logger.info(f'toc_with_page_number: {toc_with_page_number}') @@ -723,29 +647,17 @@ def check_toc(page_list, opt=None): print('index not found') return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'no'} - - - - - ################### fix incorrect toc ######################################################### def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"): tob_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X. - - Reply in a JSON format: - { - "thinking": <explain which page, started and closed by <physical_index_X>, contains the start of this section>, - "physical_index": "<physical_index_X>" (keep the format) - } - Directly return the final JSON structure. Do not output anything else.""" + """ prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content - response = ChatGPT_API(model=model, prompt=prompt) - json_content = extract_json(response) - return convert_physical_index_to_int(json_content['physical_index']) + response = ChatGPT_API(model=model, prompt=prompt, response_format=PhysicalIndexResponse) + return convert_physical_index_to_int(response.physical_index) @@ -955,9 +867,9 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N if mode == 'process_toc_with_page_numbers': toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger) elif mode == 'process_toc_no_page_numbers': - toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger) + toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model,tokenizer_model=opt.tokenizer_model, logger=logger) else: - toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger) + toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, tokenizer_model=opt.tokenizer_model, logger=logger) toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None] @@ -1066,7 +978,9 @@ def page_index_main(doc, opt=None): raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc) + page_list = get_page_tokens(doc, tokenizer_model=opt.tokenizer_model ,use_gpu=opt.use_gpu, + do_ocr=opt.do_ocr, do_table_structure=opt.do_table_structure, + do_cell_matching=opt.do_cell_matching) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) diff --git a/pageindex/schemas/schemas.py b/pageindex/schemas/schemas.py new file mode 100644 index 000000000..464fe11d0 --- /dev/null +++ b/pageindex/schemas/schemas.py @@ -0,0 +1,90 @@ +from pydantic import BaseModel, Field +from typing import Optional, List, Literal + + +# Response schemas for simple yes/no checks with thinking +class TitleAppearanceResponse(BaseModel): + """Response for checking if a title appears in a page.""" + thinking: str = Field(description="Explanation of why the section appears or starts in the page_text") + answer: Literal["yes", "no"] = Field(description="Whether the section appears or starts in the page_text") + + +class TitleAppearanceInStartResponse(BaseModel): + """Response for checking if a title appears at the start of a page.""" + thinking: str = Field(description="Explanation of why the section starts at the beginning") + start_begin: Literal["yes", "no"] = Field(description="Whether the section starts at the beginning of the page") + + +class TocDetectedResponse(BaseModel): + """Response for detecting table of contents in text.""" + thinking: str = Field(description="Explanation of why there is or isn't a table of contents") + toc_detected: Literal["yes", "no"] = Field(description="Whether a table of contents was detected") + + +class CompletedResponse(BaseModel): + """Response for checking if extraction/transformation is complete.""" + thinking: str = Field(description="Explanation of why the operation is complete or not") + completed: Literal["yes", "no"] = Field(description="Whether the operation is complete") + + +class PageIndexGivenResponse(BaseModel): + """Response for detecting if page numbers are given in TOC.""" + thinking: str = Field(description="Explanation of whether page numbers are present") + page_index_given_in_toc: Literal["yes", "no"] = Field(description="Whether page indices are given in the table of contents") + + +# TOC item schemas +class TocItem(BaseModel): + """Table of contents item with physical index.""" + structure: Optional[str] = Field(None, description="Structure index like '1', '1.1', '1.2.1', etc.") + title: str = Field(description="Title of the section") + physical_index: Optional[str] = Field(None, description="Physical index in format '<physical_index_X>' or null") + + +class TocWithPage(BaseModel): + """Table of contents item with page number.""" + structure: Optional[str] = Field(None, description="Structure index like '1', '1.1', '1.2.1', etc.") + title: str = Field(description="Title of the section") + page: Optional[int] = Field(None, description="Page number as integer or null") + + +class TableOfContents(BaseModel): + """Root object for table of contents with page numbers.""" + table_of_contents: List[TocWithPage] = Field(description="List of table of contents items") + + +class TocItemWithStart(BaseModel): + """Table of contents item with start indicator.""" + structure: Optional[str] = Field(None, description="Structure index like '1', '1.1', '1.2.1', etc.") + title: str = Field(description="Title of the section") + start: Literal["yes", "no"] = Field(description="Whether the section starts in the given document part") + physical_index: Optional[str] = Field(None, description="Physical index in format '<physical_index_X>' or null") + + +class PhysicalIndexResponse(BaseModel): + """Response with physical index location.""" + thinking: str = Field(description="Explanation of which page contains the start of the section") + physical_index: str = Field(description="Physical index in format '<physical_index_X>'") + + +# List response types +class TocItemList(BaseModel): + """List of table of contents items.""" + items: List[TocItem] = Field(description="List of TOC items") + + def __iter__(self): + return iter(self.items) + + def __getitem__(self, index): + return self.items[index] + + +class TocItemWithStartList(BaseModel): + """List of table of contents items with start indicators.""" + items: List[TocItemWithStart] = Field(description="List of TOC items with start indicators") + + def __iter__(self): + return iter(self.items) + + def __getitem__(self, index): + return self.items[index] diff --git a/pageindex/utils.py b/pageindex/utils.py index dc7acd888..aec977d4e 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -1,4 +1,5 @@ import tiktoken +import re import openai import logging import os @@ -16,19 +17,187 @@ import yaml from pathlib import Path from types import SimpleNamespace as config +from pathlib import Path + +from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + TableStructureOptions, +) +from docling.datamodel.settings import settings +from docling.document_converter import DocumentConverter, PdfFormatOption + CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") +BASE_URL = os.getenv("BASE_URL","https://api.openai.com/v1/chat/completions") + +class PDFReader: + def __init__(self, pdf_path, parser="docling", use_gpu=False, num_threads=8, do_ocr=True, do_table_structure=True, do_cell_matching=True): + self.pdf_path = pdf_path + self.parser = parser + self.options = { + "use_gpu": use_gpu, + "num_threads": num_threads, + "do_ocr": do_ocr, + "do_table_structure": do_table_structure, + "do_cell_matching": do_cell_matching + } + self._doc_content = None # Generic content holder + self._pages_list = None + self._full_text = None + + def _load_docling(self): + try: + if self.options["use_gpu"]: + print("Using GPU") + device = AcceleratorDevice.GPU + else: + print("Using CPU") + device = AcceleratorDevice.CPU + accelerator_options = AcceleratorOptions( + num_threads=self.options["num_threads"], + device=device + ) + + pipeline_options = PdfPipelineOptions() + pipeline_options.accelerator_options = accelerator_options + pipeline_options.do_ocr = self.options["do_ocr"] + pipeline_options.do_table_structure = self.options["do_table_structure"] + pipeline_options.table_structure_options = TableStructureOptions( + do_cell_matching=self.options["do_cell_matching"] + ) + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + + settings.debug.profile_pipeline_timings = True + + # Convert the document + conversion_result = converter.convert(self.pdf_path) + doc = conversion_result.document + + # Helper to get pages + self._full_text = doc.export_to_markdown() + self._pages_list = doc.export_to_markdown(page_break_placeholder="<!-- PAGE_BREAK -->").split("<!-- PAGE_BREAK -->") + + doc_conversion_secs = conversion_result.timings["pipeline_total"].times + print(f"Conversion secs: {doc_conversion_secs}") + + except Exception as e: + print(f"Error extracting text from PDF with Docling: {e}") + raise e -def count_tokens(text, model=None): + def _load_pypdf2(self): + try: + pdf_reader = PyPDF2.PdfReader(self.pdf_path) + num_pages = len(pdf_reader.pages) + page_texts = [] + for page_num in range(num_pages): + page = pdf_reader.pages[page_num] + page_text = page.extract_text() + if page_text: + page_texts.append(page_text) + else: + page_texts.append("") + self._pages_list = page_texts + self._full_text = "\n".join(page_texts) + except Exception as e: + print(f"Error extracting text from PDF with PyPDF2: {e}") + raise e + + def _load_pymupdf(self): + try: + if isinstance(self.pdf_path, BytesIO): + doc = pymupdf.open(stream=self.pdf_path, filetype="pdf") + else: + doc = pymupdf.open(self.pdf_path) + + num_pages = len(doc) + page_texts = [] + for page_num in range(num_pages): + page = doc.load_page(page_num) + page_text = page.get_text() + if page_text: + page_texts.append(page_text) + else: + page_texts.append("") + doc.close() + self._pages_list = page_texts + self._full_text = "\n".join(page_texts) + except Exception as e: + print(f"Error extracting text from PDF with PyMuPDF: {e}") + raise e + + def load(self): + if self._full_text is not None and self._pages_list is not None: + return self + + if self.parser == "docling": + print("Using Docling") + self._load_docling() + elif self.parser == "PyMuPDF": + print("Using PyMuPDF") + self._load_pymupdf() + elif self.parser == "PyPDF2": + print("Using PyPDF2") + self._load_pypdf2() + else: + raise ValueError(f"Unsupported PDF parser: {self.parser}. Use 'docling', 'PyMuPDF', or 'PyPDF2'") + return self + + def export_to_markdown(self): + self.load() + return self._full_text + + def get_pages(self): + self.load() + return self._pages_list + + +def extract_text_from_pdf(input_doc_path, pdf_parser="docling", use_gpu=False, num_threads=8, do_ocr=True, do_table_structure=True, do_cell_matching=True): + reader = PDFReader( + input_doc_path, + parser=pdf_parser, + use_gpu=use_gpu, + num_threads=num_threads, + do_ocr=do_ocr, + do_table_structure=do_table_structure, + do_cell_matching=do_cell_matching + ) + return reader.export_to_markdown() + +def read_pdf(input_doc_path, pdf_parser="docling", output_format="full", use_gpu=False, num_threads=8, do_ocr=True, do_table_structure=True, do_cell_matching=True): + reader = PDFReader( + input_doc_path, + parser=pdf_parser, + use_gpu=use_gpu, + num_threads=num_threads, + do_ocr=do_ocr, + do_table_structure=do_table_structure, + do_cell_matching=do_cell_matching + ) + + if output_format == "pages": + return reader.get_pages() + else: + return reader.export_to_markdown() + +def count_tokens(text, tokenizer_model="o200k_base"): if not text: return 0 - enc = tiktoken.encoding_for_model(model) + enc = tiktoken.get_encoding(tokenizer_model) tokens = enc.encode(text) return len(tokens) -def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): +def ChatGPT_API_with_finish_reason(model, prompt, response_format=None, api_key=CHATGPT_API_KEY, base_url=BASE_URL, chat_history=None): max_retries = 10 - client = openai.OpenAI(api_key=api_key) + client = openai.OpenAI(api_key=api_key, base_url=base_url) for i in range(max_retries): try: if chat_history: @@ -37,15 +206,31 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_ else: messages = [{"role": "user", "content": prompt}] - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=0, - ) - if response.choices[0].finish_reason == "length": - return response.choices[0].message.content, "max_output_reached" + if response_format: + # Use parse() for Pydantic models + response = client.beta.chat.completions.parse( + model=model, + messages=messages, + temperature=0, + response_format=response_format, + ) + if response.choices[0].finish_reason == "length": + finish_reason = "max_output_reached" + else: + finish_reason = "finished" + return response.choices[0].message.parsed, finish_reason else: - return response.choices[0].message.content, "finished" + # Use create() for regular text responses + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=0, + ) + if response.choices[0].finish_reason == "length": + finish_reason = "max_output_reached" + else: + finish_reason = "finished" + return response.choices[0].message.content, finish_reason except Exception as e: print('************* Retrying *************') @@ -54,13 +239,13 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_ time.sleep(1) # Wait for 1秒 before retrying else: logging.error('Max retries reached for prompt: ' + prompt) - return "Error" + return "Error", "error" -def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): +def ChatGPT_API(model, prompt, response_format=None, api_key=CHATGPT_API_KEY, base_url=BASE_URL, chat_history=None): max_retries = 10 - client = openai.OpenAI(api_key=api_key) + client = openai.OpenAI(api_key=api_key, base_url=base_url) for i in range(max_retries): try: if chat_history: @@ -69,13 +254,23 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): else: messages = [{"role": "user", "content": prompt}] - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=0, - ) - - return response.choices[0].message.content + if response_format: + # Use parse() for Pydantic models + response = client.beta.chat.completions.parse( + model=model, + messages=messages, + temperature=0, + response_format=response_format, + ) + return response.choices[0].message.parsed + else: + # Use create() for regular text responses + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=0, + ) + return response.choices[0].message.content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") @@ -86,18 +281,29 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None): return "Error" -async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY): +async def ChatGPT_API_async(model, prompt, response_format=None, api_key=CHATGPT_API_KEY, base_url=BASE_URL): max_retries = 10 messages = [{"role": "user", "content": prompt}] for i in range(max_retries): try: - async with openai.AsyncOpenAI(api_key=api_key) as client: - response = await client.chat.completions.create( - model=model, - messages=messages, - temperature=0, - ) - return response.choices[0].message.content + async with openai.AsyncOpenAI(api_key=api_key, base_url=base_url) as client: + if response_format: + # Use parse() for Pydantic models + response = await client.beta.chat.completions.parse( + model=model, + messages=messages, + temperature=0, + response_format=response_format, + ) + return response.choices[0].message.parsed + else: + # Use create() for regular text responses + response = await client.chat.completions.create( + model=model, + messages=messages, + temperature=0, + ) + return response.choices[0].message.content except Exception as e: print('************* Retrying *************') logging.error(f"Error: {e}") @@ -244,31 +450,39 @@ def get_last_node(structure): return structure[-1] -def extract_text_from_pdf(pdf_path): - pdf_reader = PyPDF2.PdfReader(pdf_path) - ###return text not list - text="" - for page_num in range(len(pdf_reader.pages)): - page = pdf_reader.pages[page_num] - text+=page.extract_text() - return text - def get_pdf_title(pdf_path): pdf_reader = PyPDF2.PdfReader(pdf_path) meta = pdf_reader.metadata title = meta.title if meta and meta.title else 'Untitled' return title -def get_text_of_pages(pdf_path, start_page, end_page, tag=True): - pdf_reader = PyPDF2.PdfReader(pdf_path) +def get_text_of_pages(pdf_path, start_page, end_page, tag=True, pdf_parser="PyPDF2", use_gpu=False, num_threads=8, do_ocr=True, do_table_structure=True): + reader = PDFReader( + pdf_path, + parser=pdf_parser, + use_gpu=use_gpu, + num_threads=num_threads, + do_ocr=do_ocr, + do_table_structure=do_table_structure + ) + pages = reader.get_pages() + text = "" - for page_num in range(start_page-1, end_page): - page = pdf_reader.pages[page_num] - page_text = page.extract_text() + # Adjust for 0-based indexing and handle page limits + num_pages = len(pages) + + # Ensure start_page is at least 1 + start_idx = max(0, start_page - 1) + # Ensure end_page does not exceed number of pages + end_idx = min(num_pages, end_page) + + for i in range(start_idx, end_idx): + page_text = pages[i] if tag: - text += f"<start_index_{page_num+1}>\n{page_text}\n<end_index_{page_num+1}>\n" + text += f"<start_index_{i+1}>\n{page_text}\n<end_index_{i+1}>\n" else: text += page_text + return text def get_first_start_page_from_text(text): @@ -410,31 +624,24 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): - enc = tiktoken.encoding_for_model(model) - if pdf_parser == "PyPDF2": - pdf_reader = PyPDF2.PdfReader(pdf_path) - page_list = [] - for page_num in range(len(pdf_reader.pages)): - page = pdf_reader.pages[page_num] - page_text = page.extract_text() - token_length = len(enc.encode(page_text)) - page_list.append((page_text, token_length)) - return page_list - elif pdf_parser == "PyMuPDF": - if isinstance(pdf_path, BytesIO): - pdf_stream = pdf_path - doc = pymupdf.open(stream=pdf_stream, filetype="pdf") - elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"): - doc = pymupdf.open(pdf_path) - page_list = [] - for page in doc: - page_text = page.get_text() - token_length = len(enc.encode(page_text)) - page_list.append((page_text, token_length)) - return page_list - else: - raise ValueError(f"Unsupported PDF parser: {pdf_parser}") +def get_page_tokens(pdf_path, tokenizer_model="o200k_base", pdf_parser="docling", use_gpu=False, do_ocr=True, do_table_structure=True, do_cell_matching=True): + enc = tiktoken.get_encoding(tokenizer_model) + + reader = PDFReader( + pdf_path, + parser=pdf_parser, + use_gpu=use_gpu, + do_ocr=do_ocr, + do_table_structure=do_table_structure, + do_cell_matching=do_cell_matching + ) + pages = reader.get_pages() + + page_list = [] + for page_text in pages: + token_length = len(enc.encode(page_text)) + page_list.append((page_text, token_length)) + return page_list @@ -548,15 +755,23 @@ def convert_physical_index_to_int(data): # Check if item is a dictionary and has 'physical_index' key if isinstance(data[i], dict) and 'physical_index' in data[i]: if isinstance(data[i]['physical_index'], str): - if data[i]['physical_index'].startswith('<physical_index_'): - data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip()) - elif data[i]['physical_index'].startswith('physical_index_'): - data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip()) + try: + if data[i]['physical_index'].startswith('<physical_index_'): + data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip()) + elif data[i]['physical_index'].startswith('physical_index_'): + data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip()) + except ValueError: + # Keep original value if conversion fails (e.g., Roman numerals like 'ii') + pass elif isinstance(data, str): - if data.startswith('<physical_index_'): - data = int(data.split('_')[-1].rstrip('>').strip()) - elif data.startswith('physical_index_'): - data = int(data.split('_')[-1].strip()) + try: + if data.startswith('<physical_index_'): + data = int(data.split('_')[-1].rstrip('>').strip()) + elif data.startswith('physical_index_'): + data = int(data.split('_')[-1].strip()) + except ValueError: + # Return None if conversion fails (e.g., Roman numerals like 'ii') + return None # Check data is int if isinstance(data, int): return data diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..b8f0345b4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "PageIndex" +version = "0.1.0" +description = """ +A vectorless, reasoning-based RAG system that builds a hierarchical tree index from long documents +and uses LLMs to reason over that index for agentic, context-aware retrieval. +It simulates how human experts navigate and extract knowledge from complex documents through tree search, +enabling LLMs to think and reason their way to the most relevant document sections. +""" +readme = "README.md" +requires-python = ">=3.10" +license = { file = "LICENSE" } + +dependencies = [ + "openai==1.101.0", + "pymupdf==1.26.4", + "PyPDF2==3.0.1", + "python-dotenv==1.1.0", + "tiktoken==0.11.0", + "pyyaml==6.0.2", + "docling==2.72.0", + "pydantic==2.12.5", + "ruff==0.15.2", +] + +[project.urls] +Homepage = "https://github.com/VectifyAI/PageIndex" + +# ----------------------------- +# setuptools package discovery +# ----------------------------- +[tool.setuptools.packages.find] +where = ["pageindex"] +include = ["*", "*.*"] + +# ----------------------------- +# Ruff configuration +# ----------------------------- +[tool.ruff] +line-length = 88 +exclude = ["__pycache__", "build", "dist", ".venv", "venv"] +select = ["E", "F", "W", "I"] +ignore = ["E501"] + +[tool.ruff.per-file-ignores] +"tests/*" = ["D", "S101"] diff --git a/requirements.txt b/requirements.txt index 463db58f1..c393897fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ PyPDF2==3.0.1 python-dotenv==1.1.0 tiktoken==0.11.0 pyyaml==6.0.2 +docling==2.72.0 +pydantic==2.12.5 \ No newline at end of file diff --git a/run_pageindex.py b/run_pageindex.py index 107024505..c5a2847ff 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -51,17 +51,24 @@ raise ValueError(f"PDF file not found: {args.pdf_path}") # Process PDF file - # Configure options - opt = config( - model=args.model, - toc_check_page_num=args.toc_check_pages, - max_page_num_each_node=args.max_pages_per_node, - max_token_num_each_node=args.max_tokens_per_node, - if_add_node_id=args.if_add_node_id, - if_add_node_summary=args.if_add_node_summary, - if_add_doc_description=args.if_add_doc_description, - if_add_node_text=args.if_add_node_text - ) + # Use ConfigLoader to get consistent defaults (matching markdown behavior) + from pageindex.utils import ConfigLoader + config_loader = ConfigLoader() + + # Create options dict with user args + user_opt = { + 'model': args.model, + 'toc_check_page_num': args.toc_check_pages, + 'max_page_num_each_node': args.max_pages_per_node, + 'max_token_num_each_node': args.max_tokens_per_node, + 'if_add_node_id': args.if_add_node_id, + 'if_add_node_summary': args.if_add_node_summary, + 'if_add_doc_description': args.if_add_doc_description, + 'if_add_node_text': args.if_add_node_text + } + + # Load config with defaults from config.yaml + opt = config_loader.load(user_opt) # Process the PDF toc_with_page_number = page_index_main(args.pdf_path, opt) diff --git a/tests/pdfs/paper_5-1.pdf b/tests/pdfs/paper_5-1.pdf new file mode 100644 index 000000000..886da6ed1 Binary files /dev/null and b/tests/pdfs/paper_5-1.pdf differ