diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 882fb5dea..39018c4df 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -239,7 +239,7 @@ def transform_dots_to_colon(text): def toc_index_extractor(toc, content, model=None): print('start toc_index_extractor') - tob_extractor_prompt = """ + toc_extractor_prompt = """ You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format. The provided pages contains tags like and to indicate the physical location of the page X. @@ -260,7 +260,7 @@ def toc_index_extractor(toc, content, model=None): If the section is not in the provided pages, do not add the physical_index to it. Directly return the final JSON structure. Do not output anything else.""" - prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content + prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return json_content @@ -730,7 +730,7 @@ def check_toc(page_list, opt=None): ################### fix incorrect toc ######################################################### def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"): - tob_extractor_prompt = """ + toc_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. The provided pages contains tags like and to indicate the physical location of the page X. @@ -742,7 +742,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20 } Directly return the final JSON structure. Do not output anything else.""" - prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content + prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content response = ChatGPT_API(model=model, prompt=prompt) json_content = extract_json(response) return convert_physical_index_to_int(json_content['physical_index'])