Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def transform_dots_to_colon(text):

def toc_index_extractor(toc, content, model=None):
print('start toc_index_extractor')
tob_extractor_prompt = """
toc_extractor_prompt = """
You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.

The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
Expand All @@ -260,7 +260,7 @@ def toc_index_extractor(toc, content, model=None):
If the section is not in the provided pages, do not add the physical_index to it.
Directly return the final JSON structure. Do not output anything else."""

prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content
Expand Down Expand Up @@ -730,7 +730,7 @@ def check_toc(page_list, opt=None):

################### fix incorrect toc #########################################################
def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
tob_extractor_prompt = """
toc_extractor_prompt = """
You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.

The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
Expand All @@ -742,7 +742,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
}
Directly return the final JSON structure. Do not output anything else."""

prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])
Expand Down