From f2dea03aba7dbd67383dc2e3870d5678ea709bb7 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Wed, 24 Sep 2025 14:40:44 +0800 Subject: [PATCH 01/14] save work 0.2 --- rag/app/local_edit.py | 111 +++++++ rag/app/naive.py | 16 +- run.sh | 6 + toc_detection_result.txt | 113 +++++++ toc_index_result.txt | 644 +++++++++++++++++++++++++++++++++++++ toc_transformer_result.txt | 375 +++++++++++++++++++++ 6 files changed, 1259 insertions(+), 6 deletions(-) create mode 100644 rag/app/local_edit.py create mode 100755 run.sh create mode 100644 toc_detection_result.txt create mode 100644 toc_index_result.txt create mode 100644 toc_transformer_result.txt diff --git a/rag/app/local_edit.py b/rag/app/local_edit.py new file mode 100644 index 00000000000..296ebb8f6ca --- /dev/null +++ b/rag/app/local_edit.py @@ -0,0 +1,111 @@ +import json +from typing import List, Tuple +from rag.app.naive import chunk +from rag.prompts.prompts import toc_transformer, table_of_contents_index +from rag.nlp import num_tokens_from_string +from rag.prompts.prompts import detect_table_of_contents +from api.db import LLMType +from api.db.services.llm_service import LLMBundle +from api.db.services.tenant_llm_service import TenantLLMService +from api.db.services.user_service import TenantService + +if __name__ == "__main__": + import sys + + from api import settings + if settings.FACTORY_LLM_INFOS is None: + print("Fixing FACTORY_LLM_INFOS initialization...") + settings.init_settings() # 重新初始化设置 + + def dummy(prog=None, msg=""): + pass + tenant_id = "10b8ea16937911f09ae613abffb949cc" # 从数据库查询到的用户ID + + results, tables, figures = chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy, tenant_id=tenant_id) + sections, section_images, page_1024, tc_arr = [], [], [""], [0] + + # 修复:results是元组列表,不是字典列表 + for text, image in results or []: + tc = num_tokens_from_string(text) + page_1024[-1] += "\n" + text + tc_arr[-1] += tc + if tc_arr[-1] > 1024: + page_1024.append("") + tc_arr.append(0) + import sys + 
from api import settings + def dummy(prog=None, msg=""): + pass + + def process_toc_full(pdf_path, tenant_id): + if settings.FACTORY_LLM_INFOS is None: + print("Fixing FACTORY_LLM_INFOS initialization...") + settings.init_settings() + results, tables, figures = chunk(pdf_path, from_page=0, to_page=10, callback=dummy, tenant_id=tenant_id) + sections, section_images, page_1024, tc_arr = [], [], [""], [0] + for text, image in results or []: + tc = num_tokens_from_string(text) + page_1024[-1] += "\n" + text + tc_arr[-1] += tc + if tc_arr[-1] > 1024: + page_1024.append("") + tc_arr.append(0) + sections.append((text, "")) + section_images.append(image) + chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", lang="Chinese") + toc_secs = detect_table_of_contents(page_1024, chat_mdl) + with open("toc_detection_result.txt", "w", encoding="utf-8") as f: + f.write("=== TOC Detection Results ===\n") + f.write(f"Found {len(toc_secs)} TOC sections\n\n") + for i, sec in enumerate(toc_secs): + f.write(f"--- Section {i+1} ---\n") + f.write(sec) + f.write("\n\n") + print(f"✅ TOC detection results saved to toc_detection_result.txt ({len(toc_secs)} sections)") + if toc_secs: + toc_arr = toc_transformer(toc_secs, chat_mdl) + with open("toc_transformer_result.txt", "w", encoding="utf-8") as f: + f.write("=== TOC Transformer Results ===\n") + f.write(json.dumps(toc_arr, ensure_ascii=False, indent=2)) + print(f"✅ TOC transformer results saved to toc_transformer_result.txt ({len(toc_arr)} items)") + toc_arr = [it for it in toc_arr if it.get("structure")] + print(f"📋 Filtered to {len(toc_arr)} items with structure") + toc_arr = table_of_contents_index(toc_arr, [t for t,_ in sections], chat_mdl) + with open("toc_index_result.txt", "w", encoding="utf-8") as f: + f.write("=== TOC Index Results ===\n") + f.write(json.dumps(toc_arr, ensure_ascii=False, indent=2)) + print(f"✅ TOC index results saved to toc_index_result.txt ({len(toc_arr)} items)") + 
print("\n" + "="*50) + print("FINAL TOC STRUCTURE:") + print("="*50) + print(json.dumps(toc_arr, ensure_ascii=False, indent=2), flush=True) + else: + print("❌ No TOC sections detected") + + def process_toc_from_file(tenant_id, sections_path, toc_transformer_path): + if settings.FACTORY_LLM_INFOS is None: + print("Fixing FACTORY_LLM_INFOS initialization...") + settings.init_settings() + # 读取sections + with open(sections_path, "r", encoding="utf-8") as f: + sections = [line.strip() for line in f if line.strip() and not line.startswith("===") and not line.startswith("---")] + # 读取toc_transformer结果 + with open(toc_transformer_path, "r", encoding="utf-8") as f: + toc_arr = json.loads(f.read().split("=== TOC Transformer Results ===\n")[-1]) + chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", lang="Chinese") + toc_arr = [it for it in toc_arr if it.get("structure")] + print(f"📋 Filtered to {len(toc_arr)} items with structure") + toc_arr = table_of_contents_index(toc_arr, sections, chat_mdl) + with open("toc_index_result.txt", "w", encoding="utf-8") as f: + f.write("=== TOC Index Results ===\n") + f.write(json.dumps(toc_arr, ensure_ascii=False, indent=2)) + print(f"✅ TOC index results saved to toc_index_result.txt ({len(toc_arr)} items)") + print("\n" + "="*50) + print("FINAL TOC STRUCTURE:") + print("="*50) + print(json.dumps(toc_arr, ensure_ascii=False, indent=2), flush=True) + + if __name__ == "__main__": + # 示例:只执行第三步,前两步结果从本地文件读取 + tenant_id = "10b8ea16937911f09ae613abffb949cc" + process_toc_from_file(tenant_id, "toc_detection_result.txt", "toc_transformer_result.txt") \ No newline at end of file diff --git a/rag/app/naive.py b/rag/app/naive.py index 9265ae7768a..6a8fdd1779b 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -470,9 +470,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tables.extend(figures) else: sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, 
to_page=to_page, callback=callback) - - res = tokenize_table(tables, doc, is_english) - callback(0.8, "Finish parsing.") + figures = None + + return sections, tables, figures + # res = tokenize_table(tables, doc, is_english) + # callback(0.8, "Finish parsing.") else: if layout_recognizer == "Plain Text": @@ -483,8 +485,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) - res = tokenize_table(tables, doc, is_english) - callback(0.8, "Finish parsing.") + return sections, tables, figures if figures else None + # res = tokenize_table(tables, doc, is_english) + # callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") @@ -600,4 +603,5 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) + sections, tables, figures = chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) + print(sections) \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100755 index 00000000000..185471227bb --- /dev/null +++ b/run.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -e +cd "$(dirname "$0")" +# 若有虚拟环境就用,没有也不报错 +[ -f .venv/bin/activate ] && source .venv/bin/activate +python -m rag.app.local_edit "${1:-$(xdg-user-dir DESKTOP)/default.pdf}" diff --git a/toc_detection_result.txt b/toc_detection_result.txt new file mode 100644 index 00000000000..0f3403bdc7f --- /dev/null +++ b/toc_detection_result.txt @@ -0,0 +1,113 @@ +=== TOC Detection Results === +Found 2 TOC sections + +--- Section 1 --- + +大脑构造 +自控力 +还在督促自己每天进步一点吗? +还在坚持每天阅读的习惯吗? +还在为找不到自己喜欢的书籍烦恼吗? +那~ +你愿意与我成为书友吗? +国内外当下流行书籍 +各图书销量排行榜书籍 +大量工具书籍 +使我们受益终生的书籍 +海量电子版、纸质版书籍及音频课程 +还有贴心的“学习管家”服务哦! +微信:shuyou055 +认知觉醒:开启自我改变的原动力 +周岭 著 +人民邮电出版社 +北京 +图书在版编目(CIP)数据 +认知觉醒:开启自我改变的原动力/周岭著. 
--北京:人民邮电出版 +社,2020.9 +ISBN 978-7-115-54342-4 +Ⅰ.①认… Ⅱ.①周…Ⅲ.①成功心理-通俗读物 Ⅳ.①B848.4-49 +中国版本图书馆CIP数据核字(2020)第114940号 +◆著  周 岭 +责任编辑 陈素然 +责任印制 周昇亮 +◆人民邮电出版社出版发行 北京市丰台区成寿寺路11号 +电子邮件315@ptpress.com.cn +邮编100164  +网址https://www.ptpress.com.cn +◆开本:720×960 1/16 +印张:17.25  2020年9月第1版 +字数:222千字  2020年9月北京第1次印刷 +定价:59.80元 +读者服务热线:(010)81055522 印装质量热线:(010)81055316 +反盗版热线:(010)81055315 +广告经营许可证:京东市监广登字20170147号 +目录 +封面 +扉页 +版权信息 +自序 开启自我改变的原动力 +上篇 内观自己,摆脱焦虑 +第一章 大脑——一切问题的起源 +第一节 大脑:重新认识你自己 +第二节 焦虑:焦虑的根源 +第三节 耐心:得耐心者得天下 +第二章 潜意识——生命留给我们的彩蛋 +第一节 模糊:人生是一场消除模糊的比赛 +第二节 感性:顶级的成长竟然是“凭感觉” +第三章 元认知——人类的终极能能力 +第一节 元认知:成长慢,是因为你不会“飞” +第二节 自控力:我们生而为人就是为了成为思维舵手 +下篇 外观世界,借力前行 +第四章 专注力——情绪和智慧的交叉地带 +第一节 情绪专注:一招提振你的注意力 +第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄 +第五章 学习力——学习不是一味地努力 +第一节 匹配:舒适区边缘,适用于万物的方法论 +第二节 深度:深度学习,人生为数不多的好出路 +第三节 关联:高手的“暗箱” +第四节 体系:建立个人认知体系其实很简单 +第五节 打卡:莫迷恋打卡,打卡打不出未来 +第六节 反馈:是时候告诉你什么是真正的学习了 + +--- Section 2 --- + +第七节 休息:你没成功,可能是因为太刻苦了 +第六章 行动力——没有行动世界只是个概念 +第一节 清晰:一个观念,重构你的行动力 +第二节 “傻瓜”:这个世界会奖励那些不计得失的“傻瓜” +第三节 行动:“道理都懂,就是不做”怎么破解 +第七章 情绪力——情绪是多角度看问题的智慧 +第一节 智带宽:唯有富足,方能解忧 +第二节 单一视角:你的坏情绪,源于视角单一 +第三节 游戏心态:幸福的人,总是在做另外一件事 +第八章 早冥读写跑,人生五件套——成本最低的成长之道 +第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的 +第二节 冥想:终有一天,你要解锁这条隐藏赛道 +第三节 阅读:如何让自己真正爱上阅读 +第四节 写作:谢谢你,费曼先生 +第五节 运动:灵魂想要走得远,身体必须在路上 +结语 一流的生活不是富有,而是觉知 +后记 共同改变,一起前行 +参考文献 +送给我的女儿 +周子琪 +自序 开启自我改变的原动力 +我们是幸运的一代人,赶上了人类社会迄今为止最大的跨越发展 +期,科技进步,物资丰富,万物互联。我们的寿命变得更长,智商变得 +更高,财富变得更多,而且这些可以通过基因或基金传递给下一代。但 +无论科技多么发达,有一样东西却始终无法直接传递,那就是心智。 +所谓心智,通俗地说,就是我们看待人和事的态度,以及由此做出 +的判断与选择。每一个人来到这个世界时,其人生观、世界观、价值观 +全部都是从零开始的,所有习性、习惯、模式也要从人性的初始状态开 +始发展,你、我、我们的父辈和孩子都是如此,没有人能够直接跨越这 +一阶段。而不少人的初始状态是混沌的,他们天然追求简单、轻松、舒 +适、确定,这种天性支配着他们,成为他们喜怒哀乐的生理起点,然而 +身陷其中的大多数人对此知之甚少。 +我们对自己的无知使自己看起来就像一个“醒着的睡着的人”。我知 +道一个人不可能同时“醒着”和“睡着”,这二者显然是矛盾的,但在指出 +这个逻辑错误之前,你不妨先随我一起看看我们的人生轨迹,或许你会 +同意我这个说法。 +如果不出意外,大多数人都会沿着“求学—工作—婚恋”的路线成 +长,随着生活的惯性一直往前走。年轻的时候,几乎没有人会觉得自己 +的将来能有多差,认定美好的生活会自然到来。不谙世事的我们认为: + diff --git a/toc_index_result.txt b/toc_index_result.txt new file mode 100644 index 00000000000..08b17e41daf --- /dev/null +++ 
b/toc_index_result.txt @@ -0,0 +1,644 @@ +=== TOC Index Results === +[ + { + "structure": "1", + "title": "大脑构造", + "indices": [ + 1 + ] + }, + { + "structure": "2", + "title": "自控力", + "indices": [ + 2 + ] + }, + { + "structure": "3", + "title": "还在督促自己每天进步一点吗?", + "indices": [ + 3 + ] + }, + { + "structure": "4", + "title": "还在坚持每天阅读的习惯吗?", + "indices": [ + 4 + ] + }, + { + "structure": "5", + "title": "还在为找不到自己喜欢的书籍烦恼吗?", + "indices": [ + 5 + ] + }, + { + "structure": "6", + "title": "那~", + "indices": [ + 6 + ] + }, + { + "structure": "7", + "title": "你愿意与我成为书友吗?", + "indices": [ + 7 + ] + }, + { + "structure": "8", + "title": "国内外当下流行书籍", + "indices": [ + 8 + ] + }, + { + "structure": "9", + "title": "各图书销量排行榜书籍", + "indices": [ + 9 + ] + }, + { + "structure": "10", + "title": "大量工具书籍", + "indices": [ + 10 + ] + }, + { + "structure": "11", + "title": "使我们受益终生的书籍", + "indices": [ + 11 + ] + }, + { + "structure": "12", + "title": "海量电子版、纸质版书籍及音频课程", + "indices": [ + 12 + ] + }, + { + "structure": "13", + "title": "还有贴心的“学习管家”服务哦!", + "indices": [ + 13 + ] + }, + { + "structure": "14", + "title": "微信:shuyou055", + "indices": [ + 14 + ] + }, + { + "structure": "15", + "title": "认知觉醒:开启自我改变的原动力", + "indices": [ + 15 + ] + }, + { + "structure": "16", + "title": "周岭 著", + "indices": [ + 16 + ] + }, + { + "structure": "17", + "title": "人民邮电出版社", + "indices": [ + 17 + ] + }, + { + "structure": "18", + "title": "北京", + "indices": [ + 18 + ] + }, + { + "structure": "19", + "title": "图书在版编目(CIP)数据", + "indices": [ + 19 + ] + }, + { + "structure": "20", + "title": "认知觉醒:开启自我改变的原动力/周岭著. 
--北京:人民邮电出版社, 2020.9", + "indices": [] + }, + { + "structure": "21", + "title": "ISBN 978-7-115-54342-4", + "indices": [ + 22 + ] + }, + { + "structure": "22", + "title": "Ⅰ.①认…", + "indices": [ + 23 + ] + }, + { + "structure": "23", + "title": "Ⅱ.①周…", + "indices": [] + }, + { + "structure": "24", + "title": "Ⅲ.①成功心理-通俗读物", + "indices": [ + 23 + ] + }, + { + "structure": "25", + "title": "Ⅳ.①B848.4-49", + "indices": [ + 23 + ] + }, + { + "structure": "26", + "title": "中国版本图书馆CIP数据核字(2020)第114940号", + "indices": [ + 24 + ] + }, + { + "structure": "27", + "title": "◆著  周 岭", + "indices": [ + 25 + ] + }, + { + "structure": "28", + "title": "责任编辑 陈素然", + "indices": [ + 26 + ] + }, + { + "structure": "29", + "title": "责任印制 周昇亮", + "indices": [ + 27 + ] + }, + { + "structure": "30", + "title": "◆人民邮电出版社出版发行 北京市丰台区成寿寺路11号", + "indices": [ + 28 + ] + }, + { + "structure": "31", + "title": "电子邮件315@ptpress.com.cn", + "indices": [ + 29 + ] + }, + { + "structure": "32", + "title": "邮编100164", + "indices": [ + 30 + ] + }, + { + "structure": "33", + "title": "开本:720×960 1/16", + "indices": [] + }, + { + "structure": "34", + "title": "印张:17.25", + "indices": [ + 33 + ] + }, + { + "structure": "35", + "title": "字数:222千字", + "indices": [ + 35 + ] + }, + { + "structure": "36", + "title": "2020年9月第1版", + "indices": [] + }, + { + "structure": "37", + "title": "定价:59.80元", + "indices": [ + 35 + ] + }, + { + "structure": "38", + "title": "读者服务热线:(010)81055522", + "indices": [] + }, + { + "structure": "39", + "title": "印装质量热线:(010)81055316", + "indices": [ + 36 + ] + }, + { + "structure": "40", + "title": "反盗版热线:(010)81055315", + "indices": [ + 37 + ] + }, + { + "structure": "41", + "title": "广告经营许可证:京东市监广登字20170147号", + "indices": [ + 38 + ] + }, + { + "structure": "42", + "title": "目录", + "indices": [ + 39 + ] + }, + { + "structure": "43", + "title": "封面", + "indices": [ + 40 + ] + }, + { + "structure": "44", + "title": "扉页", + "indices": [ + 41 + ] + }, + { + "structure": "45", + 
"title": "版权信息", + "indices": [ + 42 + ] + }, + { + "structure": "46", + "title": "自序 开启自我改变的原动力", + "indices": [ + 43 + ] + }, + { + "structure": "47", + "title": "上篇 内观自己,摆脱焦虑", + "indices": [ + 44 + ] + }, + { + "structure": "48", + "title": "第一章 大脑——一切问题的起源", + "indices": [ + 45 + ] + }, + { + "structure": "49", + "title": "第一节 大脑:重新认识你自己", + "indices": [ + 46 + ] + }, + { + "structure": "50", + "title": "第二节 焦虑:焦虑的根源", + "indices": [ + 47 + ] + }, + { + "structure": "51", + "title": "第三节 耐心:得耐心者得天下", + "indices": [ + 48 + ] + }, + { + "structure": "52", + "title": "第二章 潜意识——生命留给我们的彩蛋", + "indices": [ + 49 + ] + }, + { + "structure": "53", + "title": "第一节 模糊:人生是一场消除模糊的比赛", + "indices": [ + 50 + ] + }, + { + "structure": "54", + "title": "第二节 感性:顶级的成长竟然是“凭感觉”", + "indices": [ + 51 + ] + }, + { + "structure": "55", + "title": "第三章 元认知——人类的终极能能力", + "indices": [ + 52 + ] + }, + { + "structure": "56", + "title": "第一节 元认知:成长慢,是因为你不会“飞”", + "indices": [ + 53 + ] + }, + { + "structure": "57", + "title": "第二节 自控力:我们生而为人就是为了成为思维舵手", + "indices": [ + 54 + ] + }, + { + "structure": "58", + "title": "下篇 外观世界,借力前行", + "indices": [ + 55 + ] + }, + { + "structure": "59", + "title": "第四章 专注力——情绪和智慧的交叉地带", + "indices": [ + 56 + ] + }, + { + "structure": "60", + "title": "第一节 情绪专注:一招提振你的注意力", + "indices": [ + 57 + ] + }, + { + "structure": "61", + "title": "第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄", + "indices": [ + 58 + ] + }, + { + "structure": "62", + "title": "第五章 学习力——学习不是一味地努力", + "indices": [ + 59 + ] + }, + { + "structure": "63", + "title": "第一节 匹配:舒适区边缘,适用于万物的方法论", + "indices": [ + 60 + ] + }, + { + "structure": "64", + "title": "第二节 深度:深度学习,人生为数不多的好出路", + "indices": [ + 61 + ] + }, + { + "structure": "65", + "title": "第三节 关联:高手的“暗箱”", + "indices": [ + 62 + ] + }, + { + "structure": "66", + "title": "第四节 体系:建立个人认知体系其实很简单", + "indices": [ + 63 + ] + }, + { + "structure": "67", + "title": "第五节 打卡:莫迷恋打卡,打卡打不出未来", + "indices": [ + 64 + ] + }, + { + "structure": "68", + "title": "第六节 
反馈:是时候告诉你什么是真正的学习了", + "indices": [ + 65 + ] + }, + { + "structure": "69", + "title": "第七节 休息:你没成功,可能是因为太刻苦了", + "indices": [ + 66 + ] + }, + { + "structure": "70", + "title": "第六章 行动力——没有行动世界只是个概念", + "indices": [ + 67 + ] + }, + { + "structure": "71", + "title": "第一节 清晰:一个观念,重构你的行动力", + "indices": [ + 68 + ] + }, + { + "structure": "72", + "title": "第二节 “傻瓜”:这个世界会奖励那些不计得失的“傻瓜”", + "indices": [ + 69 + ] + }, + { + "structure": "73", + "title": "第三节 行动:“道理都懂,就是不做”怎么破解", + "indices": [ + 70 + ] + }, + { + "structure": "74", + "title": "第七章 情绪力——情绪是多角度看问题的智慧", + "indices": [ + 71 + ] + }, + { + "structure": "75", + "title": "第一节 智带宽:唯有富足,方能解忧", + "indices": [ + 72 + ] + }, + { + "structure": "76", + "title": "第二节 单一视角:你的坏情绪,源于视角单一", + "indices": [ + 73 + ] + }, + { + "structure": "77", + "title": "第三节 游戏心态:幸福的人,总是在做另外一件事", + "indices": [ + 74 + ] + }, + { + "structure": "78", + "title": "第八章 早冥读写跑,人生五件套——成本最低的成长之道", + "indices": [ + 75 + ] + }, + { + "structure": "79", + "title": "第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的", + "indices": [ + 76 + ] + }, + { + "structure": "80", + "title": "第二节 冥想:终有一天,你要解锁这条隐藏赛道", + "indices": [ + 77 + ] + }, + { + "structure": "81", + "title": "第三节 阅读:如何让自己真正爱上阅读", + "indices": [ + 78 + ] + }, + { + "structure": "82", + "title": "第四节 写作:谢谢你,费曼先生", + "indices": [ + 79 + ] + }, + { + "structure": "83", + "title": "第五节 运动:灵魂想要走得远,身体必须在路上", + "indices": [ + 80 + ] + }, + { + "structure": "84", + "title": "结语 一流的生活不是富有,而是觉知", + "indices": [ + 81 + ] + }, + { + "structure": "85", + "title": "后记 共同改变,一起前行", + "indices": [ + 82 + ] + }, + { + "structure": "86", + "title": "参考文献", + "indices": [ + 83 + ] + }, + { + "structure": "87", + "title": "送给我的女儿", + "indices": [ + 84 + ] + }, + { + "structure": "88", + "title": "自序 开启自我改变的原动力", + "indices": [ + 86 + ] + }, + { + "structure": "89", + "title": "我们是幸运的一代人,赶上了人类社会迄今为止最大的跨越发展", + "indices": [ + 87 + ] + }, + { + "structure": "90", + "title": 
"期,科技进步,物资丰富,万物互联。我们的寿命变得更长,智商变得更高,财富变得更多,而且这些可以通过基因或基金传递给下一代。但无论科技多么发达,有一样东西却始终无法直接传递,那就是心智。", + "indices": [ + 89 + ] + }, + { + "structure": "91", + "title": "所谓心智,通俗地说,就是我们看待人和事的态度,以及由此做出的判断与选择。每一个人来到这个世界时,其人生观、世界观、价值观全部都是从零开始的,所有习惯、习惯、模式也要从人性的初始状态 开始发展,你、我、我们的父辈和孩子都是如此,没有人能够直接跨越这 一阶段。而不少人的初始状态是混沌的,他们天然追求简单、轻松、舒适、确定,这种天性支配着他们,成为他们喜怒哀乐的生理起点,然而身陷其中的大多数人对此知之甚少。", + "indices": [ + 91 + ] + }, + { + "structure": "92", + "title": "我们对自己的无知使自己看起来就像一个“醒着的睡着的人”。我明白一个人不可能同时“醒着”和“睡着”,这二者显然是矛盾的,但在指出这个逻辑错误之前,你不妨先随我一起看看我们的人生轨迹,或许你会同意我这个说法。", + "indices": [ + 100 + ] + }, + { + "structure": "93", + "title": "如果不出意外,大多数人都会沿着“求学—工作—婚恋”的路线成长,随着生活的惯性一直往前走。年轻的时候,几乎没有人会觉得自己的将来能有多差,认定美好的生活会自然到来。不谙世事的我们认为:", + "indices": [ + 103 + ] + } +] \ No newline at end of file diff --git a/toc_transformer_result.txt b/toc_transformer_result.txt new file mode 100644 index 00000000000..e36f58e5504 --- /dev/null +++ b/toc_transformer_result.txt @@ -0,0 +1,375 @@ +=== TOC Transformer Results === +[ + { + "structure": "1", + "title": "大脑构造" + }, + { + "structure": "2", + "title": "自控力" + }, + { + "structure": "3", + "title": "还在督促自己每天进步一点吗?" + }, + { + "structure": "4", + "title": "还在坚持每天阅读的习惯吗?" + }, + { + "structure": "5", + "title": "还在为找不到自己喜欢的书籍烦恼吗?" + }, + { + "structure": "6", + "title": "那~" + }, + { + "structure": "7", + "title": "你愿意与我成为书友吗?" + }, + { + "structure": "8", + "title": "国内外当下流行书籍" + }, + { + "structure": "9", + "title": "各图书销量排行榜书籍" + }, + { + "structure": "10", + "title": "大量工具书籍" + }, + { + "structure": "11", + "title": "使我们受益终生的书籍" + }, + { + "structure": "12", + "title": "海量电子版、纸质版书籍及音频课程" + }, + { + "structure": "13", + "title": "还有贴心的“学习管家”服务哦!" 
+ }, + { + "structure": "14", + "title": "微信:shuyou055" + }, + { + "structure": "15", + "title": "认知觉醒:开启自我改变的原动力" + }, + { + "structure": "16", + "title": "周岭 著" + }, + { + "structure": "17", + "title": "人民邮电出版社" + }, + { + "structure": "18", + "title": "北京" + }, + { + "structure": "19", + "title": "图书在版编目(CIP)数据" + }, + { + "structure": "20", + "title": "认知觉醒:开启自我改变的原动力/周岭著. --北京:人民邮电出版社, 2020.9" + }, + { + "structure": "21", + "title": "ISBN 978-7-115-54342-4" + }, + { + "structure": "22", + "title": "Ⅰ.①认…" + }, + { + "structure": "23", + "title": "Ⅱ.①周…" + }, + { + "structure": "24", + "title": "Ⅲ.①成功心理-通俗读物" + }, + { + "structure": "25", + "title": "Ⅳ.①B848.4-49" + }, + { + "structure": "26", + "title": "中国版本图书馆CIP数据核字(2020)第114940号" + }, + { + "structure": "27", + "title": "◆著  周 岭" + }, + { + "structure": "28", + "title": "责任编辑 陈素然" + }, + { + "structure": "29", + "title": "责任印制 周昇亮" + }, + { + "structure": "30", + "title": "◆人民邮电出版社出版发行 北京市丰台区成寿寺路11号" + }, + { + "structure": "31", + "title": "电子邮件315@ptpress.com.cn" + }, + { + "structure": "32", + "title": "邮编100164" + }, + { + "structure": "33", + "title": "开本:720×960 1/16" + }, + { + "structure": "34", + "title": "印张:17.25" + }, + { + "structure": "35", + "title": "字数:222千字" + }, + { + "structure": "36", + "title": "2020年9月第1版" + }, + { + "structure": "37", + "title": "定价:59.80元" + }, + { + "structure": "38", + "title": "读者服务热线:(010)81055522" + }, + { + "structure": "39", + "title": "印装质量热线:(010)81055316" + }, + { + "structure": "40", + "title": "反盗版热线:(010)81055315" + }, + { + "structure": "41", + "title": "广告经营许可证:京东市监广登字20170147号" + }, + { + "structure": "42", + "title": "目录" + }, + { + "structure": "43", + "title": "封面" + }, + { + "structure": "44", + "title": "扉页" + }, + { + "structure": "45", + "title": "版权信息" + }, + { + "structure": "46", + "title": "自序 开启自我改变的原动力" + }, + { + "structure": "47", + "title": "上篇 内观自己,摆脱焦虑" + }, + { + "structure": "48", + "title": "第一章 大脑——一切问题的起源" + }, + { + 
"structure": "49", + "title": "第一节 大脑:重新认识你自己" + }, + { + "structure": "50", + "title": "第二节 焦虑:焦虑的根源" + }, + { + "structure": "51", + "title": "第三节 耐心:得耐心者得天下" + }, + { + "structure": "52", + "title": "第二章 潜意识——生命留给我们的彩蛋" + }, + { + "structure": "53", + "title": "第一节 模糊:人生是一场消除模糊的比赛" + }, + { + "structure": "54", + "title": "第二节 感性:顶级的成长竟然是“凭感觉”" + }, + { + "structure": "55", + "title": "第三章 元认知——人类的终极能能力" + }, + { + "structure": "56", + "title": "第一节 元认知:成长慢,是因为你不会“飞”" + }, + { + "structure": "57", + "title": "第二节 自控力:我们生而为人就是为了成为思维舵手" + }, + { + "structure": "58", + "title": "下篇 外观世界,借力前行" + }, + { + "structure": "59", + "title": "第四章 专注力——情绪和智慧的交叉地带" + }, + { + "structure": "60", + "title": "第一节 情绪专注:一招提振你的注意力" + }, + { + "structure": "61", + "title": "第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄" + }, + { + "structure": "62", + "title": "第五章 学习力——学习不是一味地努力" + }, + { + "structure": "63", + "title": "第一节 匹配:舒适区边缘,适用于万物的方法论" + }, + { + "structure": "64", + "title": "第二节 深度:深度学习,人生为数不多的好出路" + }, + { + "structure": "65", + "title": "第三节 关联:高手的“暗箱”" + }, + { + "structure": "66", + "title": "第四节 体系:建立个人认知体系其实很简单" + }, + { + "structure": "67", + "title": "第五节 打卡:莫迷恋打卡,打卡打不出未来" + }, + { + "structure": "68", + "title": "第六节 反馈:是时候告诉你什么是真正的学习了" + }, + { + "structure": "69", + "title": "第七节 休息:你没成功,可能是因为太刻苦了" + }, + { + "structure": "70", + "title": "第六章 行动力——没有行动世界只是个概念" + }, + { + "structure": "71", + "title": "第一节 清晰:一个观念,重构你的行动力" + }, + { + "structure": "72", + "title": "第二节 “傻瓜”:这个世界会奖励那些不计得失的“傻瓜”" + }, + { + "structure": "73", + "title": "第三节 行动:“道理都懂,就是不做”怎么破解" + }, + { + "structure": "74", + "title": "第七章 情绪力——情绪是多角度看问题的智慧" + }, + { + "structure": "75", + "title": "第一节 智带宽:唯有富足,方能解忧" + }, + { + "structure": "76", + "title": "第二节 单一视角:你的坏情绪,源于视角单一" + }, + { + "structure": "77", + "title": "第三节 游戏心态:幸福的人,总是在做另外一件事" + }, + { + "structure": "78", + "title": "第八章 早冥读写跑,人生五件套——成本最低的成长之道" + }, + { + "structure": "79", + "title": "第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的" + }, + { + "structure": "80", + 
"title": "第二节 冥想:终有一天,你要解锁这条隐藏赛道" + }, + { + "structure": "81", + "title": "第三节 阅读:如何让自己真正爱上阅读" + }, + { + "structure": "82", + "title": "第四节 写作:谢谢你,费曼先生" + }, + { + "structure": "83", + "title": "第五节 运动:灵魂想要走得远,身体必须在路上" + }, + { + "structure": "84", + "title": "结语 一流的生活不是富有,而是觉知" + }, + { + "structure": "85", + "title": "后记 共同改变,一起前行" + }, + { + "structure": "86", + "title": "参考文献" + }, + { + "structure": "87", + "title": "送给我的女儿" + }, + { + "structure": "88", + "title": "自序 开启自我改变的原动力" + }, + { + "structure": "89", + "title": "我们是幸运的一代人,赶上了人类社会迄今为止最大的跨越发展" + }, + { + "structure": "90", + "title": "期,科技进步,物资丰富,万物互联。我们的寿命变得更长,智商变得更高,财富变得更多,而且这些可以通过基因或基金传递给下一代。但无论科技多么发达,有一样东西却始终无法直接传递,那就是心智。" + }, + { + "structure": "91", + "title": "所谓心智,通俗地说,就是我们看待人和事的态度,以及由此做出的判断与选择。每一个人来到这个世界时,其人生观、世界观、价值观全部都是从零开始的,所有习惯、习惯、模式也要从人性的初始状态 开始发展,你、我、我们的父辈和孩子都是如此,没有人能够直接跨越这 一阶段。而不少人的初始状态是混沌的,他们天然追求简单、轻松、舒适、确定,这种天性支配着他们,成为他们喜怒哀乐的生理起点,然而身陷其中的大多数人对此知之甚少。" + }, + { + "structure": "92", + "title": "我们对自己的无知使自己看起来就像一个“醒着的睡着的人”。我明白一个人不可能同时“醒着”和“睡着”,这二者显然是矛盾的,但在指出这个逻辑错误之前,你不妨先随我一起看看我们的人生轨迹,或许你会同意我这个说法。" + }, + { + "structure": "93", + "title": "如果不出意外,大多数人都会沿着“求学—工作—婚恋”的路线成长,随着生活的惯性一直往前走。年轻的时候,几乎没有人会觉得自己的将来能有多差,认定美好的生活会自然到来。不谙世事的我们认为:" + } +] \ No newline at end of file From c8114d9d2bb8498c895586ed16cb895b42829194 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Mon, 29 Sep 2025 09:50:24 +0800 Subject: [PATCH 02/14] update promp NOT FINISHED !! 
--- rag/app/local_edit.py | 111 ----- rag/prompts/assign_toc_levels.md | 54 +++ rag/prompts/toc_detection.md | 126 ++++-- rag/prompts/toc_from_img_system.md | 18 + rag/prompts/toc_from_img_user.md | 10 + toc_detection_result.txt | 113 ----- toc_index_result.txt | 644 ----------------------------- toc_transformer_result.txt | 375 ----------------- 8 files changed, 180 insertions(+), 1271 deletions(-) delete mode 100644 rag/app/local_edit.py create mode 100644 rag/prompts/assign_toc_levels.md create mode 100644 rag/prompts/toc_from_img_system.md create mode 100644 rag/prompts/toc_from_img_user.md delete mode 100644 toc_detection_result.txt delete mode 100644 toc_index_result.txt delete mode 100644 toc_transformer_result.txt diff --git a/rag/app/local_edit.py b/rag/app/local_edit.py deleted file mode 100644 index 296ebb8f6ca..00000000000 --- a/rag/app/local_edit.py +++ /dev/null @@ -1,111 +0,0 @@ -import json -from typing import List, Tuple -from rag.app.naive import chunk -from rag.prompts.prompts import toc_transformer, table_of_contents_index -from rag.nlp import num_tokens_from_string -from rag.prompts.prompts import detect_table_of_contents -from api.db import LLMType -from api.db.services.llm_service import LLMBundle -from api.db.services.tenant_llm_service import TenantLLMService -from api.db.services.user_service import TenantService - -if __name__ == "__main__": - import sys - - from api import settings - if settings.FACTORY_LLM_INFOS is None: - print("Fixing FACTORY_LLM_INFOS initialization...") - settings.init_settings() # 重新初始化设置 - - def dummy(prog=None, msg=""): - pass - tenant_id = "10b8ea16937911f09ae613abffb949cc" # 从数据库查询到的用户ID - - results, tables, figures = chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy, tenant_id=tenant_id) - sections, section_images, page_1024, tc_arr = [], [], [""], [0] - - # 修复:results是元组列表,不是字典列表 - for text, image in results or []: - tc = num_tokens_from_string(text) - page_1024[-1] += "\n" + text - tc_arr[-1] += tc 
- if tc_arr[-1] > 1024: - page_1024.append("") - tc_arr.append(0) - import sys - from api import settings - def dummy(prog=None, msg=""): - pass - - def process_toc_full(pdf_path, tenant_id): - if settings.FACTORY_LLM_INFOS is None: - print("Fixing FACTORY_LLM_INFOS initialization...") - settings.init_settings() - results, tables, figures = chunk(pdf_path, from_page=0, to_page=10, callback=dummy, tenant_id=tenant_id) - sections, section_images, page_1024, tc_arr = [], [], [""], [0] - for text, image in results or []: - tc = num_tokens_from_string(text) - page_1024[-1] += "\n" + text - tc_arr[-1] += tc - if tc_arr[-1] > 1024: - page_1024.append("") - tc_arr.append(0) - sections.append((text, "")) - section_images.append(image) - chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", lang="Chinese") - toc_secs = detect_table_of_contents(page_1024, chat_mdl) - with open("toc_detection_result.txt", "w", encoding="utf-8") as f: - f.write("=== TOC Detection Results ===\n") - f.write(f"Found {len(toc_secs)} TOC sections\n\n") - for i, sec in enumerate(toc_secs): - f.write(f"--- Section {i+1} ---\n") - f.write(sec) - f.write("\n\n") - print(f"✅ TOC detection results saved to toc_detection_result.txt ({len(toc_secs)} sections)") - if toc_secs: - toc_arr = toc_transformer(toc_secs, chat_mdl) - with open("toc_transformer_result.txt", "w", encoding="utf-8") as f: - f.write("=== TOC Transformer Results ===\n") - f.write(json.dumps(toc_arr, ensure_ascii=False, indent=2)) - print(f"✅ TOC transformer results saved to toc_transformer_result.txt ({len(toc_arr)} items)") - toc_arr = [it for it in toc_arr if it.get("structure")] - print(f"📋 Filtered to {len(toc_arr)} items with structure") - toc_arr = table_of_contents_index(toc_arr, [t for t,_ in sections], chat_mdl) - with open("toc_index_result.txt", "w", encoding="utf-8") as f: - f.write("=== TOC Index Results ===\n") - f.write(json.dumps(toc_arr, ensure_ascii=False, indent=2)) - 
print(f"✅ TOC index results saved to toc_index_result.txt ({len(toc_arr)} items)") - print("\n" + "="*50) - print("FINAL TOC STRUCTURE:") - print("="*50) - print(json.dumps(toc_arr, ensure_ascii=False, indent=2), flush=True) - else: - print("❌ No TOC sections detected") - - def process_toc_from_file(tenant_id, sections_path, toc_transformer_path): - if settings.FACTORY_LLM_INFOS is None: - print("Fixing FACTORY_LLM_INFOS initialization...") - settings.init_settings() - # 读取sections - with open(sections_path, "r", encoding="utf-8") as f: - sections = [line.strip() for line in f if line.strip() and not line.startswith("===") and not line.startswith("---")] - # 读取toc_transformer结果 - with open(toc_transformer_path, "r", encoding="utf-8") as f: - toc_arr = json.loads(f.read().split("=== TOC Transformer Results ===\n")[-1]) - chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", lang="Chinese") - toc_arr = [it for it in toc_arr if it.get("structure")] - print(f"📋 Filtered to {len(toc_arr)} items with structure") - toc_arr = table_of_contents_index(toc_arr, sections, chat_mdl) - with open("toc_index_result.txt", "w", encoding="utf-8") as f: - f.write("=== TOC Index Results ===\n") - f.write(json.dumps(toc_arr, ensure_ascii=False, indent=2)) - print(f"✅ TOC index results saved to toc_index_result.txt ({len(toc_arr)} items)") - print("\n" + "="*50) - print("FINAL TOC STRUCTURE:") - print("="*50) - print(json.dumps(toc_arr, ensure_ascii=False, indent=2), flush=True) - - if __name__ == "__main__": - # 示例:只执行第三步,前两步结果从本地文件读取 - tenant_id = "10b8ea16937911f09ae613abffb949cc" - process_toc_from_file(tenant_id, "toc_detection_result.txt", "toc_transformer_result.txt") \ No newline at end of file diff --git a/rag/prompts/assign_toc_levels.md b/rag/prompts/assign_toc_levels.md new file mode 100644 index 00000000000..4df875b28b4 --- /dev/null +++ b/rag/prompts/assign_toc_levels.md @@ -0,0 +1,54 @@ + +You are given a JSON array of TOC items. 
Each item has at least {"title": string} and may include an existing structure. + +Task +- For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc. +- Multiple items may share the same depth (e.g., many 1s, many 2s). +- Do not use dotted numbering (no 1.1/1.2). Use a single digit string per item indicating its depth only. +- Preserve the original item order exactly. Do not insert, delete, or reorder. +- Decide levels yourself to keep a coherent hierarchy. Keep peers at the same depth. + +Output +- Return a valid JSON array only (no extra text). +- Each element must be {"structure": "1|2|3", "title": }. +- title must be the original title string. + +Examples + +Example A (chapters with sections) +Input: +["Chapter 1 Methods", "Section 1 Definition", "Section 2 Process", "Chapter 2 Experiment"] + +Output: +[ + {"structure":"1","title":"Chapter 1 Methods"}, + {"structure":"2","title":"Section 1 Definition"}, + {"structure":"2","title":"Section 2 Process"}, + {"structure":"1","title":"Chapter 2 Experiment"} +] + +Example B (parts with chapters) +Input: +["Part I Theory", "Chapter 1 Basics", "Chapter 2 Methods", "Part II Applications", "Chapter 3 Case Studies"] + +Output: +[ + {"structure":"1","title":"Part I Theory"}, + {"structure":"2","title":"Chapter 1 Basics"}, + {"structure":"2","title":"Chapter 2 Methods"}, + {"structure":"1","title":"Part II Applications"}, + {"structure":"2","title":"Chapter 3 Case Studies"} +] + +Example C (plain headings) +Input: +["Introduction", "Background and Motivation", "Related Work", "Methodology", "Evaluation"] + +Output: +[ + {"structure":"1","title":"Introduction"}, + {"structure":"2","title":"Background and Motivation"}, + {"structure":"2","title":"Related Work"}, + {"structure":"1","title":"Methodology"}, + {"structure":"1","title":"Evaluation"} +] diff --git a/rag/prompts/toc_detection.md b/rag/prompts/toc_detection.md index 29e068a7a4e..1f528d835cf 100644 --- 
a/rag/prompts/toc_detection.md +++ b/rag/prompts/toc_detection.md @@ -1,29 +1,99 @@ -You are an AI assistant designed to analyze text content and detect whether a table of contents (TOC) list exists on the given page. Follow these steps: - -1. **Analyze the Input**: Carefully review the provided text content. -2. **Identify Key Features**: Look for common indicators of a TOC, such as: - - Section titles or headings paired with page numbers. - - Patterns like repeated formatting (e.g., bold/italicized text, dots/dashes between titles and numbers). - - Phrases like "Table of Contents," "Contents," or similar headings. - - Logical grouping of topics/subtopics with sequential page references. -3. **Discern Negative Features**: - - The text contains no numbers, or the numbers present are clearly not page references (e.g., dates, statistical figures, phone numbers, version numbers). - - The text consists of full, descriptive sentences and paragraphs that form a narrative, present arguments, or explain concepts, rather than succinctly listing topics. - - Contains citations with authors, publication years, journal titles, and page ranges (e.g., "Smith, J. (2020). Journal Title, 10(2), 45-67."). - - Lists keywords or terms followed by multiple page numbers, often in alphabetical order. - - Comprises terms followed by their definitions or explanations. - - Labeled with headers like "Appendix A," "Appendix B," etc. - - Contains expressive language thanking individuals or organizations for their support or contributions. -4. **Evaluate Evidence**: Weigh the presence/absence of these features to determine if the content resembles a TOC. -5. **Output Format**: Provide your response in the following JSON structure: - ```json - { - "reasoning": "Step-by-step explanation of your analysis based on the features identified." , - "exists": true/false - } - ``` -6. **DO NOT** output anything else except JSON structure. 
- -**Input text Content ( Text-Only Extraction ):** -{{ page_txt }} +You are an AI assistant designed to analyze whether a given text resembles a Table of Contents (TOC). +Follow these steps explicitly and reason step by step before giving the final answer: +### Step-by-Step Reasoning (CoT) + +1. **Check for TOC Indicators** + - Look for explicit TOC headings such as "Table of Contents", "Contents", "目录". + - If no heading, also consider implicit TOC structures: + - Presence of "Chapter", "第一章", "第X章", "Section", "第一节" etc. + - Consistent hierarchical numbering (1., 1.1, 1.2, … or Chinese numbering). + - Repeated short section titles in a list-like format. + - Page numbers or dotted leaders ("......") strengthen the signal, but are not strictly required. + +2. **Check for Negative Indicators** + - Narrative sentences or long paragraphs rather than short headings. + - Citations (years, authors) dominating the text. + - Acknowledgments, references, definitions, or index-like alphabetical lists. + +3. **Decision** + - If the text is primarily a structured outline of chapters/sections (with or without page numbers), then → `exists=True`. + - Otherwise → `exists=False`. + +--- + +### Example (TOC cases, exists=True) + +**Example 1** + +**Input Text:** +Table of Contents +Chapter 1 Introduction .................. 1 +1.1 Background ........................... 2 +1.2 Research Questions ................... 4 +Chapter 2 Literature Review ............. 10 + +**Expected Output:** +{ + "reasoning": "The text contains a TOC heading, hierarchical numbering, dotted leaders, and page numbers. 
These are clear TOC indicators.", + "exists": true +} + +--- + +**Example 2** + +**Input Text:** +Contents +Part I: Foundations + Chapter 1 Cognitive Science + Chapter 2 Neuroscience Basics +Part II: Applications + Chapter 3 Learning and Memory + Chapter 4 Decision Making + +**Expected Output:** +{ + "reasoning": "The text contains a 'Contents' heading and structured outline with chapters and parts. Even without page numbers, this is clearly a TOC.", + "exists": true +} + +--- + +### Example (Not TOC cases, exists=False) + +**Example 3** + +**Input Text:** +Smith (2020) argues that machine learning has transformed industry practices. +The first AI conference was held in 1956 at Dartmouth. + +**Expected Output:** +{ + "reasoning": "The text is narrative with sentences and citations, not a structured list of chapters or sections. It does not resemble a TOC.", + "exists": false +} + +--- + +**Example 4** + +**Input Text:** +Acknowledgments +I want to thank my colleagues, my family, and my friends for their support. +This book would not have been possible without their help. + +**Expected Output:** +{ + "reasoning": "The text is acknowledgments in narrative form, not a structured list of chapters or sections. It does not resemble a TOC.", + "exists": false +} + +--- + +### Output +Provide the answer in strict JSON: +{ + "reasoning": "<your step-by-step reasoning>", + "exists": true/false +} diff --git a/rag/prompts/toc_from_img_system.md b/rag/prompts/toc_from_img_system.md new file mode 100644 index 00000000000..d7e7f7bb6e4 --- /dev/null +++ b/rag/prompts/toc_from_img_system.md @@ -0,0 +1,18 @@ +You are a Table-of-Contents (TOC) extractor. +- STRICT OUTPUT: Return ONLY a valid JSON array. +- Each element must be {"structure": "0", "title": "<the heading text>"}. +- If page is NOT a TOC, return [{"structure": "0", "title": "-1"}].
+ +Examples: + +Example 1 (valid TOC page): +[ + {"structure": "0", "title": "Introduction"}, + {"structure": "0", "title": "Chapter 1: Basics"}, + {"structure": "0", "title": "Chapter 2: Advanced Topics"} +] + +Example 2 (NOT a TOC page): +[ + {"structure": "0", "title": "-1"} +] \ No newline at end of file diff --git a/rag/prompts/toc_from_img_user.md b/rag/prompts/toc_from_img_user.md new file mode 100644 index 00000000000..51c2e565a47 --- /dev/null +++ b/rag/prompts/toc_from_img_user.md @@ -0,0 +1,10 @@ +[ + { + "type": "image_url", + "image_url": {"url": {{url}}, "detail": "high"} + }, + { + "type": "text", + "text": "Input: one page image at a time. Extract TOC items from this page." + } +] \ No newline at end of file diff --git a/toc_detection_result.txt b/toc_detection_result.txt deleted file mode 100644 index 0f3403bdc7f..00000000000 --- a/toc_detection_result.txt +++ /dev/null @@ -1,113 +0,0 @@ -=== TOC Detection Results === -Found 2 TOC sections - ---- Section 1 --- - -大脑构造 -自控力 -还在督促自己每天进步一点吗? -还在坚持每天阅读的习惯吗? -还在为找不到自己喜欢的书籍烦恼吗? -那~ -你愿意与我成为书友吗? -国内外当下流行书籍 -各图书销量排行榜书籍 -大量工具书籍 -使我们受益终生的书籍 -海量电子版、纸质版书籍及音频课程 -还有贴心的“学习管家”服务哦! -微信:shuyou055 -认知觉醒:开启自我改变的原动力 -周岭 著 -人民邮电出版社 -北京 -图书在版编目(CIP)数据 -认知觉醒:开启自我改变的原动力/周岭著. 
--北京:人民邮电出版 -社,2020.9 -ISBN 978-7-115-54342-4 -Ⅰ.①认… Ⅱ.①周…Ⅲ.①成功心理-通俗读物 Ⅳ.①B848.4-49 -中国版本图书馆CIP数据核字(2020)第114940号 -◆著  周 岭 -责任编辑 陈素然 -责任印制 周昇亮 -◆人民邮电出版社出版发行 北京市丰台区成寿寺路11号 -电子邮件315@ptpress.com.cn -邮编100164  -网址https://www.ptpress.com.cn -◆开本:720×960 1/16 -印张:17.25  2020年9月第1版 -字数:222千字  2020年9月北京第1次印刷 -定价:59.80元 -读者服务热线:(010)81055522 印装质量热线:(010)81055316 -反盗版热线:(010)81055315 -广告经营许可证:京东市监广登字20170147号 -目录 -封面 -扉页 -版权信息 -自序 开启自我改变的原动力 -上篇 内观自己,摆脱焦虑 -第一章 大脑——一切问题的起源 -第一节 大脑:重新认识你自己 -第二节 焦虑:焦虑的根源 -第三节 耐心:得耐心者得天下 -第二章 潜意识——生命留给我们的彩蛋 -第一节 模糊:人生是一场消除模糊的比赛 -第二节 感性:顶级的成长竟然是“凭感觉” -第三章 元认知——人类的终极能能力 -第一节 元认知:成长慢,是因为你不会“飞” -第二节 自控力:我们生而为人就是为了成为思维舵手 -下篇 外观世界,借力前行 -第四章 专注力——情绪和智慧的交叉地带 -第一节 情绪专注:一招提振你的注意力 -第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄 -第五章 学习力——学习不是一味地努力 -第一节 匹配:舒适区边缘,适用于万物的方法论 -第二节 深度:深度学习,人生为数不多的好出路 -第三节 关联:高手的“暗箱” -第四节 体系:建立个人认知体系其实很简单 -第五节 打卡:莫迷恋打卡,打卡打不出未来 -第六节 反馈:是时候告诉你什么是真正的学习了 - ---- Section 2 --- - -第七节 休息:你没成功,可能是因为太刻苦了 -第六章 行动力——没有行动世界只是个概念 -第一节 清晰:一个观念,重构你的行动力 -第二节 “傻瓜”:这个世界会奖励那些不计得失的“傻瓜” -第三节 行动:“道理都懂,就是不做”怎么破解 -第七章 情绪力——情绪是多角度看问题的智慧 -第一节 智带宽:唯有富足,方能解忧 -第二节 单一视角:你的坏情绪,源于视角单一 -第三节 游戏心态:幸福的人,总是在做另外一件事 -第八章 早冥读写跑,人生五件套——成本最低的成长之道 -第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的 -第二节 冥想:终有一天,你要解锁这条隐藏赛道 -第三节 阅读:如何让自己真正爱上阅读 -第四节 写作:谢谢你,费曼先生 -第五节 运动:灵魂想要走得远,身体必须在路上 -结语 一流的生活不是富有,而是觉知 -后记 共同改变,一起前行 -参考文献 -送给我的女儿 -周子琪 -自序 开启自我改变的原动力 -我们是幸运的一代人,赶上了人类社会迄今为止最大的跨越发展 -期,科技进步,物资丰富,万物互联。我们的寿命变得更长,智商变得 -更高,财富变得更多,而且这些可以通过基因或基金传递给下一代。但 -无论科技多么发达,有一样东西却始终无法直接传递,那就是心智。 -所谓心智,通俗地说,就是我们看待人和事的态度,以及由此做出 -的判断与选择。每一个人来到这个世界时,其人生观、世界观、价值观 -全部都是从零开始的,所有习性、习惯、模式也要从人性的初始状态开 -始发展,你、我、我们的父辈和孩子都是如此,没有人能够直接跨越这 -一阶段。而不少人的初始状态是混沌的,他们天然追求简单、轻松、舒 -适、确定,这种天性支配着他们,成为他们喜怒哀乐的生理起点,然而 -身陷其中的大多数人对此知之甚少。 -我们对自己的无知使自己看起来就像一个“醒着的睡着的人”。我知 -道一个人不可能同时“醒着”和“睡着”,这二者显然是矛盾的,但在指出 -这个逻辑错误之前,你不妨先随我一起看看我们的人生轨迹,或许你会 -同意我这个说法。 -如果不出意外,大多数人都会沿着“求学—工作—婚恋”的路线成 -长,随着生活的惯性一直往前走。年轻的时候,几乎没有人会觉得自己 -的将来能有多差,认定美好的生活会自然到来。不谙世事的我们认为: - diff --git a/toc_index_result.txt b/toc_index_result.txt deleted file mode 100644 index 08b17e41daf..00000000000 --- 
a/toc_index_result.txt +++ /dev/null @@ -1,644 +0,0 @@ -=== TOC Index Results === -[ - { - "structure": "1", - "title": "大脑构造", - "indices": [ - 1 - ] - }, - { - "structure": "2", - "title": "自控力", - "indices": [ - 2 - ] - }, - { - "structure": "3", - "title": "还在督促自己每天进步一点吗?", - "indices": [ - 3 - ] - }, - { - "structure": "4", - "title": "还在坚持每天阅读的习惯吗?", - "indices": [ - 4 - ] - }, - { - "structure": "5", - "title": "还在为找不到自己喜欢的书籍烦恼吗?", - "indices": [ - 5 - ] - }, - { - "structure": "6", - "title": "那~", - "indices": [ - 6 - ] - }, - { - "structure": "7", - "title": "你愿意与我成为书友吗?", - "indices": [ - 7 - ] - }, - { - "structure": "8", - "title": "国内外当下流行书籍", - "indices": [ - 8 - ] - }, - { - "structure": "9", - "title": "各图书销量排行榜书籍", - "indices": [ - 9 - ] - }, - { - "structure": "10", - "title": "大量工具书籍", - "indices": [ - 10 - ] - }, - { - "structure": "11", - "title": "使我们受益终生的书籍", - "indices": [ - 11 - ] - }, - { - "structure": "12", - "title": "海量电子版、纸质版书籍及音频课程", - "indices": [ - 12 - ] - }, - { - "structure": "13", - "title": "还有贴心的“学习管家”服务哦!", - "indices": [ - 13 - ] - }, - { - "structure": "14", - "title": "微信:shuyou055", - "indices": [ - 14 - ] - }, - { - "structure": "15", - "title": "认知觉醒:开启自我改变的原动力", - "indices": [ - 15 - ] - }, - { - "structure": "16", - "title": "周岭 著", - "indices": [ - 16 - ] - }, - { - "structure": "17", - "title": "人民邮电出版社", - "indices": [ - 17 - ] - }, - { - "structure": "18", - "title": "北京", - "indices": [ - 18 - ] - }, - { - "structure": "19", - "title": "图书在版编目(CIP)数据", - "indices": [ - 19 - ] - }, - { - "structure": "20", - "title": "认知觉醒:开启自我改变的原动力/周岭著. 
--北京:人民邮电出版社, 2020.9", - "indices": [] - }, - { - "structure": "21", - "title": "ISBN 978-7-115-54342-4", - "indices": [ - 22 - ] - }, - { - "structure": "22", - "title": "Ⅰ.①认…", - "indices": [ - 23 - ] - }, - { - "structure": "23", - "title": "Ⅱ.①周…", - "indices": [] - }, - { - "structure": "24", - "title": "Ⅲ.①成功心理-通俗读物", - "indices": [ - 23 - ] - }, - { - "structure": "25", - "title": "Ⅳ.①B848.4-49", - "indices": [ - 23 - ] - }, - { - "structure": "26", - "title": "中国版本图书馆CIP数据核字(2020)第114940号", - "indices": [ - 24 - ] - }, - { - "structure": "27", - "title": "◆著  周 岭", - "indices": [ - 25 - ] - }, - { - "structure": "28", - "title": "责任编辑 陈素然", - "indices": [ - 26 - ] - }, - { - "structure": "29", - "title": "责任印制 周昇亮", - "indices": [ - 27 - ] - }, - { - "structure": "30", - "title": "◆人民邮电出版社出版发行 北京市丰台区成寿寺路11号", - "indices": [ - 28 - ] - }, - { - "structure": "31", - "title": "电子邮件315@ptpress.com.cn", - "indices": [ - 29 - ] - }, - { - "structure": "32", - "title": "邮编100164", - "indices": [ - 30 - ] - }, - { - "structure": "33", - "title": "开本:720×960 1/16", - "indices": [] - }, - { - "structure": "34", - "title": "印张:17.25", - "indices": [ - 33 - ] - }, - { - "structure": "35", - "title": "字数:222千字", - "indices": [ - 35 - ] - }, - { - "structure": "36", - "title": "2020年9月第1版", - "indices": [] - }, - { - "structure": "37", - "title": "定价:59.80元", - "indices": [ - 35 - ] - }, - { - "structure": "38", - "title": "读者服务热线:(010)81055522", - "indices": [] - }, - { - "structure": "39", - "title": "印装质量热线:(010)81055316", - "indices": [ - 36 - ] - }, - { - "structure": "40", - "title": "反盗版热线:(010)81055315", - "indices": [ - 37 - ] - }, - { - "structure": "41", - "title": "广告经营许可证:京东市监广登字20170147号", - "indices": [ - 38 - ] - }, - { - "structure": "42", - "title": "目录", - "indices": [ - 39 - ] - }, - { - "structure": "43", - "title": "封面", - "indices": [ - 40 - ] - }, - { - "structure": "44", - "title": "扉页", - "indices": [ - 41 - ] - }, - { - "structure": "45", - 
"title": "版权信息", - "indices": [ - 42 - ] - }, - { - "structure": "46", - "title": "自序 开启自我改变的原动力", - "indices": [ - 43 - ] - }, - { - "structure": "47", - "title": "上篇 内观自己,摆脱焦虑", - "indices": [ - 44 - ] - }, - { - "structure": "48", - "title": "第一章 大脑——一切问题的起源", - "indices": [ - 45 - ] - }, - { - "structure": "49", - "title": "第一节 大脑:重新认识你自己", - "indices": [ - 46 - ] - }, - { - "structure": "50", - "title": "第二节 焦虑:焦虑的根源", - "indices": [ - 47 - ] - }, - { - "structure": "51", - "title": "第三节 耐心:得耐心者得天下", - "indices": [ - 48 - ] - }, - { - "structure": "52", - "title": "第二章 潜意识——生命留给我们的彩蛋", - "indices": [ - 49 - ] - }, - { - "structure": "53", - "title": "第一节 模糊:人生是一场消除模糊的比赛", - "indices": [ - 50 - ] - }, - { - "structure": "54", - "title": "第二节 感性:顶级的成长竟然是“凭感觉”", - "indices": [ - 51 - ] - }, - { - "structure": "55", - "title": "第三章 元认知——人类的终极能能力", - "indices": [ - 52 - ] - }, - { - "structure": "56", - "title": "第一节 元认知:成长慢,是因为你不会“飞”", - "indices": [ - 53 - ] - }, - { - "structure": "57", - "title": "第二节 自控力:我们生而为人就是为了成为思维舵手", - "indices": [ - 54 - ] - }, - { - "structure": "58", - "title": "下篇 外观世界,借力前行", - "indices": [ - 55 - ] - }, - { - "structure": "59", - "title": "第四章 专注力——情绪和智慧的交叉地带", - "indices": [ - 56 - ] - }, - { - "structure": "60", - "title": "第一节 情绪专注:一招提振你的注意力", - "indices": [ - 57 - ] - }, - { - "structure": "61", - "title": "第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄", - "indices": [ - 58 - ] - }, - { - "structure": "62", - "title": "第五章 学习力——学习不是一味地努力", - "indices": [ - 59 - ] - }, - { - "structure": "63", - "title": "第一节 匹配:舒适区边缘,适用于万物的方法论", - "indices": [ - 60 - ] - }, - { - "structure": "64", - "title": "第二节 深度:深度学习,人生为数不多的好出路", - "indices": [ - 61 - ] - }, - { - "structure": "65", - "title": "第三节 关联:高手的“暗箱”", - "indices": [ - 62 - ] - }, - { - "structure": "66", - "title": "第四节 体系:建立个人认知体系其实很简单", - "indices": [ - 63 - ] - }, - { - "structure": "67", - "title": "第五节 打卡:莫迷恋打卡,打卡打不出未来", - "indices": [ - 64 - ] - }, - { - "structure": "68", - "title": "第六节 
反馈:是时候告诉你什么是真正的学习了", - "indices": [ - 65 - ] - }, - { - "structure": "69", - "title": "第七节 休息:你没成功,可能是因为太刻苦了", - "indices": [ - 66 - ] - }, - { - "structure": "70", - "title": "第六章 行动力——没有行动世界只是个概念", - "indices": [ - 67 - ] - }, - { - "structure": "71", - "title": "第一节 清晰:一个观念,重构你的行动力", - "indices": [ - 68 - ] - }, - { - "structure": "72", - "title": "第二节 “傻瓜”:这个世界会奖励那些不计得失的“傻瓜”", - "indices": [ - 69 - ] - }, - { - "structure": "73", - "title": "第三节 行动:“道理都懂,就是不做”怎么破解", - "indices": [ - 70 - ] - }, - { - "structure": "74", - "title": "第七章 情绪力——情绪是多角度看问题的智慧", - "indices": [ - 71 - ] - }, - { - "structure": "75", - "title": "第一节 智带宽:唯有富足,方能解忧", - "indices": [ - 72 - ] - }, - { - "structure": "76", - "title": "第二节 单一视角:你的坏情绪,源于视角单一", - "indices": [ - 73 - ] - }, - { - "structure": "77", - "title": "第三节 游戏心态:幸福的人,总是在做另外一件事", - "indices": [ - 74 - ] - }, - { - "structure": "78", - "title": "第八章 早冥读写跑,人生五件套——成本最低的成长之道", - "indices": [ - 75 - ] - }, - { - "structure": "79", - "title": "第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的", - "indices": [ - 76 - ] - }, - { - "structure": "80", - "title": "第二节 冥想:终有一天,你要解锁这条隐藏赛道", - "indices": [ - 77 - ] - }, - { - "structure": "81", - "title": "第三节 阅读:如何让自己真正爱上阅读", - "indices": [ - 78 - ] - }, - { - "structure": "82", - "title": "第四节 写作:谢谢你,费曼先生", - "indices": [ - 79 - ] - }, - { - "structure": "83", - "title": "第五节 运动:灵魂想要走得远,身体必须在路上", - "indices": [ - 80 - ] - }, - { - "structure": "84", - "title": "结语 一流的生活不是富有,而是觉知", - "indices": [ - 81 - ] - }, - { - "structure": "85", - "title": "后记 共同改变,一起前行", - "indices": [ - 82 - ] - }, - { - "structure": "86", - "title": "参考文献", - "indices": [ - 83 - ] - }, - { - "structure": "87", - "title": "送给我的女儿", - "indices": [ - 84 - ] - }, - { - "structure": "88", - "title": "自序 开启自我改变的原动力", - "indices": [ - 86 - ] - }, - { - "structure": "89", - "title": "我们是幸运的一代人,赶上了人类社会迄今为止最大的跨越发展", - "indices": [ - 87 - ] - }, - { - "structure": "90", - "title": 
"期,科技进步,物资丰富,万物互联。我们的寿命变得更长,智商变得更高,财富变得更多,而且这些可以通过基因或基金传递给下一代。但无论科技多么发达,有一样东西却始终无法直接传递,那就是心智。", - "indices": [ - 89 - ] - }, - { - "structure": "91", - "title": "所谓心智,通俗地说,就是我们看待人和事的态度,以及由此做出的判断与选择。每一个人来到这个世界时,其人生观、世界观、价值观全部都是从零开始的,所有习惯、习惯、模式也要从人性的初始状态 开始发展,你、我、我们的父辈和孩子都是如此,没有人能够直接跨越这 一阶段。而不少人的初始状态是混沌的,他们天然追求简单、轻松、舒适、确定,这种天性支配着他们,成为他们喜怒哀乐的生理起点,然而身陷其中的大多数人对此知之甚少。", - "indices": [ - 91 - ] - }, - { - "structure": "92", - "title": "我们对自己的无知使自己看起来就像一个“醒着的睡着的人”。我明白一个人不可能同时“醒着”和“睡着”,这二者显然是矛盾的,但在指出这个逻辑错误之前,你不妨先随我一起看看我们的人生轨迹,或许你会同意我这个说法。", - "indices": [ - 100 - ] - }, - { - "structure": "93", - "title": "如果不出意外,大多数人都会沿着“求学—工作—婚恋”的路线成长,随着生活的惯性一直往前走。年轻的时候,几乎没有人会觉得自己的将来能有多差,认定美好的生活会自然到来。不谙世事的我们认为:", - "indices": [ - 103 - ] - } -] \ No newline at end of file diff --git a/toc_transformer_result.txt b/toc_transformer_result.txt deleted file mode 100644 index e36f58e5504..00000000000 --- a/toc_transformer_result.txt +++ /dev/null @@ -1,375 +0,0 @@ -=== TOC Transformer Results === -[ - { - "structure": "1", - "title": "大脑构造" - }, - { - "structure": "2", - "title": "自控力" - }, - { - "structure": "3", - "title": "还在督促自己每天进步一点吗?" - }, - { - "structure": "4", - "title": "还在坚持每天阅读的习惯吗?" - }, - { - "structure": "5", - "title": "还在为找不到自己喜欢的书籍烦恼吗?" - }, - { - "structure": "6", - "title": "那~" - }, - { - "structure": "7", - "title": "你愿意与我成为书友吗?" - }, - { - "structure": "8", - "title": "国内外当下流行书籍" - }, - { - "structure": "9", - "title": "各图书销量排行榜书籍" - }, - { - "structure": "10", - "title": "大量工具书籍" - }, - { - "structure": "11", - "title": "使我们受益终生的书籍" - }, - { - "structure": "12", - "title": "海量电子版、纸质版书籍及音频课程" - }, - { - "structure": "13", - "title": "还有贴心的“学习管家”服务哦!" 
- }, - { - "structure": "14", - "title": "微信:shuyou055" - }, - { - "structure": "15", - "title": "认知觉醒:开启自我改变的原动力" - }, - { - "structure": "16", - "title": "周岭 著" - }, - { - "structure": "17", - "title": "人民邮电出版社" - }, - { - "structure": "18", - "title": "北京" - }, - { - "structure": "19", - "title": "图书在版编目(CIP)数据" - }, - { - "structure": "20", - "title": "认知觉醒:开启自我改变的原动力/周岭著. --北京:人民邮电出版社, 2020.9" - }, - { - "structure": "21", - "title": "ISBN 978-7-115-54342-4" - }, - { - "structure": "22", - "title": "Ⅰ.①认…" - }, - { - "structure": "23", - "title": "Ⅱ.①周…" - }, - { - "structure": "24", - "title": "Ⅲ.①成功心理-通俗读物" - }, - { - "structure": "25", - "title": "Ⅳ.①B848.4-49" - }, - { - "structure": "26", - "title": "中国版本图书馆CIP数据核字(2020)第114940号" - }, - { - "structure": "27", - "title": "◆著  周 岭" - }, - { - "structure": "28", - "title": "责任编辑 陈素然" - }, - { - "structure": "29", - "title": "责任印制 周昇亮" - }, - { - "structure": "30", - "title": "◆人民邮电出版社出版发行 北京市丰台区成寿寺路11号" - }, - { - "structure": "31", - "title": "电子邮件315@ptpress.com.cn" - }, - { - "structure": "32", - "title": "邮编100164" - }, - { - "structure": "33", - "title": "开本:720×960 1/16" - }, - { - "structure": "34", - "title": "印张:17.25" - }, - { - "structure": "35", - "title": "字数:222千字" - }, - { - "structure": "36", - "title": "2020年9月第1版" - }, - { - "structure": "37", - "title": "定价:59.80元" - }, - { - "structure": "38", - "title": "读者服务热线:(010)81055522" - }, - { - "structure": "39", - "title": "印装质量热线:(010)81055316" - }, - { - "structure": "40", - "title": "反盗版热线:(010)81055315" - }, - { - "structure": "41", - "title": "广告经营许可证:京东市监广登字20170147号" - }, - { - "structure": "42", - "title": "目录" - }, - { - "structure": "43", - "title": "封面" - }, - { - "structure": "44", - "title": "扉页" - }, - { - "structure": "45", - "title": "版权信息" - }, - { - "structure": "46", - "title": "自序 开启自我改变的原动力" - }, - { - "structure": "47", - "title": "上篇 内观自己,摆脱焦虑" - }, - { - "structure": "48", - "title": "第一章 大脑——一切问题的起源" - }, - { - 
"structure": "49", - "title": "第一节 大脑:重新认识你自己" - }, - { - "structure": "50", - "title": "第二节 焦虑:焦虑的根源" - }, - { - "structure": "51", - "title": "第三节 耐心:得耐心者得天下" - }, - { - "structure": "52", - "title": "第二章 潜意识——生命留给我们的彩蛋" - }, - { - "structure": "53", - "title": "第一节 模糊:人生是一场消除模糊的比赛" - }, - { - "structure": "54", - "title": "第二节 感性:顶级的成长竟然是“凭感觉”" - }, - { - "structure": "55", - "title": "第三章 元认知——人类的终极能能力" - }, - { - "structure": "56", - "title": "第一节 元认知:成长慢,是因为你不会“飞”" - }, - { - "structure": "57", - "title": "第二节 自控力:我们生而为人就是为了成为思维舵手" - }, - { - "structure": "58", - "title": "下篇 外观世界,借力前行" - }, - { - "structure": "59", - "title": "第四章 专注力——情绪和智慧的交叉地带" - }, - { - "structure": "60", - "title": "第一节 情绪专注:一招提振你的注意力" - }, - { - "structure": "61", - "title": "第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄" - }, - { - "structure": "62", - "title": "第五章 学习力——学习不是一味地努力" - }, - { - "structure": "63", - "title": "第一节 匹配:舒适区边缘,适用于万物的方法论" - }, - { - "structure": "64", - "title": "第二节 深度:深度学习,人生为数不多的好出路" - }, - { - "structure": "65", - "title": "第三节 关联:高手的“暗箱”" - }, - { - "structure": "66", - "title": "第四节 体系:建立个人认知体系其实很简单" - }, - { - "structure": "67", - "title": "第五节 打卡:莫迷恋打卡,打卡打不出未来" - }, - { - "structure": "68", - "title": "第六节 反馈:是时候告诉你什么是真正的学习了" - }, - { - "structure": "69", - "title": "第七节 休息:你没成功,可能是因为太刻苦了" - }, - { - "structure": "70", - "title": "第六章 行动力——没有行动世界只是个概念" - }, - { - "structure": "71", - "title": "第一节 清晰:一个观念,重构你的行动力" - }, - { - "structure": "72", - "title": "第二节 “傻瓜”:这个世界会奖励那些不计得失的“傻瓜”" - }, - { - "structure": "73", - "title": "第三节 行动:“道理都懂,就是不做”怎么破解" - }, - { - "structure": "74", - "title": "第七章 情绪力——情绪是多角度看问题的智慧" - }, - { - "structure": "75", - "title": "第一节 智带宽:唯有富足,方能解忧" - }, - { - "structure": "76", - "title": "第二节 单一视角:你的坏情绪,源于视角单一" - }, - { - "structure": "77", - "title": "第三节 游戏心态:幸福的人,总是在做另外一件事" - }, - { - "structure": "78", - "title": "第八章 早冥读写跑,人生五件套——成本最低的成长之道" - }, - { - "structure": "79", - "title": "第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的" - }, - { - "structure": "80", - 
"title": "第二节 冥想:终有一天,你要解锁这条隐藏赛道" - }, - { - "structure": "81", - "title": "第三节 阅读:如何让自己真正爱上阅读" - }, - { - "structure": "82", - "title": "第四节 写作:谢谢你,费曼先生" - }, - { - "structure": "83", - "title": "第五节 运动:灵魂想要走得远,身体必须在路上" - }, - { - "structure": "84", - "title": "结语 一流的生活不是富有,而是觉知" - }, - { - "structure": "85", - "title": "后记 共同改变,一起前行" - }, - { - "structure": "86", - "title": "参考文献" - }, - { - "structure": "87", - "title": "送给我的女儿" - }, - { - "structure": "88", - "title": "自序 开启自我改变的原动力" - }, - { - "structure": "89", - "title": "我们是幸运的一代人,赶上了人类社会迄今为止最大的跨越发展" - }, - { - "structure": "90", - "title": "期,科技进步,物资丰富,万物互联。我们的寿命变得更长,智商变得更高,财富变得更多,而且这些可以通过基因或基金传递给下一代。但无论科技多么发达,有一样东西却始终无法直接传递,那就是心智。" - }, - { - "structure": "91", - "title": "所谓心智,通俗地说,就是我们看待人和事的态度,以及由此做出的判断与选择。每一个人来到这个世界时,其人生观、世界观、价值观全部都是从零开始的,所有习惯、习惯、模式也要从人性的初始状态 开始发展,你、我、我们的父辈和孩子都是如此,没有人能够直接跨越这 一阶段。而不少人的初始状态是混沌的,他们天然追求简单、轻松、舒适、确定,这种天性支配着他们,成为他们喜怒哀乐的生理起点,然而身陷其中的大多数人对此知之甚少。" - }, - { - "structure": "92", - "title": "我们对自己的无知使自己看起来就像一个“醒着的睡着的人”。我明白一个人不可能同时“醒着”和“睡着”,这二者显然是矛盾的,但在指出这个逻辑错误之前,你不妨先随我一起看看我们的人生轨迹,或许你会同意我这个说法。" - }, - { - "structure": "93", - "title": "如果不出意外,大多数人都会沿着“求学—工作—婚恋”的路线成长,随着生活的惯性一直往前走。年轻的时候,几乎没有人会觉得自己的将来能有多差,认定美好的生活会自然到来。不谙世事的我们认为:" - } -] \ No newline at end of file From 61ea29ee6acf36d380f1788c3c5a9cc8504b8eb2 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Mon, 29 Sep 2025 10:37:17 +0800 Subject: [PATCH 03/14] fix naive.py --- rag/app/naive.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rag/app/naive.py b/rag/app/naive.py index 6a8fdd1779b..283fd7dcea3 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -472,9 +472,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) figures = None - return sections, tables, figures - # res = tokenize_table(tables, doc, is_english) - # callback(0.8, "Finish parsing.") + res = 
tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") else: if layout_recognizer == "Plain Text": @@ -604,4 +603,4 @@ def dummy(prog=None, msg=""): pass sections, tables, figures = chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) - print(sections) \ No newline at end of file + print(sections) From 3fb8cae61c305caf40f8382897f753b3a03edc0b Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Mon, 29 Sep 2025 10:38:26 +0800 Subject: [PATCH 04/14] Refactor figure handling in naive.py --- rag/app/naive.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/rag/app/naive.py b/rag/app/naive.py index 283fd7dcea3..9265ae7768a 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -470,8 +470,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tables.extend(figures) else: sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) - figures = None - + res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") @@ -484,9 +483,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) - return sections, tables, figures if figures else None - # res = tokenize_table(tables, doc, is_english) - # callback(0.8, "Finish parsing.") + res = tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") @@ -602,5 +600,4 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, def dummy(prog=None, msg=""): pass - sections, tables, figures = chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) - print(sections) + chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) From f25ca388edd0ea7fb984341ab83ec0a575fd07b2 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Mon, 29 Sep 2025 10:38:40 +0800 
Subject: [PATCH 05/14] Delete run.sh --- run.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 run.sh diff --git a/run.sh b/run.sh deleted file mode 100755 index 185471227bb..00000000000 --- a/run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -e -cd "$(dirname "$0")" -# 若有虚拟环境就用,没有也不报错 -[ -f .venv/bin/activate ] && source .venv/bin/activate -python -m rag.app.local_edit "${1:-$(xdg-user-dir DESKTOP)/default.pdf}" From 5ee7b7a7081540c8f5edb6a583e7ed5b951bd658 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Mon, 29 Sep 2025 12:06:54 +0800 Subject: [PATCH 06/14] Update prompts.py --- rag/prompts/prompts.py | 481 +++++++++++++++++++++++++---------------- 1 file changed, 299 insertions(+), 182 deletions(-) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index cc23da6ba6c..bb1499fc153 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -1,4 +1,3 @@ -# # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,14 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import datetime -import json -import logging import re -from copy import deepcopy -from typing import Tuple +import json +import math import jinja2 +import base64 +import logging +import datetime +import unicodedata +import pdfplumber import json_repair +from os import write +from io import BytesIO +from typing import Tuple +from copy import deepcopy +from api.db import LLMType from api.utils import hash_str2int from rag.prompts.prompt_template import load_prompt from rag.settings import TAG_FLD @@ -440,7 +446,7 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: def gen_json(system_prompt:str, user_prompt:str, chat_mdl): - _, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length) + _, msg = message_fit_in(form_message(system_prompt, user_prompt), 1000000) ans = chat_mdl.chat(msg[0]["content"], msg[1:]) ans = re.sub(r"(^.*|```json\n|```\n*$)", "", ans, flags=re.DOTALL) try: @@ -450,203 +456,314 @@ def gen_json(system_prompt:str, user_prompt:str, chat_mdl): TOC_DETECTION = load_prompt("toc_detection") -def detect_table_of_contents(page_1024:list[str], chat_mdl): - toc_secs = [] - for i, sec in enumerate(page_1024[:22]): - ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_DETECTION).render(page_txt=sec), "Only JSON please.", chat_mdl) - if toc_secs and not ans["exists"]: - break - toc_secs.append(sec) - return toc_secs - - -TOC_EXTRACTION = load_prompt("toc_extraction") -TOC_EXTRACTION_CONTINUE = load_prompt("toc_extraction_continue") -def extract_table_of_contents(toc_pages, chat_mdl): - if not toc_pages: - return [] +def detect_table_of_contents(pages:list[str], chat_mdl): + for i, sec in enumerate(pages): + if sec == "": + continue + ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_DETECTION).render(), f"Input:{sec}", chat_mdl) + print(f"TOC detection for page {i}: {ans}") + if ans.get("exists", False): + return i + return -1 + + +TOC_FROM_IMG_SYSTEM = load_prompt("toc_from_img_system") +TOC_FROM_IMG_USER = 
load_prompt("toc_from_img_user") +def gen_toc_from_img(img_url, vision_mdl): + ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_FROM_IMG_SYSTEM).render(), + PROMPT_JINJA_ENV.from_string(TOC_FROM_IMG_USER).render(url=img_url), + vision_mdl) + return ans + +TOC_LEVELS = load_prompt("assign_toc_levels") +def assign_toc_levels(toc_secs, chat_mdl): + ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(), + str(toc_secs), + chat_mdl + ) + + return ans - return gen_json(PROMPT_JINJA_ENV.from_string(TOC_EXTRACTION).render(toc_page="\n".join(toc_pages)), "Only JSON please.", chat_mdl) +def gen_image_from_page(page): + pil_img = page.to_image(resolution=300, antialias=True).original + img_buf = BytesIO() + if pil_img.mode in ("RGBA", "LA"): + pil_img = pil_img.convert("RGB") -def toc_index_extractor(toc:list[dict], content:str, chat_mdl): - tob_extractor_prompt = """ - You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format. + pil_img.save(img_buf, format="JPEG") + b64 = base64.b64encode(img_buf.getvalue()).decode("utf-8") + img_buf.close() - The provided pages contains tags like and to indicate the physical location of the page X. + img_url = f"data:image/jpeg;base64,{b64}" + return img_url - The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc. - The response should be in the following JSON format: - [ +def build_img_toc_messages(url): + return [ { - "structure": (string), - "title": , - "physical_index": "<physical_index_X>" (keep the format) + "role": "system", + "content": """You are a strict Table-of-Contents (TOC) extractor. + +INPUT: +- You will receive one page of a PDF as an image. + +YOUR TASK: +1. 
Determine if this page is a TOC (Table of Contents). + - A TOC page usually has short, list-like headings (e.g. "Chapter 1", "第一章", "Section 2.3"), + often aligned or followed by dots/leaders and page numbers. + - A TOC page contains at least 3 such distinct headings. + - If the page is mostly narrative text, title page, author info, or ads, it is NOT a TOC. + +2. If it IS a TOC page: + - Return ONLY a **valid JSON array**. + - Each element must be an object with two fields: + {"structure": "0", "title": "<the heading text>"} + - "structure" must always be the string "0". + - "title" must be the exact heading text extracted from the TOC (do not invent or summarize). + - Keep the order as it appears on the page. + +3. If it is NOT a TOC page: + - Return ONLY the following JSON: + [ + {"structure": "0", "title": "-1"} + ] + +STRICT RULES: +- Do NOT include explanations, reasoning, or any text outside the JSON. +- Do NOT wrap the output in ```json fences. +- Output must start with `[` and end with `]`. +- Ensure the JSON is syntactically valid (no trailing commas). + +EXAMPLES: + +Example A (TOC page): +[ + {"structure": "0", "title": "Introduction"}, + {"structure": "0", "title": "Chapter 1: Basics"}, + {"structure": "0", "title": "Chapter 2: Advanced Topics"} +] + +Example B (NOT a TOC page): +[ + {"structure": "0", "title": "-1"} +]""" + }, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": url, "detail": "high"}} + ], }, - ... ] - Only add the physical_index to the sections that are in the provided pages. - If the title of the section are not in the provided pages, do not add the physical_index to it. - Directly return the final JSON structure. 
Do not output anything else.""" - - prompt = tob_extractor_prompt + '\nTable of contents:\n' + json.dumps(toc, ensure_ascii=False, indent=2) + '\nDocument pages:\n' + content - return gen_json(prompt, "Only JSON please.", chat_mdl) - - -TOC_INDEX = load_prompt("toc_index") -def table_of_contents_index(toc_arr: list[dict], sections: list[str], chat_mdl): - if not toc_arr or not sections: - return [] - - toc_map = {} - for i, it in enumerate(toc_arr): - k1 = (it["structure"]+it["title"]).replace(" ", "") - k2 = it["title"].strip() - if k1 not in toc_map: - toc_map[k1] = [] - if k2 not in toc_map: - toc_map[k2] = [] - toc_map[k1].append(i) - toc_map[k2].append(i) - - for it in toc_arr: - it["indices"] = [] - for i, sec in enumerate(sections): - sec = sec.strip() - if sec.replace(" ", "") in toc_map: - for j in toc_map[sec.replace(" ", "")]: - toc_arr[j]["indices"].append(i) - - all_pathes = [] - def dfs(start, path): - nonlocal all_pathes - if start >= len(toc_arr): - if path: - all_pathes.append(path) - return - if not toc_arr[start]["indices"]: - dfs(start+1, path) - return - added = False - for j in toc_arr[start]["indices"]: - if path and j < path[-1][0]: + +def gen_toc_from_pdf(filename, empty_pages, start_page, chat_mdl): + toc = [] + with pdfplumber.open(BytesIO(filename)) as pdf: + for i, page in enumerate(prune_pages(pdf.pages)): + if i in empty_pages or i < start_page: continue - _path = deepcopy(path) - _path.append((j, start)) - added = True - dfs(start+1, _path) - if not added and path: - all_pathes.append(path) - - dfs(0, []) - path = max(all_pathes, key=lambda x:len(x)) - for it in toc_arr: - it["indices"] = [] - for j, i in path: - toc_arr[i]["indices"] = [j] - print(json.dumps(toc_arr, ensure_ascii=False, indent=2)) + + img_url = gen_image_from_page(page) + import requests + # === 直接请求 /v1/chat/completions(写死,唯一变量是 message)=== + url = "https://api.siliconflow.cn/v1/chat/completions" + headers = { + "Authorization": "Bearer 
sk-ipkxipfjzorweuhfwudgzhcfevdjqjnneltugjffkgtogypn", # 替换为你的实际 API Key + "Content-Type": "application/json", + } + payload = { + "model": "THUDM/GLM-4.1V-9B-Thinking", + "messages": build_img_toc_messages(img_url), # 唯一变量:message + "stream": True, + "temperature": 0.2, + } - i = 0 - while i < len(toc_arr): - it = toc_arr[i] - if it["indices"]: - i += 1 - continue + response = requests.post(url, json=payload, headers=headers, stream=True) + print("\n\nResponse Status Code:\n", response.status_code) + full_content = "" + full_reasoning_content = "" + for chunk in response.iter_lines(): + if not chunk: + continue + chunk_str = chunk.decode("utf-8") + if chunk_str.startswith("data:"): + chunk_str = chunk_str[len("data:"):].strip() + if chunk_str != "[DONE]": + chunk_data = json_repair.loads(chunk_str) + delta = chunk_data["choices"][0].get("delta", {}) + content = delta.get("content", "") + reasoning_content = delta.get("reasoning_content", "") + if content: + print(content, end="", flush=True) + full_content += content + if reasoning_content: + print(reasoning_content, end="", flush=True) + full_reasoning_content += reasoning_content + + raw = full_content or full_reasoning_content or "" + raw = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", raw, flags=re.DOTALL) + ans = json_repair.loads(raw) + + if ans[0].get("title") == "-1": + return toc, i + else: + toc.extend(ans) + return toc, -1 + +def prune_pages(pages): + total = len(pages) + + if total <= 100: + N = math.ceil(total * 0.25) + else: + N = 25 + + N = max(1, N) + return pages[:N] - if i>0 and toc_arr[i-1]["indices"]: - st_i = toc_arr[i-1]["indices"][-1] - else: - st_i = 0 - e = i + 1 - while e <len(toc_arr) and not toc_arr[e]["indices"]: - e += 1 - if e >= len(toc_arr): - e = len(sections) - else: - e = toc_arr[e]["indices"][0] - - for j in range(st_i, min(e+1, len(sections))): - ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_INDEX).render( - structure=it["structure"], - title=it["title"], - 
text=sections[j]), "Only JSON please.", chat_mdl) - if ans["exist"] == "yes": - it["indices"].append(j) +def get_page_num(section): + if not section: + return 0 + + poss = section[1].split('\t')[0] + head = poss.lstrip('@') # '7-8' + m = re.match(r'^(\d+)(?:-(\d+))?$', head) + if not m: + return 0 + + return int(m.group(1)) + +def match_toc_sections( + sections, + toc, + start_section_idx, + min_coverage +): + def normalize(s: str) -> str: + if not s: + return "" + s = unicodedata.normalize("NFKC", s) + s = s.replace("\u3000", " ") + s = re.sub(r"\s+", " ", s).strip() + return s + + norm_sections = [] + for idx, (text, poss) in enumerate(sections[start_section_idx:], start=start_section_idx): + norm_sec = normalize(text) + if norm_sec: + norm_sections.append((idx, norm_sec)) + + res = [] + for item in toc: + title = item['title'] + norm_title = normalize(title) + best = None + + # 1) exact match + for idx, norm_sec in norm_sections: + if norm_sec == norm_title: + best = (len(norm_sec), idx) break + + # 2) substring match + if best is None and norm_title: + L = len(norm_title) + for idx, norm_sec in norm_sections: + cov = len(norm_sec) / max(L, 1) + if cov >= min_coverage: + cand = (len(norm_sec), idx) + if (best is None) or (cand[0] > best[0]) or (cand[0] == best[0] and cand[1] < best[1]): + best = cand + + res.append((title, best[1] if best else -1)) + return res - i += 1 - - return toc_arr - +def covered_ranges(pairs): + res= [] + n = len(pairs) + i = 0 + while i < n: + if pairs[i][1] == -1: + titles = [] + start_i = i + while i < n and pairs[i][1] == -1: + titles.append(pairs[i][0]) + i += 1 + end_i = i - 1 + + left_val = None + j = start_i - 1 + while j >= 0: + if pairs[j][1] != -1: + left_val = pairs[j][1] + break + j -= 1 + + right_val = None + k = end_i + 1 + while k < n: + if pairs[k][1] != -1: + right_val = pairs[k][1] + break + k += 1 + + start = (left_val + 1) if left_val is not None else 0 + end = right_val if right_val is not None else (len(pairs) - 1) 
+ + if start <= end: + res.append((titles, (start, end))) + else: + i += 1 + return res -def check_if_toc_transformation_is_complete(content, toc, chat_mdl): - prompt = """ - You are given a raw table of contents and a table of contents. - Your job is to check if the table of contents is complete. - Reply format: - {{ - "thinking": <why do you think the cleaned table of contents is complete or not> - "completed": "yes" or "no" - }} - Directly return the final JSON structure. Do not output anything else.""" +def run_toc(filename, + sections, + chat_mdl, + vision_mdl, + min_coverage = 0.5 + ): + + # 1) Get pages + max_page = get_page_num(sections[-1]) + pages = ["" for _ in range(max_page)] + page_begin_idx = [-1 for _ in range(max_page)] + for idx, sec in enumerate(sections): + print(idx, "\t",sec) + page_num = get_page_num(sec) + pages[page_num-1] += sec[0] + "\n" + if page_begin_idx[page_num-1] == -1: + page_begin_idx[page_num-1] = idx + print("Max page number:", max_page) + + # 2) Prune pages to remove unlikely TOC candidates + # pruned_pages = prune_pages(pages) + # empty_pages = [i for i, p in enumerate(pruned_pages) if p == ""] + + # 3) Detect TOC + # toc_start_page = detect_table_of_contents(pruned_pages, chat_mdl) + # toc_start_page = 6 # for test + # print("\n\nDetected TOC start page:\n", toc_start_page) - prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc - response = gen_json(prompt, "Only JSON please.", chat_mdl) - return response['completed'] + # 4) Generate TOC from images + # toc_secs, start_page_idx = gen_toc_from_pdf(filename, empty_pages, toc_start_page, vision_mdl) + # print("\n\nDetected TOC sections:\n", toc_secs) -def toc_transformer(toc_pages, chat_mdl): - init_prompt = """ - You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents. 
+ # 5) Assign hierarchy levels to TOC + # toc_with_levels = assign_toc_levels(toc_secs, chat_mdl) + # print("\n\nDetected TOC with levels:\n", toc_with_levels) - The `structure` is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc. - The `title` is a short phrase or a several-words term. - - The response should be in the following JSON format: - [ - { - "structure": <structure index, "x.x.x" or None> (string), - "title": <title of the section> - }, - ... - ], - You should transform the full table of contents in one go. - Directly return the final JSON structure, do not output anything else. """ - - toc_content = "\n".join(toc_pages) - prompt = init_prompt + '\n Given table of contents\n:' + toc_content - def clean_toc(arr): - for a in arr: - a["title"] = re.sub(r"[.·….]{2,}", "", a["title"]) - last_complete = gen_json(prompt, "Only JSON please.", chat_mdl) - if_complete = check_if_toc_transformation_is_complete(toc_content, json.dumps(last_complete, ensure_ascii=False, indent=2), chat_mdl) - clean_toc(last_complete) - if if_complete == "yes": - return last_complete - - while not (if_complete == "yes"): - prompt = f""" - Your task is to continue the table of contents json structure, directly output the remaining part of the json structure. 
- The response should be in the following JSON format: - - The raw table of contents json structure is: - {toc_content} - - The incomplete transformed table of contents json structure is: - {json.dumps(last_complete[-24:], ensure_ascii=False, indent=2)} - - Please continue the json structure, directly output the remaining part of the json structure.""" - new_complete = gen_json(prompt, "Only JSON please.", chat_mdl) - if not new_complete or str(last_complete).find(str(new_complete)) >= 0: - break - clean_toc(new_complete) - last_complete.extend(new_complete) - if_complete = check_if_toc_transformation_is_complete(toc_content, json.dumps(last_complete, ensure_ascii=False, indent=2), chat_mdl) + # 6) match TOC with sections - return last_complete + # start_section_idx = page_begin_idx[start_page_idx] if start_page_idx >=0 and start_page_idx < len(page_begin_idx) else 0 + start_section_idx = 83 + # print("\n\nStart section index for matching:", start_section_idx) + toc_with_levels = [{'structure': '1', 'title': '封面'}, {'structure': '1', 'title': '扉页'}, {'structure': '1', 'title': '版权信息'}, {'structure': '1', 'title': '自序 开启自我改变的原动力'}, {'structure': '1', 'title': '上篇 内观自己,摆脱焦虑'}, {'structure': '2', 'title': '第一章 大脑——一切问题的起源'}, {'structure': '3', 'title': '第一节 大脑:重新认识你自己'}, {'structure': '3', 'title': '第二节 焦虑:焦虑的根源'}, {'structure': '3', 'title': '第三节 耐心:得耐心者得天下'}, {'structure': '2', 'title': '第二章 潜意识——生命留给我们的彩蛋'}, {'structure': '3', 'title': '第一节 模糊:人生是一场消除模糊的比赛'}, {'structure': '3', 'title': '第二节 感性:顶级的成长竟然是"凭感觉>"'}, {'structure': '2', 'title': '第三章 元认知——人类的终极能力'}, {'structure': '3', 'title': '第一节 元认知:成长慢,是因为你不会"飞"'}, {'structure': '3', 'title': '第二节 自控力:我们生而为人就是为了成为思维舵手'}, {'structure': '1', 'title': '下篇 外观世界,借力前行'}, {'structure': '2', 'title': '第四章 专注力——情绪和智慧的交叉地带'}, {'structure': '3', 'title': '第一节 情绪专注:一招提振你的注意力'}, {'structure': '3', 'title': '第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄'}, {'structure': '2', 'title': '第五章 学习力——学习不是一味地努力'}, {'structure': '3', 'title': '第一节 
匹配:舒适区边缘,适用于万物的方法论'}, {'structure': '3', 'title': '第二节 深度:深度学习,人生为数不多的好出路'}, {'structure': '3', 'title': '第三节 关联:高手的"暗箱>"'}, {'structure': '3', 'title': '第四节 体系:建立个人认知体系其实很简单'}, {'structure': '3', 'title': '第五节 打卡:莫迷恋打卡,打卡打不出未来'}, {'structure': '3', 'title': '第六节 反馈:是时候告诉你什么是真正的学习了'}, {'structure': '3', 'title': '第七节 休息:你没成功,可能是因为太刻苦了'}, {'structure': '2', 'title': '第六章 行动力——没有行动世界只是个概念'}, {'structure': '3', 'title': '第一节 清晰:一个观念,重构你的行动力'}, {'structure': '3', 'title': '第二节 "傻瓜":这个世界会奖励那些不计得失的"傻瓜>"'}, {'structure': '3', 'title': '第三节 行动:"道理都懂,就是不做"怎么破解'}, {'structure': '2', 'title': '第七章 情绪力——情绪是多角度看问题的智慧'}, {'structure': '3', 'title': '第一节 心智带宽:唯有富足,方能解忧'}, {'structure': '3', 'title': '第二节 单一视角:你的坏情绪,源于视角单一'}, {'structure': '3', 'title': '第三节 游戏心态:幸福的人,总是在做另外一件事'}, {'structure': '2', 'title': '第八章 早冥读写跑,人生五件套——成本最低的成长之道'}, {'structure': '3', 'title': '第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的'}, {'structure': '3', 'title': '第二节 冥想:终有一天,你要解锁这条隐藏赛道'}, {'structure': '3', 'title': '第三节 阅读:如何让自己真正爱上阅读'}, {'structure': '3', 'title': '第四节 写作:谢谢你,费曼先生'}, {'structure': '3', 'title': '第五节 运动:灵魂想要走得远,身体必须在路上'}, {'structure': '1', 'title': '结语 一流的生活不是富有,而是觉知'}, {'structure': '1', 'title': '后记 共同改变,一起前行'}, {'structure': '1', 'title': '参考文献'}] + pairs = match_toc_sections(sections, toc_with_levels, start_section_idx, min_coverage) + return pairs \ No newline at end of file From 7539d543b6d7db89e8717f383df171e7481b52e6 Mon Sep 17 00:00:00 2001 From: Magicbook1108 <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 12:26:22 +0800 Subject: [PATCH 07/14] Update prompts.py --- rag/prompts/prompts.py | 139 ++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 63 deletions(-) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index bb1499fc153..4a892ad23c4 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -641,81 +641,94 @@ def match_toc_sections( start_section_idx, min_coverage ): + """ + 从 start_section_idx 开始,依次为 TOC 中的每一条标题匹配 sections 中的最佳位置。 + 
命中后游标前移;未命中返回 -1。 + 仅做:标准化(NFKC、空白压缩、lower)+ 去掉“尾随页码/点线”(…… 12 / .... 7 / 23) + 不做:前导编号(1./1.2/第一章/Chapter 1)去除,不做标点清理,保留多语言特征。 + """ + import re + import unicodedata + from difflib import SequenceMatcher + + # 仅去除“尾随页码/点线” + P_TRAILING_PAGE = re.compile( + r"""( + [\.\·…]{2,}\s*\d+\s*$ # ...... 12 / …… 23 / ···· 7 + | \s+\d+\s*$ # 12 + )""", + re.X + ) + def normalize(s: str) -> str: if not s: return "" - s = unicodedata.normalize("NFKC", s) - s = s.replace("\u3000", " ") + s = unicodedata.normalize("NFKC", s).lower() + s = s.replace("\u3000", " ") # 全角空格 -> 半角空格 s = re.sub(r"\s+", " ", s).strip() return s - + + def strip_trailing_page(s: str) -> str: + """仅移除尾随页码/点线,保留其余语言与标点特征""" + s = normalize(s) + s = P_TRAILING_PAGE.sub("", s).strip() + return s + + # 仅从 start_section_idx 开始做候选 norm_sections = [] - for idx, (text, poss) in enumerate(sections[start_section_idx:], start=start_section_idx): - norm_sec = normalize(text) - if norm_sec: - norm_sections.append((idx, norm_sec)) - + for idx, sec in enumerate(sections): + if idx < max(0, start_section_idx): + continue + text = sec[0] if isinstance(sec, (list, tuple)) else str(sec) + n_full = normalize(text) + n_core = strip_trailing_page(text) # 只去尾随页码/点线 + if n_core: + norm_sections.append((idx, n_full, n_core)) + + def similarity(a: str, b: str) -> float: + if not a or not b: + return 0.0 + return SequenceMatcher(None, a, b).ratio() + res = [] + scan_from = max(0, start_section_idx) + for item in toc: - title = item['title'] - norm_title = normalize(title) - best = None - - # 1) exact match - for idx, norm_sec in norm_sections: - if norm_sec == norm_title: - best = (len(norm_sec), idx) - break - - # 2) substring match - if best is None and norm_title: - L = len(norm_title) - for idx, norm_sec in norm_sections: - cov = len(norm_sec) / max(L, 1) - if cov >= min_coverage: - cand = (len(norm_sec), idx) - if (best is None) or (cand[0] > best[0]) or (cand[0] == best[0] and cand[1] < best[1]): - best = cand - - 
res.append((title, best[1] if best else -1)) - return res + title = item.get("title") if isinstance(item, dict) else str(item) + t_full = normalize(title) + t_core = strip_trailing_page(title) -def covered_ranges(pairs): - res= [] - n = len(pairs) - i = 0 - while i < n: - if pairs[i][1] == -1: - titles = [] - start_i = i - while i < n and pairs[i][1] == -1: - titles.append(pairs[i][0]) - i += 1 - end_i = i - 1 - - left_val = None - j = start_i - 1 - while j >= 0: - if pairs[j][1] != -1: - left_val = pairs[j][1] - break - j -= 1 + best_idx = -1 + best_score = -1.0 - right_val = None - k = end_i + 1 - while k < n: - if pairs[k][1] != -1: - right_val = pairs[k][1] + if t_core: + # 线性向后扫描 + for idx, s_full, s_core in norm_sections: + if idx < scan_from: + continue + + # 1) 强匹配:完全相等(full 或 仅去尾随页码后的 core) + if s_full == t_full or s_core == t_core: + best_idx, best_score = idx, 1.0 break - k += 1 - start = (left_val + 1) if left_val is not None else 0 - end = right_val if right_val is not None else (len(pairs) - 1) + # 2) 双向包含 + 对称覆盖率 + if t_core in s_core or s_core in t_core: + overlap = min(len(t_core), len(s_core)) + cov = overlap / max(len(t_core), len(s_core), 1) + if cov >= float(min_coverage) and cov > best_score: + best_idx, best_score = idx, cov + # 不立即 break,继续看看是否有更高得分 + + # 3) 相似度兜底 + sim = similarity(t_core, s_core) + if sim >= float(min_coverage) and sim > best_score: + best_idx, best_score = idx, sim + + res.append((title, best_idx)) + if best_idx != -1: + scan_from = best_idx + 1 # 命中后推进 - if start <= end: - res.append((titles, (start, end))) - else: - i += 1 return res @@ -759,7 +772,7 @@ def run_toc(filename, # 6) match TOC with sections # start_section_idx = page_begin_idx[start_page_idx] if start_page_idx >=0 and start_page_idx < len(page_begin_idx) else 0 - start_section_idx = 83 + start_section_idx = 83 # for test # print("\n\nStart section index for matching:", start_section_idx) toc_with_levels = [{'structure': '1', 'title': '封面'}, {'structure': 
'1', 'title': '扉页'}, {'structure': '1', 'title': '版权信息'}, {'structure': '1', 'title': '自序 开启自我改变的原动力'}, {'structure': '1', 'title': '上篇 内观自己,摆脱焦虑'}, {'structure': '2', 'title': '第一章 大脑——一切问题的起源'}, {'structure': '3', 'title': '第一节 大脑:重新认识你自己'}, {'structure': '3', 'title': '第二节 焦虑:焦虑的根源'}, {'structure': '3', 'title': '第三节 耐心:得耐心者得天下'}, {'structure': '2', 'title': '第二章 潜意识——生命留给我们的彩蛋'}, {'structure': '3', 'title': '第一节 模糊:人生是一场消除模糊的比赛'}, {'structure': '3', 'title': '第二节 感性:顶级的成长竟然是"凭感觉>"'}, {'structure': '2', 'title': '第三章 元认知——人类的终极能力'}, {'structure': '3', 'title': '第一节 元认知:成长慢,是因为你不会"飞"'}, {'structure': '3', 'title': '第二节 自控力:我们生而为人就是为了成为思维舵手'}, {'structure': '1', 'title': '下篇 外观世界,借力前行'}, {'structure': '2', 'title': '第四章 专注力——情绪和智慧的交叉地带'}, {'structure': '3', 'title': '第一节 情绪专注:一招提振你的注意力'}, {'structure': '3', 'title': '第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄'}, {'structure': '2', 'title': '第五章 学习力——学习不是一味地努力'}, {'structure': '3', 'title': '第一节 匹配:舒适区边缘,适用于万物的方法论'}, {'structure': '3', 'title': '第二节 深度:深度学习,人生为数不多的好出路'}, {'structure': '3', 'title': '第三节 关联:高手的"暗箱>"'}, {'structure': '3', 'title': '第四节 体系:建立个人认知体系其实很简单'}, {'structure': '3', 'title': '第五节 打卡:莫迷恋打卡,打卡打不出未来'}, {'structure': '3', 'title': '第六节 反馈:是时候告诉你什么是真正的学习了'}, {'structure': '3', 'title': '第七节 休息:你没成功,可能是因为太刻苦了'}, {'structure': '2', 'title': '第六章 行动力——没有行动世界只是个概念'}, {'structure': '3', 'title': '第一节 清晰:一个观念,重构你的行动力'}, {'structure': '3', 'title': '第二节 "傻瓜":这个世界会奖励那些不计得失的"傻瓜>"'}, {'structure': '3', 'title': '第三节 行动:"道理都懂,就是不做"怎么破解'}, {'structure': '2', 'title': '第七章 情绪力——情绪是多角度看问题的智慧'}, {'structure': '3', 'title': '第一节 心智带宽:唯有富足,方能解忧'}, {'structure': '3', 'title': '第二节 单一视角:你的坏情绪,源于视角单一'}, {'structure': '3', 'title': '第三节 游戏心态:幸福的人,总是在做另外一件事'}, {'structure': '2', 'title': '第八章 早冥读写跑,人生五件套——成本最低的成长之道'}, {'structure': '3', 'title': '第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的'}, {'structure': '3', 'title': '第二节 冥想:终有一天,你要解锁这条隐藏赛道'}, {'structure': '3', 'title': '第三节 阅读:如何让自己真正爱上阅读'}, {'structure': '3', 'title': '第四节 写作:谢谢你,费曼先生'}, 
{'structure': '3', 'title': '第五节 运动:灵魂想要走得远,身体必须在路上'}, {'structure': '1', 'title': '结语 一流的生活不是富有,而是觉知'}, {'structure': '1', 'title': '后记 共同改变,一起前行'}, {'structure': '1', 'title': '参考文献'}] From 6b664fac476f92bd458b48b00eb305115269d306 Mon Sep 17 00:00:00 2001 From: Magicbook1108 <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 13:42:44 +0800 Subject: [PATCH 08/14] Update prompts.py --- rag/prompts/prompts.py | 161 ++++++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 75 deletions(-) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index 4a892ad23c4..31909837555 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -15,6 +15,7 @@ import re import json import math +import time import jinja2 import base64 import logging @@ -35,7 +36,7 @@ STOP_TOKEN="<|STOP|>" COMPLETE_TASK="complete_task" - +MAX_RETRIES = 3 def get_value(d, k1, k2): return d.get(k1, d.get(k2)) @@ -559,60 +560,65 @@ def build_img_toc_messages(url): def gen_toc_from_pdf(filename, empty_pages, start_page, chat_mdl): + # Collect TOC items gathered across pages toc = [] + + # Open in-memory PDF and iterate over a pruned subset of pages to reduce cost with pdfplumber.open(BytesIO(filename)) as pdf: for i, page in enumerate(prune_pages(pdf.pages)): + # Skip empty candidates and pages before the detected TOC start page if i in empty_pages or i < start_page: continue - - img_url = gen_image_from_page(page) - import requests - # === 直接请求 /v1/chat/completions(写死,唯一变量是 message)=== - url = "https://api.siliconflow.cn/v1/chat/completions" - headers = { - "Authorization": "Bearer sk-ipkxipfjzorweuhfwudgzhcfevdjqjnneltugjffkgtogypn", # 替换为你的实际 API Key - "Content-Type": "application/json", - } - payload = { - "model": "THUDM/GLM-4.1V-9B-Thinking", - "messages": build_img_toc_messages(img_url), # 唯一变量:message - "stream": True, - "temperature": 0.2, - } - - response = requests.post(url, json=payload, headers=headers, stream=True) - print("\n\nResponse Status Code:\n", 
response.status_code) - full_content = "" - full_reasoning_content = "" - for chunk in response.iter_lines(): - if not chunk: - continue - chunk_str = chunk.decode("utf-8") - if chunk_str.startswith("data:"): - chunk_str = chunk_str[len("data:"):].strip() - if chunk_str != "[DONE]": - chunk_data = json_repair.loads(chunk_str) - delta = chunk_data["choices"][0].get("delta", {}) - content = delta.get("content", "") - reasoning_content = delta.get("reasoning_content", "") - if content: - print(content, end="", flush=True) - full_content += content - if reasoning_content: - print(reasoning_content, end="", flush=True) - full_reasoning_content += reasoning_content - - raw = full_content or full_reasoning_content or "" - raw = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", raw, flags=re.DOTALL) - ans = json_repair.loads(raw) + # Render the page to a high-res JPEG data URL for vision model consumption + img_url = gen_image_from_page(page) + # Build vision-LLM messages instructing strict JSON-only TOC extraction + msg = build_img_toc_messages(img_url) + + # Exponential backoff on transient parsing/LLM errors + delay = 1.0 + for attempt in range(MAX_RETRIES): + try: + raw = chat_mdl.chat( + msg[0]["content"], + msg[1:], + {"temperature": 0.2}, + ) + # Strip think tags and code fences before JSON repair/parse + raw = re.sub( + r"(^.*</think>|```json\n|```\n*$)", + "", + raw, + flags=re.DOTALL, + ) + ans = json_repair.loads(raw) + break + except Exception as e: + logging.warning( + f"TOC page {i} attempt {attempt}/{MAX_RETRIES} failed: {e}" + ) + if attempt < MAX_RETRIES: + time.sleep(delay) + delay *= 2 + else: + logging.exception( + f"TOC page {i} failed after retries." 
+ ) + ans = None + + # If the page is not a TOC, stop and return collected TOC with the page index if ans[0].get("title") == "-1": return toc, i else: + # Otherwise accumulate extracted TOC items and continue toc.extend(ans) + + # If no explicit stop page was detected, return TOC and -1 return toc, -1 + def prune_pages(pages): + # Heuristic: only scan the first 25% (up to 25) pages as likely TOC candidates total = len(pages) if total <= 100: @@ -620,10 +626,13 @@ def prune_pages(pages): else: N = 25 + # Always keep at least one page N = max(1, N) return pages[:N] + def get_page_num(section): + # Extract the starting page number from a section meta string like '@7-8' if not section: return 0 @@ -633,8 +642,10 @@ def get_page_num(section): if not m: return 0 + # Return the first page number as the section's page hint return int(m.group(1)) + def match_toc_sections( sections, toc, @@ -642,16 +653,16 @@ def match_toc_sections( min_coverage ): """ - 从 start_section_idx 开始,依次为 TOC 中的每一条标题匹配 sections 中的最佳位置。 - 命中后游标前移;未命中返回 -1。 - 仅做:标准化(NFKC、空白压缩、lower)+ 去掉“尾随页码/点线”(…… 12 / .... 7 / 23) - 不做:前导编号(1./1.2/第一章/Chapter 1)去除,不做标点清理,保留多语言特征。 + Match each TOC title to the best section index starting from start_section_idx. + Cursor moves forward after a match; unmatched items get index -1. + Only strip trailing page dots/numbers (e.g., "...... 12") and normalize (NFKC, spaces, lower). + Leading numbering (e.g., "1.", "第一章", "Chapter 1") is intentionally preserved for matching. """ import re import unicodedata from difflib import SequenceMatcher - # 仅去除“尾随页码/点线” + # Regex to remove only trailing dot leaders and page numbers (e.g., "...... 12", " 23") P_TRAILING_PAGE = re.compile( r"""( [\.\·…]{2,}\s*\d+\s*$ # ...... 
12 / …… 23 / ···· 7 @@ -661,39 +672,43 @@ def match_toc_sections( ) def normalize(s: str) -> str: + # Unicode normalize, lower, collapse whitespace if not s: return "" s = unicodedata.normalize("NFKC", s).lower() - s = s.replace("\u3000", " ") # 全角空格 -> 半角空格 + s = s.replace("\u3000", " ") # full-width space -> half-width space s = re.sub(r"\s+", " ", s).strip() return s def strip_trailing_page(s: str) -> str: - """仅移除尾随页码/点线,保留其余语言与标点特征""" + # Remove only trailing page cues; keep numbering/punctuation for multi-language robustness s = normalize(s) s = P_TRAILING_PAGE.sub("", s).strip() return s - # 仅从 start_section_idx 开始做候选 + # Build normalized section candidates starting at the provided cursor norm_sections = [] for idx, sec in enumerate(sections): if idx < max(0, start_section_idx): continue text = sec[0] if isinstance(sec, (list, tuple)) else str(sec) n_full = normalize(text) - n_core = strip_trailing_page(text) # 只去尾随页码/点线 + n_core = strip_trailing_page(text) # only remove trailing page markers if n_core: norm_sections.append((idx, n_full, n_core)) def similarity(a: str, b: str) -> float: + # Fuzzy ratio as a fallback if not a or not b: return 0.0 return SequenceMatcher(None, a, b).ratio() res = [] + # Matching cursor moves forward to enforce TOC order scan_from = max(0, start_section_idx) for item in toc: + # Support dict {'title': ...} or plain string title = item.get("title") if isinstance(item, dict) else str(item) t_full = normalize(title) t_core = strip_trailing_page(title) @@ -702,32 +717,34 @@ def similarity(a: str, b: str) -> float: best_score = -1.0 if t_core: - # 线性向后扫描 + # Linear forward scan to keep document order for idx, s_full, s_core in norm_sections: if idx < scan_from: continue - # 1) 强匹配:完全相等(full 或 仅去尾随页码后的 core) + # Strong match: exact equality on full or core if s_full == t_full or s_core == t_core: best_idx, best_score = idx, 1.0 break - # 2) 双向包含 + 对称覆盖率 + # Bidirectional containment with coverage threshold (symmetric-ish) if t_core in 
s_core or s_core in t_core: overlap = min(len(t_core), len(s_core)) cov = overlap / max(len(t_core), len(s_core), 1) if cov >= float(min_coverage) and cov > best_score: best_idx, best_score = idx, cov - # 不立即 break,继续看看是否有更高得分 + # Keep scanning; might find a higher score - # 3) 相似度兜底 + # Fuzzy similarity fallback sim = similarity(t_core, s_core) if sim >= float(min_coverage) and sim > best_score: best_idx, best_score = idx, sim + # Record match (or -1 if not found) res.append((title, best_idx)) if best_idx != -1: - scan_from = best_idx + 1 # 命中后推进 + # Advance cursor to maintain TOC order mapping + scan_from = best_idx + 1 return res @@ -752,31 +769,25 @@ def run_toc(filename, print("Max page number:", max_page) # 2) Prune pages to remove unlikely TOC candidates - # pruned_pages = prune_pages(pages) - # empty_pages = [i for i, p in enumerate(pruned_pages) if p == ""] + pruned_pages = prune_pages(pages) + empty_pages = [i for i, p in enumerate(pruned_pages) if p == ""] # 3) Detect TOC - # toc_start_page = detect_table_of_contents(pruned_pages, chat_mdl) - # toc_start_page = 6 # for test - # print("\n\nDetected TOC start page:\n", toc_start_page) - + toc_start_page = detect_table_of_contents(pruned_pages, chat_mdl) + print("\n\nDetected TOC start page:\n", toc_start_page) # 4) Generate TOC from images - # toc_secs, start_page_idx = gen_toc_from_pdf(filename, empty_pages, toc_start_page, vision_mdl) - # print("\n\nDetected TOC sections:\n", toc_secs) + toc_secs, start_page_idx = gen_toc_from_pdf(filename, empty_pages, toc_start_page, vision_mdl) + print("\n\nDetected TOC sections:\n", toc_secs) # 5) Assign hierarchy levels to TOC - # toc_with_levels = assign_toc_levels(toc_secs, chat_mdl) - # print("\n\nDetected TOC with levels:\n", toc_with_levels) + toc_with_levels = assign_toc_levels(toc_secs, chat_mdl) + print("\n\nDetected TOC with levels:\n", toc_with_levels) # 6) match TOC with sections - - # start_section_idx = page_begin_idx[start_page_idx] if start_page_idx 
>=0 and start_page_idx < len(page_begin_idx) else 0 - start_section_idx = 83 # for test - # print("\n\nStart section index for matching:", start_section_idx) - - toc_with_levels = [{'structure': '1', 'title': '封面'}, {'structure': '1', 'title': '扉页'}, {'structure': '1', 'title': '版权信息'}, {'structure': '1', 'title': '自序 开启自我改变的原动力'}, {'structure': '1', 'title': '上篇 内观自己,摆脱焦虑'}, {'structure': '2', 'title': '第一章 大脑——一切问题的起源'}, {'structure': '3', 'title': '第一节 大脑:重新认识你自己'}, {'structure': '3', 'title': '第二节 焦虑:焦虑的根源'}, {'structure': '3', 'title': '第三节 耐心:得耐心者得天下'}, {'structure': '2', 'title': '第二章 潜意识——生命留给我们的彩蛋'}, {'structure': '3', 'title': '第一节 模糊:人生是一场消除模糊的比赛'}, {'structure': '3', 'title': '第二节 感性:顶级的成长竟然是"凭感觉>"'}, {'structure': '2', 'title': '第三章 元认知——人类的终极能力'}, {'structure': '3', 'title': '第一节 元认知:成长慢,是因为你不会"飞"'}, {'structure': '3', 'title': '第二节 自控力:我们生而为人就是为了成为思维舵手'}, {'structure': '1', 'title': '下篇 外观世界,借力前行'}, {'structure': '2', 'title': '第四章 专注力——情绪和智慧的交叉地带'}, {'structure': '3', 'title': '第一节 情绪专注:一招提振你的注意力'}, {'structure': '3', 'title': '第二节 学习专注:深度沉浸是进化双刃剑的安全剑柄'}, {'structure': '2', 'title': '第五章 学习力——学习不是一味地努力'}, {'structure': '3', 'title': '第一节 匹配:舒适区边缘,适用于万物的方法论'}, {'structure': '3', 'title': '第二节 深度:深度学习,人生为数不多的好出路'}, {'structure': '3', 'title': '第三节 关联:高手的"暗箱>"'}, {'structure': '3', 'title': '第四节 体系:建立个人认知体系其实很简单'}, {'structure': '3', 'title': '第五节 打卡:莫迷恋打卡,打卡打不出未来'}, {'structure': '3', 'title': '第六节 反馈:是时候告诉你什么是真正的学习了'}, {'structure': '3', 'title': '第七节 休息:你没成功,可能是因为太刻苦了'}, {'structure': '2', 'title': '第六章 行动力——没有行动世界只是个概念'}, {'structure': '3', 'title': '第一节 清晰:一个观念,重构你的行动力'}, {'structure': '3', 'title': '第二节 "傻瓜":这个世界会奖励那些不计得失的"傻瓜>"'}, {'structure': '3', 'title': '第三节 行动:"道理都懂,就是不做"怎么破解'}, {'structure': '2', 'title': '第七章 情绪力——情绪是多角度看问题的智慧'}, {'structure': '3', 'title': '第一节 心智带宽:唯有富足,方能解忧'}, {'structure': '3', 'title': '第二节 单一视角:你的坏情绪,源于视角单一'}, {'structure': '3', 'title': '第三节 游戏心态:幸福的人,总是在做另外一件事'}, {'structure': '2', 'title': '第八章 
早冥读写跑,人生五件套——成本最低的成长之道'}, {'structure': '3', 'title': '第一节 早起:无闹钟、不参团、不打卡,我是如何坚持早起的'}, {'structure': '3', 'title': '第二节 冥想:终有一天,你要解锁这条隐藏赛道'}, {'structure': '3', 'title': '第三节 阅读:如何让自己真正爱上阅读'}, {'structure': '3', 'title': '第四节 写作:谢谢你,费曼先生'}, {'structure': '3', 'title': '第五节 运动:灵魂想要走得远,身体必须在路上'}, {'structure': '1', 'title': '结语 一流的生活不是富有,而是觉知'}, {'structure': '1', 'title': '后记 共同改变,一起前行'}, {'structure': '1', 'title': '参考文献'}] - + start_section_idx = page_begin_idx[start_page_idx] if start_page_idx >=0 and start_page_idx < len(page_begin_idx) else 0 + print("\n\nStart section index for matching:", start_section_idx) pairs = match_toc_sections(sections, toc_with_levels, start_section_idx, min_coverage) - return pairs \ No newline at end of file + print("\n\nMatched TOC sections with indices:\n", pairs) + return pairs \ No newline at end of file From 63ae797e5757c0351592e9db1cbcaaeb354f8c6b Mon Sep 17 00:00:00 2001 From: Magicbook1108 <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 13:43:51 +0800 Subject: [PATCH 09/14] Update prompts.py --- rag/prompts/prompts.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index 31909837555..47417c885a8 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -560,19 +560,15 @@ def build_img_toc_messages(url): def gen_toc_from_pdf(filename, empty_pages, start_page, chat_mdl): - # Collect TOC items gathered across pages toc = [] - # Open in-memory PDF and iterate over a pruned subset of pages to reduce cost with pdfplumber.open(BytesIO(filename)) as pdf: for i, page in enumerate(prune_pages(pdf.pages)): - # Skip empty candidates and pages before the detected TOC start page if i in empty_pages or i < start_page: continue # Render the page to a high-res JPEG data URL for vision model consumption img_url = gen_image_from_page(page) - # Build vision-LLM messages instructing strict JSON-only TOC extraction msg = 
build_img_toc_messages(img_url) # Exponential backoff on transient parsing/LLM errors @@ -584,7 +580,6 @@ def gen_toc_from_pdf(filename, empty_pages, start_page, chat_mdl): msg[1:], {"temperature": 0.2}, ) - # Strip think tags and code fences before JSON repair/parse raw = re.sub( r"(^.*</think>|```json\n|```\n*$)", "", @@ -610,10 +605,8 @@ def gen_toc_from_pdf(filename, empty_pages, start_page, chat_mdl): if ans[0].get("title") == "-1": return toc, i else: - # Otherwise accumulate extracted TOC items and continue toc.extend(ans) - # If no explicit stop page was detected, return TOC and -1 return toc, -1 @@ -626,7 +619,6 @@ def prune_pages(pages): else: N = 25 - # Always keep at least one page N = max(1, N) return pages[:N] @@ -698,17 +690,14 @@ def strip_trailing_page(s: str) -> str: norm_sections.append((idx, n_full, n_core)) def similarity(a: str, b: str) -> float: - # Fuzzy ratio as a fallback if not a or not b: return 0.0 return SequenceMatcher(None, a, b).ratio() res = [] - # Matching cursor moves forward to enforce TOC order scan_from = max(0, start_section_idx) for item in toc: - # Support dict {'title': ...} or plain string title = item.get("title") if isinstance(item, dict) else str(item) t_full = normalize(title) t_core = strip_trailing_page(title) @@ -740,11 +729,9 @@ def similarity(a: str, b: str) -> float: if sim >= float(min_coverage) and sim > best_score: best_idx, best_score = idx, sim - # Record match (or -1 if not found) res.append((title, best_idx)) if best_idx != -1: - # Advance cursor to maintain TOC order mapping - scan_from = best_idx + 1 + scan_from = best_idx + 1 # 命中后推进 return res @@ -790,4 +777,5 @@ def run_toc(filename, pairs = match_toc_sections(sections, toc_with_levels, start_section_idx, min_coverage) print("\n\nMatched TOC sections with indices:\n", pairs) - return pairs \ No newline at end of file + + return pairs # [(title, section_idx), ...] 
\ No newline at end of file From 050c4752a89195303d511c92577988500b9553b2 Mon Sep 17 00:00:00 2001 From: Billy Bao <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 13:47:45 +0800 Subject: [PATCH 10/14] remove chinese comment --- rag/prompts/prompts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index 47417c885a8..a6eaaf62402 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -731,7 +731,7 @@ def similarity(a: str, b: str) -> float: res.append((title, best_idx)) if best_idx != -1: - scan_from = best_idx + 1 # 命中后推进 + scan_from = best_idx + 1 return res @@ -778,4 +778,4 @@ def run_toc(filename, pairs = match_toc_sections(sections, toc_with_levels, start_section_idx, min_coverage) print("\n\nMatched TOC sections with indices:\n", pairs) - return pairs # [(title, section_idx), ...] \ No newline at end of file + return pairs # [(title, section_idx), ...] From bffce75cdbe16ce04995f7d7de9491c68ef779b3 Mon Sep 17 00:00:00 2001 From: Billy Bao <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 13:48:02 +0800 Subject: [PATCH 11/14] Delete rag/prompts/toc_from_img_system.md --- rag/prompts/toc_from_img_system.md | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 rag/prompts/toc_from_img_system.md diff --git a/rag/prompts/toc_from_img_system.md b/rag/prompts/toc_from_img_system.md deleted file mode 100644 index d7e7f7bb6e4..00000000000 --- a/rag/prompts/toc_from_img_system.md +++ /dev/null @@ -1,18 +0,0 @@ -You are a Table-of-Contents (TOC) extractor. -- STRICT OUTPUT: Return ONLY a valid JSON array. -- Each element must be {"structure": "0", "title": "<heading text>"}. -- If page is NOT a TOC, return [{"structure": "0", "title": "-1"}]. 
- -Examples: - -Example 1 (valid TOC page): -[ - {"structure": "0", "title": "Introduction"}, - {"structure": "0", "title": "Chapter 1: Basics"}, - {"structure": "0", "title": "Chapter 2: Advanced Topics"} -] - -Example 2 (NOT a TOC page): -[ - {"structure": "0", "title": "-1"} -] \ No newline at end of file From 677370ce2950f7f301c445ee8bc260201607d5d7 Mon Sep 17 00:00:00 2001 From: Billy Bao <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 13:48:18 +0800 Subject: [PATCH 12/14] Delete rag/prompts/toc_from_img_user.md --- rag/prompts/toc_from_img_user.md | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 rag/prompts/toc_from_img_user.md diff --git a/rag/prompts/toc_from_img_user.md b/rag/prompts/toc_from_img_user.md deleted file mode 100644 index 51c2e565a47..00000000000 --- a/rag/prompts/toc_from_img_user.md +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "type": "image_url", - "image_url": {"url": {{url}}, "detail": "high"} - }, - { - "type": "text", - "text": "Input: one page image at a time. Extract TOC items from this page." 
- } -] \ No newline at end of file From ddedd78897745502e1002a6256ab609709a7bff5 Mon Sep 17 00:00:00 2001 From: Magicbook1108 <newyorkupperbay@gmail.com> Date: Mon, 29 Sep 2025 13:50:02 +0800 Subject: [PATCH 13/14] remove unused function --- rag/prompts/prompts.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index a6eaaf62402..72a9f4b9531 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -467,15 +467,6 @@ def detect_table_of_contents(pages:list[str], chat_mdl): return i return -1 - -TOC_FROM_IMG_SYSTEM = load_prompt("toc_from_img_system") -TOC_FROM_IMG_USER = load_prompt("toc_from_img_user") -def gen_toc_from_img(img_url, vision_mdl): - ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_FROM_IMG_SYSTEM).render(), - PROMPT_JINJA_ENV.from_string(TOC_FROM_IMG_USER).render(url=img_url), - vision_mdl) - return ans - TOC_LEVELS = load_prompt("assign_toc_levels") def assign_toc_levels(toc_secs, chat_mdl): ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(), From bec20cbc158b5890ee433e8b797e2fa7e84e570b Mon Sep 17 00:00:00 2001 From: Magicbook1108 <newyorkupperbay@gmail.com> Date: Thu, 9 Oct 2025 12:48:09 +0800 Subject: [PATCH 14/14] coding style fix --- rag/prompts/prompts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rag/prompts/prompts.py b/rag/prompts/prompts.py index 72a9f4b9531..1c26c1f8985 100644 --- a/rag/prompts/prompts.py +++ b/rag/prompts/prompts.py @@ -467,6 +467,7 @@ def detect_table_of_contents(pages:list[str], chat_mdl): return i return -1 + TOC_LEVELS = load_prompt("assign_toc_levels") def assign_toc_levels(toc_secs, chat_mdl): ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(), @@ -476,6 +477,7 @@ def assign_toc_levels(toc_secs, chat_mdl): return ans + def gen_image_from_page(page): pil_img = page.to_image(resolution=300, antialias=True).original img_buf = BytesIO()