Skip to content

Commit e38a86b

Browse files
author
virgilwong
committed
feat: Add automatic trigger generation logic for graphrag and raptor (infiniflow#11402)
1 parent 13e212c commit e38a86b

File tree

20 files changed

+382
-11
lines changed

20 files changed

+382
-11
lines changed

api/db/services/document_service.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -626,9 +626,9 @@ def get_doc_count(cls, tenant_id):
626626
def begin2parse(cls, doc_id, keep_progress=False):
627627
info = {
628628
"progress_msg": "Task is queued...",
629-
"process_begin_at": get_format_time(),
630629
}
631630
if not keep_progress:
631+
info["process_begin_at"] = get_format_time()
632632
info["progress"] = random.random() * 1 / 100.
633633
info["run"] = TaskStatus.RUNNING.value
634634
# keep the doc in DONE state when keep_progress=True for GraphRAG, RAPTOR and Mindmap tasks
@@ -719,10 +719,12 @@ def _sync_progress(cls, docs:list[dict]):
719719
freeze_progress = special_task_running and doc_progress >= 1 and not finished
720720
msg = "\n".join(sorted(msg))
721721
info = {
722-
"process_duration": datetime.timestamp(
723-
datetime.now()) -
724-
d["process_begin_at"].timestamp(),
725-
"run": status}
722+
"run": status
723+
}
724+
if not freeze_progress and 0 < doc_progress < 1:
725+
info["process_duration"] = (
726+
datetime.timestamp(datetime.now()) - d["process_begin_at"].timestamp()
727+
)
726728
if prg != 0 and not freeze_progress:
727729
info["progress"] = prg
728730
if msg:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ dependencies = [
146146
"captcha>=0.7.1",
147147
"pip>=25.2",
148148
"pypandoc>=1.16",
149+
"croniter>=2.0.1,<3.0.0",
149150
]
150151

151152
[dependency-groups]

rag/svr/task_executor.py

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
import numpy as np
5252
from peewee import DoesNotExist
5353
from common.constants import LLMType, ParserType, PipelineTaskType
54-
from api.db.services.document_service import DocumentService
54+
from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
5555
from api.db.services.llm_service import LLMBundle
5656
from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
5757
from api.db.services.file2document_service import File2DocumentService
@@ -68,6 +68,7 @@
6868
from common.exceptions import TaskCanceledException
6969
from common import settings
7070
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
71+
from croniter import croniter
7172

7273
BATCH_SIZE = 64
7374

@@ -641,6 +642,8 @@ def dict_update(meta):
641642
logging.info("[Done], chunks({}), token({}), elapsed:{:.2f}".format(len(chunks), embedding_token_consumption, task_time_cost))
642643
PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
643644

645+
trigger_update_after(task_dataset_id, doc_id)
646+
644647

645648
@timeout(3600)
646649
async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_size, callback=None, doc_ids=[]):
@@ -747,6 +750,27 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
747750
return True
748751

749752

753+
def trigger_update_after(kb_id: str, doc_id: str):
    """Queue GraphRAG / RAPTOR tasks for datasets using the "update_after" strategy.

    Loads the knowledge base ``kb_id`` and, for each of ``graphrag`` and
    ``raptor`` whose section in ``parser_config`` has its ``use_<type>`` flag
    set and ``strategy == "update_after"``, queues a task limited to
    ``doc_id`` and stores the returned task id on the knowledge base
    (``graphrag_task_id`` / ``raptor_task_id``).

    This is a best-effort hook: any failure is logged and swallowed so the
    caller is never interrupted by it.

    Args:
        kb_id: ID of the knowledge base (dataset) to inspect.
        doc_id: ID of the document that was just processed.

    Returns:
        None.
    """
    try:
        ok, kb = KnowledgebaseService.get_by_id(kb_id)
        if not ok:
            return
        conf = kb.parser_config or {}
        sample_document = None
        for ty in ("graphrag", "raptor"):
            ty_conf = conf.get(ty) or {}
            if not ty_conf.get(f"use_{ty}") or ty_conf.get("strategy") != "update_after":
                continue
            if sample_document is None:
                # Resolve the sample document only once, and only when at least
                # one task type is actually going to be queued.
                docs, _ = DocumentService.get_by_kb_id(
                    kb_id=kb.id, page_number=0, items_per_page=0, orderby="create_time",
                    desc=False, keywords="", run_status=[], types=[], suffix=[],
                )
                # Fall back to a minimal stand-in when the listing is empty.
                sample_document = docs[0] if docs else {"id": doc_id}
            tid = queue_raptor_o_graphrag_tasks(
                sample_doc_id=sample_document, ty=ty, priority=0,
                fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=[doc_id],
            )
            KnowledgebaseService.update_by_id(kb.id, {f"{ty}_task_id": tid})
    except Exception:
        # Keep the hook non-fatal, but leave a trace instead of failing silently.
        logging.exception("Failed to trigger update_after tasks for kb(%s), doc(%s)", kb_id, doc_id)
773+
750774
@timeout(60*60*3, 1)
751775
async def do_handle_task(task):
752776
task_type = task.get("task_type", "")
@@ -948,6 +972,7 @@ async def do_handle_task(task):
948972
"Chunk doc({}), page({}-{}), chunks({}), token({}), elapsed:{:.2f}".format(task_document_name, task_from_page,
949973
task_to_page, len(chunks),
950974
token_count, task_time_cost))
975+
trigger_update_after(task_dataset_id, task_doc_id)
951976

952977

953978
async def handle_task():
@@ -1062,6 +1087,90 @@ async def task_manager():
10621087
task_limiter.release()
10631088

10641089

1090+
async def _due(cron: str, last_finish: datetime):
1091+
try:
1092+
if not cron:
1093+
return False
1094+
if not croniter.is_valid(cron):
1095+
return False
1096+
slot = datetime.now().replace(second=0, microsecond=0)
1097+
prev_time = croniter(cron, slot).get_prev(datetime)
1098+
if last_finish and last_finish >= prev_time:
1099+
return False
1100+
return True
1101+
except Exception:
1102+
return False
1103+
1104+
1105+
def _doc_finish_ts_ms(doc):
    """Return ``process_begin_at + process_duration`` of ``doc`` in epoch milliseconds.

    Returns ``None`` when ``process_begin_at`` is missing or its
    ``timestamp()`` call fails.
    """
    begin_at = doc.get("process_begin_at")
    duration = doc.get("process_duration") or 0
    if not begin_at:
        return None
    try:
        begin_ms = int(begin_at.timestamp() * 1000)
    except Exception:
        return None
    return begin_ms + int(duration * 1000)


def _schedule_if_needed(kb, changed_docs, ty):
    """Queue a ``ty`` task over ``changed_docs`` unless the previous one is still open."""
    if not changed_docs:
        return
    task_id = kb.graphrag_task_id if ty == "graphrag" else kb.raptor_task_id
    if task_id:
        ok, task = TaskService.get_by_id(task_id)
        # Any progress other than -1 or 1 is treated as "still in flight"
        # (presumably -1 = failed and 1 = finished -- confirm against TaskService).
        if ok and task and task.progress not in [-1, 1]:
            return
    tid = queue_raptor_o_graphrag_tasks(
        sample_doc_id=changed_docs[0], ty=ty, priority=0,
        fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=[d["id"] for d in changed_docs],
    )
    task_field = "graphrag_task_id" if ty == "graphrag" else "raptor_task_id"
    KnowledgebaseService.update_by_id(kb.id, {task_field: tid})


async def _schedule_timed_task(kb, ty):
    """Queue a ``ty`` task for ``kb`` when its "timed" cron schedule is due."""
    ty_conf = (kb.parser_config or {}).get(ty) or {}
    if not (ty_conf.get(f"use_{ty}") and ty_conf.get("strategy") == "timed" and ty_conf.get("cron")):
        return
    finish_dt = kb.graphrag_task_finish_at if ty == "graphrag" else kb.raptor_task_finish_at
    if not await _due(ty_conf.get("cron"), finish_dt):
        return
    documents, _ = DocumentService.get_by_kb_id(
        kb_id=kb.id, page_number=0, items_per_page=0, orderby="create_time",
        desc=False, keywords="", run_status=[], types=[], suffix=[],
    )
    if not documents:
        return
    changed_docs = documents
    if finish_dt:
        # Only documents whose processing ended after the last finished task
        # take part; without a previous finish time every document does.
        finish_ts_ms = int(finish_dt.timestamp() * 1000)
        changed_docs = [
            d for d in documents
            if (ts := _doc_finish_ts_ms(d)) is not None and ts > finish_ts_ms
        ]
    _schedule_if_needed(kb, changed_docs, ty)


async def scheduler():
    """Periodically queue GraphRAG / RAPTOR tasks for knowledge bases on a "timed" strategy.

    Until ``stop_event`` is set, once per minute every knowledge base is
    inspected and, for each of ``graphrag`` and ``raptor`` configured with
    ``strategy == "timed"`` and a cron expression that is due, a task is
    queued over the documents processed since the last finished task.

    Errors are logged and never propagate; a failure on one knowledge base
    does not prevent the remaining ones from being scheduled in the same pass.
    """
    while not stop_event.is_set():
        try:
            for kb_id in KnowledgebaseService.get_all_ids():
                try:
                    ok, kb = KnowledgebaseService.get_by_id(kb_id)
                    if not ok:
                        continue
                    for ty in ("graphrag", "raptor"):
                        await _schedule_timed_task(kb, ty)
                except Exception:
                    logging.exception("Failed to schedule timed tasks for kb(%s)", kb_id)
        except Exception as e:
            logging.exception(e)
        await trio.sleep(60)  # Special tasks take a long time to run, so the start time of scheduled tasks does not need to be very precise
1172+
1173+
10651174
async def main():
10661175
logging.info(r"""
10671176
____ __ _
@@ -1089,6 +1198,7 @@ async def main():
10891198

10901199
async with trio.open_nursery() as nursery:
10911200
nursery.start_soon(report_status)
1201+
nursery.start_soon(scheduler)
10921202
while not stop_event.is_set():
10931203
await task_limiter.acquire()
10941204
nursery.start_soon(task_manager)

web/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@
116116
"uuid": "^9.0.1",
117117
"xlsx": "^0.18.5",
118118
"zod": "^3.23.8",
119+
"cron-validate": "^1.4.5",
119120
"zustand": "^4.5.2"
120121
},
121122
"devDependencies": {

web/src/components/parse-configuration/graph-rag-form-fields.tsx

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ import {
1818
FormLabel,
1919
FormMessage,
2020
} from '../ui/form';
21+
import { ExpandedInput } from '../ui/input';
22+
import { Radio } from '../ui/radio';
2123
import { RAGFlowSelect } from '../ui/select';
2224
import { Switch } from '../ui/switch';
2325

@@ -119,6 +121,10 @@ const GraphRagItems = ({
119121
control: form.control,
120122
name: 'parser_config.graphrag.use_graphrag',
121123
});
124+
const strategy = useWatch({
125+
control: form.control,
126+
name: 'parser_config.graphrag.strategy',
127+
});
122128

123129
const methodOptions = useMemo(() => {
124130
return [MethodValue.Light, MethodValue.General].map((x) => ({
@@ -136,6 +142,60 @@ const GraphRagItems = ({
136142

137143
return (
138144
<FormContainer className={cn({ 'mb-4': marginBottom }, className)}>
145+
<FormField
146+
control={form.control}
147+
name={'parser_config.graphrag.strategy'}
148+
render={({ field }) => {
149+
return (
150+
<FormItem className=" items-center space-y-0 ">
151+
<div className="flex items-start">
152+
<FormLabel className="text-sm whitespace-nowrap w-1/4">
153+
{t('graphRagStrategy')}
154+
</FormLabel>
155+
<div className="w-3/4">
156+
<FormControl>
157+
<Radio.Group {...field}>
158+
<div className={'flex gap-4 w-full text-text-secondary '}>
159+
<Radio value="manual">{t('strategyManual')}</Radio>
160+
<Radio value="update_after">{t('strategyUpdateAfter')}</Radio>
161+
<Radio value="timed">{t('strategyTimed')}</Radio>
162+
</div>
163+
</Radio.Group>
164+
</FormControl>
165+
</div>
166+
</div>
167+
<div className="flex pt-1">
168+
<div className="w-1/4"></div>
169+
<FormMessage />
170+
</div>
171+
</FormItem>
172+
);
173+
}}
174+
/>
175+
{strategy === 'timed' && (
176+
<FormField
177+
control={form.control}
178+
name={'parser_config.graphrag.cron'}
179+
render={({ field }) => (
180+
<FormItem className=" items-center space-y-0 ">
181+
<div className="flex items-center">
182+
<FormLabel className="text-sm whitespace-nowrap w-1/4">
183+
{t('cronExpression')}
184+
</FormLabel>
185+
<div className="w-3/4">
186+
<FormControl>
187+
<ExpandedInput {...field} className="w-full" placeholder={t('cronPlaceholder')} />
188+
</FormControl>
189+
</div>
190+
</div>
191+
<div className="flex pt-1">
192+
<div className="w-1/4"></div>
193+
<FormMessage />
194+
</div>
195+
</FormItem>
196+
)}
197+
/>
198+
)}
139199
<UseGraphRagFormField
140200
data={data}
141201
onDelete={onDelete}

web/src/components/parse-configuration/raptor-form-fields.tsx

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,71 @@ const RaptorFormFields = ({
6767
const form = useFormContext();
6868
const { t } = useTranslate('knowledgeConfiguration');
6969
const useRaptor = useWatch({ name: UseRaptorField });
70+
const strategy = useWatch({
71+
control: form.control,
72+
name: 'parser_config.raptor.strategy',
73+
});
7074

7175
const handleGenerate = useCallback(() => {
7276
form.setValue(RandomSeedField, random(10000));
7377
}, [form]);
7478

7579
return (
7680
<>
81+
<FormField
82+
control={form.control}
83+
name={'parser_config.raptor.strategy'}
84+
render={({ field }) => {
85+
return (
86+
<FormItem className=" items-center space-y-0 ">
87+
<div className="flex items-start">
88+
<FormLabel className="text-sm whitespace-nowrap w-1/4">
89+
{t('raptorStrategy')}
90+
</FormLabel>
91+
<div className="w-3/4">
92+
<FormControl>
93+
<Radio.Group {...field}>
94+
<div className={'flex gap-4 w-full text-text-secondary '}>
95+
<Radio value="manual">{t('strategyManual')}</Radio>
96+
<Radio value="update_after">{t('strategyUpdateAfter')}</Radio>
97+
<Radio value="timed">{t('strategyTimed')}</Radio>
98+
</div>
99+
</Radio.Group>
100+
</FormControl>
101+
</div>
102+
</div>
103+
<div className="flex pt-1">
104+
<div className="w-1/4"></div>
105+
<FormMessage />
106+
</div>
107+
</FormItem>
108+
);
109+
}}
110+
/>
111+
{strategy === 'timed' && (
112+
<FormField
113+
control={form.control}
114+
name={'parser_config.raptor.cron'}
115+
render={({ field }) => (
116+
<FormItem className=" items-center space-y-0 ">
117+
<div className="flex items-center">
118+
<FormLabel className="text-sm whitespace-nowrap w-1/4">
119+
{t('cronExpression')}
120+
</FormLabel>
121+
<div className="w-3/4">
122+
<FormControl>
123+
<ExpandedInput {...field} className="w-full" placeholder={t('cronPlaceholder')} />
124+
</FormControl>
125+
</div>
126+
</div>
127+
<div className="flex pt-1">
128+
<div className="w-1/4"></div>
129+
<FormMessage />
130+
</div>
131+
</FormItem>
132+
)}
133+
/>
134+
)}
77135
<FormField
78136
control={form.control}
79137
name={UseRaptorField}

web/src/interfaces/database/knowledge.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ export interface IKnowledgeResult {
5252

5353
export interface Raptor {
5454
use_raptor: boolean;
55+
strategy?: string;
56+
cron?: string;
5557
}
5658

5759
export interface ParserConfig {
@@ -66,7 +68,7 @@ export interface ParserConfig {
6668
raptor?: Raptor;
6769
tag_kb_ids?: string[];
6870
topn_tags?: number;
69-
graphrag?: { use_graphrag?: boolean };
71+
graphrag?: { use_graphrag?: boolean; entity_types?: string[]; method?: string; resolution?: boolean; community?: boolean; strategy?: string; cron?: string };
7072
}
7173

7274
export interface IKnowledgeFileParserConfig {

web/src/locales/de.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,15 @@ export default {
208208
plainText: 'Einfach',
209209
},
210210
knowledgeConfiguration: {
211+
useGraphRag: 'Wissensgraph‑Generierung',
212+
useRaptor: 'RAPTOR zur Verbesserung des Abrufs verwenden',
213+
raptorStrategy: 'RAPTOR‑Generierungsstrategie',
214+
graphRagStrategy: 'GraphRAG‑Generierungsstrategie',
215+
strategyManual: 'Manuell',
216+
strategyUpdateAfter: 'Nach Aktualisierung',
217+
strategyTimed: 'Zeitgesteuert',
218+
cronExpression: 'Cron‑Ausdruck',
219+
cronPlaceholder: 'Bitte Cron‑Ausdruck eingeben',
211220
titleDescription:
212221
'Aktualisieren Sie hier Ihre Wissensdatenbank-Konfiguration, insbesondere die Chunk-Methode.',
213222
name: 'Name der Wissensdatenbank',
@@ -334,6 +343,7 @@ export default {
334343
<i>Textzeilen, die nicht den obigen Regeln entsprechen, werden ignoriert.</i>
335344
`,
336345
useRaptor: 'RAPTOR zur Verbesserung des Abrufs verwenden',
346+
337347
useRaptorTip:
338348
'RAPTOR für Multi-Hop-Frage-Antwort-Aufgaben aktivieren. Details unter https://ragflow.io/docs/dev/enable_raptor.',
339349
prompt: 'Prompt',

0 commit comments

Comments
 (0)