feat(cron): Add RAGflow Graph Build Cron for periodic status updates and new builds
This commit is contained in:
@@ -426,6 +426,66 @@ class RAGFlowService:
|
|||||||
self._log(f"📄 Document found: {result.get('name')} (run={result.get('run')})")
|
self._log(f"📄 Document found: {result.get('name')} (run={result.get('run')})")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
async def trace_graphrag(self, dataset_id: str) -> Optional[Dict]:
    """
    Fetch the current status of a dataset's knowledge-graph build.

    Calls GET /api/v1/datasets/{dataset_id}/trace_graphrag.

    Returns:
        Dict with 'task_id', 'progress' (0.0-1.0), 'progress_msg',
        'begin_at' and 'update_date' — or None when no graph build
        has been started yet.

    Raises:
        RuntimeError: if the server answers with a non-success status.
    """
    import aiohttp

    endpoint = f"{self.base_url.rstrip('/')}/api/v1/datasets/{dataset_id}/trace_graphrag"
    auth_headers = {'Authorization': f'Bearer {self.api_key}'}

    async with aiohttp.ClientSession() as http:
        async with http.get(endpoint, headers=auth_headers) as resp:
            if resp.status not in (200, 201):
                body = await resp.text()
                raise RuntimeError(
                    f"trace_graphrag HTTP {resp.status} fuer dataset {dataset_id}: {body}"
                )
            # Read the JSON payload while the response is still open.
            payload = await resp.json()

    task = payload.get('data')
    if not task:
        return None

    return {
        'task_id': task.get('id', ''),
        'progress': float(task.get('progress', 0.0)),
        'progress_msg': task.get('progress_msg', ''),
        'begin_at': task.get('begin_at'),
        'update_date': task.get('update_date'),
    }
|
||||||
|
|
||||||
|
async def run_graphrag(self, dataset_id: str) -> str:
    """
    Start (or refresh) the knowledge graph of a dataset via
    POST /api/v1/datasets/{id}/run_graphrag.

    Returns:
        The graphrag task id (str) — empty when the server does not
        return one.

    Raises:
        RuntimeError: if the server answers with a non-success status.
    """
    import aiohttp

    endpoint = f"{self.base_url.rstrip('/')}/api/v1/datasets/{dataset_id}/run_graphrag"
    request_headers = {
        'Authorization': f'Bearer {self.api_key}',
        'Content-Type': 'application/json',
    }

    async with aiohttp.ClientSession() as http:
        async with http.post(endpoint, headers=request_headers, json={}) as resp:
            if resp.status not in (200, 201):
                body = await resp.text()
                raise RuntimeError(
                    f"run_graphrag HTTP {resp.status} fuer dataset {dataset_id}: {body}"
                )
            payload = await resp.json()

    task_id = (payload.get('data') or {}).get('graphrag_task_id', '')
    # Log message keeps the task id only when the server provided one.
    suffix = f" task_id={task_id}" if task_id else ""
    self._log(f"🔗 run_graphrag angestossen fuer {dataset_id[:16]}…" + suffix)
    return task_id
|
||||||
|
|
||||||
async def wait_for_parsing(
|
async def wait_for_parsing(
|
||||||
self,
|
self,
|
||||||
dataset_id: str,
|
dataset_id: str,
|
||||||
|
|||||||
176
src/steps/crm/akte/ragflow_graph_build_cron_step.py
Normal file
176
src/steps/crm/akte/ragflow_graph_build_cron_step.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
"""
|
||||||
|
RAGflow Graph Build Cron
|
||||||
|
|
||||||
|
Laeuft alle 5 Minuten und erledigt zwei Aufgaben:
|
||||||
|
|
||||||
|
Phase A – Status-Update laufender Graphs:
|
||||||
|
Holt alle CAkten mit graphParsingStatus='parsing', fragt per trace_graphrag
|
||||||
|
den aktuellen Fortschritt ab und setzt den Status in EspoCRM auf 'complete'
|
||||||
|
sobald progress == 1.0.
|
||||||
|
|
||||||
|
Phase B – Neue Graph-Builds anstossen:
|
||||||
|
Holt alle CAkten mit:
|
||||||
|
- aiParsingStatus in ['complete', 'complete_with_failures']
|
||||||
|
- graphParsingStatus in ['unclean', 'no_graph']
|
||||||
|
- aiCollectionId isNotNull
|
||||||
|
Stellt sicher, dass kein Graph-Build laeuft (trace_graphrag), und
|
||||||
|
stoesst per run_graphrag einen neuen Build an.
|
||||||
|
Setzt graphParsingStatus → 'parsing'.
|
||||||
|
|
||||||
|
graphParsingStatus-Werte (EspoCRM):
|
||||||
|
no_graph → noch kein Graph gebaut
|
||||||
|
parsing → Graph-Build laeuft
|
||||||
|
complete → Graph fertig (progress == 1.0)
|
||||||
|
unclean → Graph veraltet (neue Dokumente hochgeladen)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from motia import FlowContext, cron
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"name": "RAGflow Graph Build Cron",
|
||||||
|
"description": "Polls and triggers Knowledge Graph builds in RAGflow for CAkten",
|
||||||
|
"flows": ["akte-sync"],
|
||||||
|
"triggers": [cron("0 */5 * * * *")], # alle 5 Minuten
|
||||||
|
}
|
||||||
|
|
||||||
|
BATCH_SIZE = 50
|
||||||
|
|
||||||
|
|
||||||
|
async def handler(input_data: None, ctx: FlowContext) -> None:
    """
    Cron entry point for the RAGflow graph build step.

    Runs two independent phases:
      Phase A — poll running graph builds and finalize their CRM status.
      Phase B — trigger graph builds for parsed CAkten without a fresh graph.

    Args:
        input_data: Unused — cron triggers pass no payload.
        ctx: Motia flow context providing the logger and service wiring.
    """
    # Imported lazily inside the handler (matches the step's existing style).
    from services.espocrm import EspoCRMAPI
    from services.ragflow_service import RAGFlowService

    ctx.logger.info("=" * 60)
    ctx.logger.info("⏰ RAGFLOW GRAPH BUILD CRON")

    espocrm = EspoCRMAPI(ctx)
    ragflow = RAGFlowService(ctx)

    await _update_running_builds(ctx, espocrm, ragflow)
    await _trigger_new_builds(ctx, espocrm, ragflow)

    ctx.logger.info("=" * 60)


async def _update_running_builds(ctx, espocrm, ragflow) -> None:
    """Phase A: poll CAkten with graphParsingStatus='parsing' and update them."""
    ctx.logger.info("── Phase A: Laufende Builds pruefen ──")
    try:
        parsing_result = await espocrm.list_entities(
            'CAkten',
            where=[
                {'type': 'isNotNull', 'attribute': 'aiCollectionId'},
                {'type': 'equals', 'attribute': 'graphParsingStatus', 'value': 'parsing'},
            ],
            select='id,aiCollectionId,graphParsingStatus',
            max_size=BATCH_SIZE,
        )
    except Exception as e:
        # Best-effort: a failing query must not abort the whole cron run.
        ctx.logger.error(f"❌ EspoCRM Phase-A-Abfrage fehlgeschlagen: {e}")
        parsing_result = {'list': []}

    polling_done = 0
    polling_error = 0
    for akte in parsing_result.get('list', []):
        akte_id = akte['id']
        dataset_id = akte['aiCollectionId']
        try:
            task = await ragflow.trace_graphrag(dataset_id)
            if task is None:
                # No task exists on the RAGflow side anymore — mark the Akte
                # as outdated so Phase B re-triggers the build on a later run.
                ctx.logger.warn(
                    f" ⚠️ Akte {akte_id}: kein Graph-Task gefunden → unclean"
                )
                await espocrm.update_entity('CAkten', akte_id, {'graphParsingStatus': 'unclean'})
                polling_done += 1
            elif task['progress'] >= 1.0:
                ctx.logger.info(
                    f" ✅ Akte {akte_id}: Graph fertig (progress=100%) → complete"
                )
                await espocrm.update_entity('CAkten', akte_id, {'graphParsingStatus': 'complete'})
                polling_done += 1
            else:
                # Build still in flight — leave the status untouched.
                ctx.logger.info(
                    f" ⏳ Akte {akte_id}: Graph laeuft noch "
                    f"(progress={task['progress']:.0%})"
                )
        except Exception as e:
            ctx.logger.error(f" ❌ Fehler bei Akte {akte_id}: {e}")
            polling_error += 1

    ctx.logger.info(
        f" Phase A: {len(parsing_result.get('list', []))} laufend"
        f" → {polling_done} aktualisiert {polling_error} Fehler"
    )


async def _trigger_new_builds(ctx, espocrm, ragflow) -> None:
    """Phase B: trigger graph builds for parsed CAkten that need a (new) graph."""
    ctx.logger.info("── Phase B: Neue Builds anstossen ──")
    try:
        pending_result = await espocrm.list_entities(
            'CAkten',
            where=[
                {'type': 'isNotNull', 'attribute': 'aiCollectionId'},
                {'type': 'in', 'attribute': 'aiParsingStatus',
                 'value': ['complete', 'complete_with_failures']},
                {'type': 'in', 'attribute': 'graphParsingStatus',
                 'value': ['unclean', 'no_graph']},
            ],
            select='id,aiCollectionId,aiParsingStatus,graphParsingStatus',
            max_size=BATCH_SIZE,
        )
    except Exception as e:
        # Best-effort: a failing query must not abort the whole cron run.
        ctx.logger.error(f"❌ EspoCRM Phase-B-Abfrage fehlgeschlagen: {e}")
        pending_result = {'list': []}

    triggered = 0
    skipped = 0
    trig_error = 0

    for akte in pending_result.get('list', []):
        akte_id = akte['id']
        dataset_id = akte['aiCollectionId']
        ai_status = akte.get('aiParsingStatus', '—')
        graph_status = akte.get('graphParsingStatus', '—')

        # Make sure no build is already running for this dataset.
        try:
            task = await ragflow.trace_graphrag(dataset_id)
        except Exception as e:
            ctx.logger.error(
                f" ❌ trace_graphrag Akte {akte_id} fehlgeschlagen: {e}"
            )
            trig_error += 1
            continue

        if task is not None and task['progress'] < 1.0:
            # A build is already in flight — repair the CRM status instead of
            # starting a duplicate build.
            ctx.logger.info(
                f" ⏭️ Akte {akte_id}: Build laeuft noch "
                f"(progress={task['progress']:.0%}) → setze parsing"
            )
            try:
                await espocrm.update_entity('CAkten', akte_id, {'graphParsingStatus': 'parsing'})
            except Exception as e:
                ctx.logger.error(f" ❌ Status-Update fehlgeschlagen: {e}")
            skipped += 1
            continue

        # Trigger the build.
        ctx.logger.info(
            f" 🔧 Akte {akte_id} "
            f"ai={ai_status} graph={graph_status} "
            f"dataset={dataset_id[:16]}…"
        )
        try:
            task_id = await ragflow.run_graphrag(dataset_id)
            ctx.logger.info(
                f" ✅ Graph-Build angestossen"
                + (f" task_id={task_id}" if task_id else "")
            )
            await espocrm.update_entity('CAkten', akte_id, {'graphParsingStatus': 'parsing'})
            triggered += 1
        except Exception as e:
            ctx.logger.error(f" ❌ Fehler: {e}")
            trig_error += 1

    ctx.logger.info(
        f" Phase B: {len(pending_result.get('list', []))} ausstehend"
        f" → {triggered} angestossen {skipped} uebersprungen {trig_error} Fehler"
    )
|
||||||
Reference in New Issue
Block a user