feat(sync): Implement RAGflow Parsing Status Poller for syncing document statuses with EspoCRM
This commit is contained in:
125
src/steps/crm/akte/ragflow_parsing_status_cron_step.py
Normal file
125
src/steps/crm/akte/ragflow_parsing_status_cron_step.py
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
"""
RAGflow Parsing Status Poller.

Every 60 seconds this step queries EspoCRM for CDokumente records whose
RAGflow parsing has not finished yet (aiParsingStatus not in {complete,
failed}). For each document found, the current parsing status is fetched
from RAGflow and — if it changed — written back to EspoCRM.

aiParsingStatus values (EspoCRM):
    unknown  → RAGflow run=UNSTART (not started yet)
    parsing  → RAGflow run=RUNNING
    complete → RAGflow run=DONE
    failed   → RAGflow run=FAIL or CANCEL
"""

from motia import FlowContext, cron
|
||||||
|
|
||||||
|
# Motia step configuration: a cron-triggered step belonging to the
# "akte-sync" flow.
config = {
    "name": "RAGflow Parsing Status Poller",
    "description": "Polls RAGflow parsing status for uploaded documents and syncs back to EspoCRM",
    "flows": ["akte-sync"],
    # Six-field cron expression; per the original author this fires every
    # minute (first field presumably seconds — TODO confirm motia semantics).
    "triggers": [cron("0 */1 * * * *")],
}
|
||||||
|
|
||||||
|
# Translation table: RAGflow document "run" state → EspoCRM aiParsingStatus.
RUN_STATUS_MAP = dict(
    UNSTART='unknown',   # parsing has not been started yet
    RUNNING='parsing',
    DONE='complete',
    FAIL='failed',
    CANCEL='failed',     # a cancelled run is reported as failed as well
)

# Upper bound on the number of CDokumente records fetched per poll tick.
BATCH_SIZE = 200
|
||||||
|
|
||||||
|
|
||||||
|
async def handler(input_data: None, ctx: FlowContext) -> None:
    """Poll RAGflow for document parsing progress and sync it back to EspoCRM.

    Fetches up to BATCH_SIZE CDokumente records whose ``aiParsingStatus`` is
    neither ``complete`` nor ``failed``, groups them by RAGflow dataset so
    that only one ``list_documents`` call is made per dataset, maps each
    RAGflow ``run`` state through RUN_STATUS_MAP, and writes any changed
    status back to EspoCRM.

    Args:
        input_data: Unused; cron triggers deliver no payload.
        ctx: Motia flow context providing the logger and service wiring.
    """
    # Imported lazily so the step module itself can be loaded (e.g. during
    # step discovery) without the services package being importable.
    from services.espocrm import EspoCRMAPI
    from services.ragflow_service import RAGFlowService
    from collections import defaultdict

    ctx.logger.info("=" * 60)
    ctx.logger.info("⏰ RAGFLOW PARSING STATUS POLLER")

    espocrm = EspoCRMAPI(ctx)
    ragflow = RAGFlowService(ctx)

    # ── 1. Load CDokumente that have not finished parsing yet ────────────────
    try:
        result = await espocrm.list_entities(
            'CDokumente',
            where=[
                # Only documents that were actually uploaded to RAGflow.
                {'type': 'isNotNull', 'attribute': 'aiFileId'},
                {'type': 'isNotNull', 'attribute': 'aiCollectionId'},
                # Terminal states need no further polling.
                {'type': 'notEquals', 'attribute': 'aiParsingStatus', 'value': 'complete'},
                {'type': 'notEquals', 'attribute': 'aiParsingStatus', 'value': 'failed'},
            ],
            select='id,aiFileId,aiCollectionId,aiParsingStatus',
            max_size=BATCH_SIZE,
        )
    except Exception as e:
        ctx.logger.error(f"❌ EspoCRM Abfrage fehlgeschlagen: {e}")
        ctx.logger.info("=" * 60)
        return

    docs = result.get('list', [])
    ctx.logger.info(f" Pending-Dokumente: {len(docs)}")

    if not docs:
        ctx.logger.info("✓ Keine ausstehenden Dokumente")
        ctx.logger.info("=" * 60)
        return

    # ── 2. Group by dataset ID (one RAGflow call per dataset) ────────────────
    by_dataset: dict[str, list] = defaultdict(list)
    for doc in docs:
        if doc.get('aiCollectionId'):
            by_dataset[doc['aiCollectionId']].append(doc)

    updated = 0
    failed = 0

    for dataset_id, dataset_docs in by_dataset.items():
        # Load the dataset's documents from RAGflow and index them by ID.
        try:
            ragflow_docs = await ragflow.list_documents(dataset_id)
            ragflow_by_id = {rd['id']: rd for rd in ragflow_docs}
        except Exception as e:
            ctx.logger.error(f" ❌ RAGflow list_documents({dataset_id[:12]}…) fehlgeschlagen: {e}")
            failed += len(dataset_docs)
            continue

        for doc in dataset_docs:
            doc_id = doc['id']
            ai_file_id = doc.get('aiFileId', '')
            current_status = doc.get('aiParsingStatus') or 'unknown'

            ragflow_doc = ragflow_by_id.get(ai_file_id)
            if not ragflow_doc:
                # Document vanished on the RAGflow side; skip it (deliberately
                # not counted as failed, so it is re-polled next tick).
                ctx.logger.warn(
                    f" ⚠️ CDokumente {doc_id}: aiFileId {ai_file_id[:12]}… nicht in RAGflow gefunden"
                )
                continue

            # 'run' may be missing or None; unmapped values become 'unknown'.
            run = (ragflow_doc.get('run') or 'UNSTART').upper()
            new_status = RUN_STATUS_MAP.get(run, 'unknown')

            if new_status == current_status:
                continue  # no change — nothing to sync

            # FIX: .get('progress', 0) still yields None when the key exists
            # with a None value, which would crash the :.0% format spec.
            # Guard with `or 0` (same pattern as the `run` lookup above).
            progress = ragflow_doc.get('progress') or 0
            ctx.logger.info(
                f" 📄 {doc_id}: {current_status} → {new_status} "
                f"(run={run}, progress={progress:.0%})"
            )
            try:
                await espocrm.update_entity('CDokumente', doc_id, {
                    'aiParsingStatus': new_status,
                })
                updated += 1
            except Exception as e:
                ctx.logger.error(f" ❌ Update CDokumente {doc_id} fehlgeschlagen: {e}")
                failed += 1

    ctx.logger.info(f" ✅ Aktualisiert: {updated} ❌ Fehler: {failed}")
    ctx.logger.info("=" * 60)
|
||||||
Reference in New Issue
Block a user