feat(sync): Implement RAGflow Parsing Status Poller for syncing document statuses with EspoCRM

This commit is contained in:
bsiggel
2026-03-27 10:12:52 +00:00
parent c20baeb21a
commit a2181a25fc

View File

@@ -0,0 +1,125 @@
"""
RAGflow Parsing Status Poller
Fragt alle 60 Sekunden EspoCRM nach CDokumente-Eintraegen ab,
deren RAGflow-Parsing noch nicht abgeschlossen ist (aiParsingStatus not in {complete, failed}).
Fuer jedes gefundene Dokument wird der aktuelle Parsing-Status von RAGflow
abgefragt und bei Aenderung zurueck nach EspoCRM geschrieben.
aiParsingStatus-Werte (EspoCRM):
unknown → RAGflow run=UNSTART (noch nicht gestartet)
parsing → RAGflow run=RUNNING
complete → RAGflow run=DONE
failed → RAGflow run=FAIL oder CANCEL
"""
from motia import FlowContext, cron
config = {
"name": "RAGflow Parsing Status Poller",
"description": "Polls RAGflow parsing status for uploaded documents and syncs back to EspoCRM",
"flows": ["akte-sync"],
"triggers": [cron("0 */1 * * * *")], # jede Minute
}
# RAGflow run → EspoCRM aiParsingStatus
RUN_STATUS_MAP = {
'UNSTART': 'unknown',
'RUNNING': 'parsing',
'DONE': 'complete',
'FAIL': 'failed',
'CANCEL': 'failed',
}
BATCH_SIZE = 200 # max CDokumente pro Poll-Tick
async def handler(input_data: None, ctx: FlowContext) -> None:
    """Poll RAGflow for parsing progress and sync status changes to EspoCRM.

    Runs once per minute (see ``config``). Loads up to ``BATCH_SIZE`` pending
    CDokumente records from EspoCRM, groups them by RAGflow dataset so each
    dataset is listed only once, maps the RAGflow ``run`` state through
    ``RUN_STATUS_MAP``, and writes any changed status back to EspoCRM.

    Args:
        input_data: Unused cron payload (always ``None``).
        ctx: Motia flow context providing the logger and service config.

    Returns:
        None. All outcomes are reported via the logger; errors are caught
        per document / per dataset so one failure never aborts the tick.
    """
    from services.espocrm import EspoCRMAPI
    from services.ragflow_service import RAGFlowService
    from collections import defaultdict

    ctx.logger.info("=" * 60)
    ctx.logger.info("⏰ RAGFLOW PARSING STATUS POLLER")
    espocrm = EspoCRMAPI(ctx)
    ragflow = RAGFlowService(ctx)

    # ── 1. Load CDokumente that have not been parsed successfully yet ─────────
    # NOTE(review): EspoCRM 'notEquals' typically does not match NULL values;
    # records with aiParsingStatus = NULL may be skipped — confirm against the
    # EspoCRM where-clause semantics if such records can exist.
    try:
        result = await espocrm.list_entities(
            'CDokumente',
            where=[
                {'type': 'isNotNull', 'attribute': 'aiFileId'},
                {'type': 'isNotNull', 'attribute': 'aiCollectionId'},
                {'type': 'notEquals', 'attribute': 'aiParsingStatus', 'value': 'complete'},
                {'type': 'notEquals', 'attribute': 'aiParsingStatus', 'value': 'failed'},
            ],
            select='id,aiFileId,aiCollectionId,aiParsingStatus',
            max_size=BATCH_SIZE,
        )
    except Exception as e:
        ctx.logger.error(f"❌ EspoCRM Abfrage fehlgeschlagen: {e}")
        ctx.logger.info("=" * 60)
        return

    docs = result.get('list', [])
    ctx.logger.info(f" Pending-Dokumente: {len(docs)}")
    if not docs:
        ctx.logger.info("✓ Keine ausstehenden Dokumente")
        ctx.logger.info("=" * 60)
        return

    # ── 2. Group by dataset ID (one RAGflow call per dataset) ────────────────
    by_dataset: dict[str, list] = defaultdict(list)
    for doc in docs:
        if doc.get('aiCollectionId'):
            by_dataset[doc['aiCollectionId']].append(doc)

    updated = 0
    failed = 0
    for dataset_id, dataset_docs in by_dataset.items():
        # Fetch the dataset's documents from RAGflow; on failure, count the
        # whole group as failed and move on to the next dataset.
        try:
            ragflow_docs = await ragflow.list_documents(dataset_id)
            ragflow_by_id = {rd['id']: rd for rd in ragflow_docs}
        except Exception as e:
            ctx.logger.error(f" ❌ RAGflow list_documents({dataset_id[:12]}…) fehlgeschlagen: {e}")
            failed += len(dataset_docs)
            continue

        for doc in dataset_docs:
            doc_id = doc['id']
            ai_file_id = doc.get('aiFileId', '')
            current_status = doc.get('aiParsingStatus') or 'unknown'
            ragflow_doc = ragflow_by_id.get(ai_file_id)
            if not ragflow_doc:
                # Document vanished on the RAGflow side (or ID mismatch):
                # leave the EspoCRM record untouched and just warn.
                ctx.logger.warn(
                    f" ⚠️ CDokumente {doc_id}: aiFileId {ai_file_id[:12]}… nicht in RAGflow gefunden"
                )
                continue

            run = (ragflow_doc.get('run') or 'UNSTART').upper()
            new_status = RUN_STATUS_MAP.get(run, 'unknown')
            if new_status == current_status:
                continue  # no change — skip the write

            # Fix: use `or 0` (not a .get default) so an explicit
            # `progress: null` from RAGflow cannot crash the `:.0%` format;
            # mirrors the `or`-guards used for 'run' and 'aiParsingStatus'.
            progress = ragflow_doc.get('progress') or 0
            ctx.logger.info(
                f" 📄 {doc_id}: {current_status} → {new_status} "
                f"(run={run}, progress={progress:.0%})"
            )
            try:
                await espocrm.update_entity('CDokumente', doc_id, {
                    'aiParsingStatus': new_status,
                })
                updated += 1
            except Exception as e:
                ctx.logger.error(f" ❌ Update CDokumente {doc_id} fehlgeschlagen: {e}")
                failed += 1

    ctx.logger.info(f" ✅ Aktualisiert: {updated} ❌ Fehler: {failed}")
    ctx.logger.info("=" * 60)