- Added `aiknowledge_sync_utils.py` for provider-agnostic synchronization logic for CAIKnowledge entities, supporting both xAI and RAGFlow. - Introduced lifecycle management for CAIKnowledge entities including states: new, active, paused, and deactivated. - Implemented change detection using Blake3 hash for efficient document synchronization. - Created `ragflow_service.py` to handle dataset and document management with RAGFlow API. - Added daily cron job in `aiknowledge_daily_cron_step.py` to synchronize active CAIKnowledge entities with unclean or failed statuses. - Developed `aiknowledge_sync_event_step.py` to process synchronization events from webhooks and cron jobs.
512 lines
19 KiB
Python
512 lines
19 KiB
Python
"""RAGFlow Dataset & Document Service"""
|
||
import os
|
||
import asyncio
|
||
from functools import partial
|
||
from typing import Optional, List, Dict, Any
|
||
from services.logging_utils import get_service_logger
|
||
|
||
RAGFLOW_DEFAULT_BASE_URL = "http://192.168.1.64:9380"
|
||
|
||
# Defaults fuer Dokument-Analyse
|
||
RAGFLOW_AUTO_KEYWORDS = 14
|
||
RAGFLOW_AUTO_QUESTIONS = 7
|
||
|
||
|
||
def _base_to_dict(obj: Any) -> Any:
|
||
"""
|
||
Konvertiert ragflow_sdk.modules.base.Base rekursiv zu einem plain dict.
|
||
Filtert den internen 'rag'-Client-Key heraus.
|
||
"""
|
||
try:
|
||
from ragflow_sdk.modules.base import Base
|
||
if isinstance(obj, Base):
|
||
return {k: _base_to_dict(v) for k, v in vars(obj).items() if k != 'rag'}
|
||
except ImportError:
|
||
pass
|
||
if isinstance(obj, dict):
|
||
return {k: _base_to_dict(v) for k, v in obj.items()}
|
||
if isinstance(obj, list):
|
||
return [_base_to_dict(i) for i in obj]
|
||
return obj
|
||
|
||
|
||
class RAGFlowService:
|
||
"""
|
||
Client fuer RAGFlow API via ragflow-sdk (Python SDK).
|
||
|
||
Wrapt das synchrone SDK in asyncio.run_in_executor, sodass
|
||
es nahtlos in Motia-Steps (async) verwendet werden kann.
|
||
|
||
Dataflow beim Upload:
|
||
upload_document() →
|
||
1. upload_documents([{blob}]) # Datei hochladen
|
||
2. doc.update({meta_fields}) # blake3 + advoware-Felder setzen
|
||
3. async_parse_documents([id]) # Parsing starten (chunk_method=laws)
|
||
|
||
Benoetigte Umgebungsvariablen:
|
||
- RAGFLOW_API_KEY – API Key
|
||
- RAGFLOW_BASE_URL – Optional, URL Override (Default: http://192.168.1.64:9380)
|
||
"""
|
||
|
||
SUPPORTED_MIME_TYPES = {
|
||
'application/pdf',
|
||
'application/msword',
|
||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||
'application/vnd.ms-excel',
|
||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||
'application/vnd.oasis.opendocument.text',
|
||
'application/epub+zip',
|
||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||
'text/plain',
|
||
'text/html',
|
||
'text/markdown',
|
||
'text/csv',
|
||
'text/xml',
|
||
'application/json',
|
||
'application/xml',
|
||
}
|
||
|
||
def __init__(self, ctx=None):
|
||
self.api_key = os.getenv('RAGFLOW_API_KEY', '')
|
||
base_url_env = os.getenv('RAGFLOW_BASE_URL', '')
|
||
self.base_url = base_url_env or RAGFLOW_DEFAULT_BASE_URL
|
||
self.ctx = ctx
|
||
self.logger = get_service_logger('ragflow', ctx)
|
||
self._rag = None
|
||
|
||
if not self.api_key:
|
||
raise ValueError("RAGFLOW_API_KEY not configured in environment")
|
||
|
||
def _log(self, msg: str, level: str = 'info') -> None:
|
||
log_func = getattr(self.logger, level, self.logger.info)
|
||
log_func(msg)
|
||
|
||
def _get_client(self):
|
||
"""Gibt RAGFlow SDK Client zurueck (lazy init, sync)."""
|
||
if self._rag is None:
|
||
from ragflow_sdk import RAGFlow
|
||
self._rag = RAGFlow(api_key=self.api_key, base_url=self.base_url)
|
||
return self._rag
|
||
|
||
async def _run(self, func, *args, **kwargs):
|
||
"""Fuehrt synchrone SDK-Funktion in ThreadPoolExecutor aus."""
|
||
loop = asyncio.get_event_loop()
|
||
return await loop.run_in_executor(None, partial(func, *args, **kwargs))
|
||
|
||
# ========== Dataset Management ==========
|
||
|
||
async def create_dataset(
|
||
self,
|
||
name: str,
|
||
chunk_method: str = 'laws',
|
||
embedding_model: Optional[str] = None,
|
||
description: Optional[str] = None,
|
||
auto_keywords: int = RAGFLOW_AUTO_KEYWORDS,
|
||
auto_questions: int = RAGFLOW_AUTO_QUESTIONS,
|
||
) -> Dict:
|
||
"""
|
||
Erstellt ein neues RAGFlow Dataset (entspricht xAI Collection).
|
||
|
||
Verwendet standardmaessig chunk_method='laws' (optimiert fuer Rechtsdokumente).
|
||
Setzt nach der Erstellung auto_keywords=14 und auto_questions=7.
|
||
|
||
Returns:
|
||
dict mit 'id', 'name', 'chunk_method', 'parser_config', etc.
|
||
"""
|
||
self._log(
|
||
f"📚 Creating dataset: {name} "
|
||
f"(chunk_method={chunk_method}, keywords={auto_keywords}, questions={auto_questions})"
|
||
)
|
||
|
||
def _create():
|
||
rag = self._get_client()
|
||
kwargs = dict(name=name, chunk_method=chunk_method)
|
||
if embedding_model:
|
||
kwargs['embedding_model'] = embedding_model
|
||
if description:
|
||
kwargs['description'] = description
|
||
dataset = rag.create_dataset(**kwargs)
|
||
|
||
# parser_config kann erst nach create via update() gesetzt werden
|
||
dataset.update({
|
||
'parser_config': {
|
||
'auto_keywords': auto_keywords,
|
||
'auto_questions': auto_questions,
|
||
}
|
||
})
|
||
return self._dataset_to_dict(dataset)
|
||
|
||
result = await self._run(_create)
|
||
self._log(f"✅ Dataset created: {result.get('id')} ({name})")
|
||
return result
|
||
|
||
async def get_dataset_by_name(self, name: str) -> Optional[Dict]:
|
||
"""
|
||
Sucht Dataset nach Name. Gibt None zurueck wenn nicht gefunden.
|
||
"""
|
||
def _find():
|
||
rag = self._get_client()
|
||
# list_datasets(name=...) hat Permission-Bugs – lokal filtern
|
||
all_datasets = rag.list_datasets(page_size=100)
|
||
for ds in all_datasets:
|
||
if getattr(ds, 'name', None) == name:
|
||
return self._dataset_to_dict(ds)
|
||
return None
|
||
|
||
result = await self._run(_find)
|
||
if result:
|
||
self._log(f"🔍 Dataset found: {result.get('id')} ({name})")
|
||
return result
|
||
|
||
async def ensure_dataset(
|
||
self,
|
||
name: str,
|
||
chunk_method: str = 'laws',
|
||
embedding_model: Optional[str] = None,
|
||
description: Optional[str] = None,
|
||
auto_keywords: int = RAGFLOW_AUTO_KEYWORDS,
|
||
auto_questions: int = RAGFLOW_AUTO_QUESTIONS,
|
||
) -> Dict:
|
||
"""
|
||
Gibt bestehendes Dataset zurueck oder erstellt ein neues (get-or-create).
|
||
Entspricht xAI create_collection mit idempotency.
|
||
|
||
Returns:
|
||
dict mit 'id', 'name', etc.
|
||
"""
|
||
existing = await self.get_dataset_by_name(name)
|
||
if existing:
|
||
self._log(f"✅ Dataset exists: {existing.get('id')} ({name})")
|
||
return existing
|
||
return await self.create_dataset(
|
||
name=name,
|
||
chunk_method=chunk_method,
|
||
embedding_model=embedding_model,
|
||
description=description,
|
||
auto_keywords=auto_keywords,
|
||
auto_questions=auto_questions,
|
||
)
|
||
|
||
async def delete_dataset(self, dataset_id: str) -> None:
|
||
"""
|
||
Loescht ein Dataset inklusive aller Dokumente.
|
||
Entspricht xAI delete_collection.
|
||
"""
|
||
self._log(f"🗑️ Deleting dataset: {dataset_id}")
|
||
|
||
def _delete():
|
||
rag = self._get_client()
|
||
rag.delete_datasets(ids=[dataset_id])
|
||
|
||
await self._run(_delete)
|
||
self._log(f"✅ Dataset deleted: {dataset_id}")
|
||
|
||
async def list_datasets(self) -> List[Dict]:
|
||
"""Listet alle Datasets auf."""
|
||
def _list():
|
||
rag = self._get_client()
|
||
return [self._dataset_to_dict(d) for d in rag.list_datasets()]
|
||
|
||
result = await self._run(_list)
|
||
self._log(f"📋 Listed {len(result)} datasets")
|
||
return result
|
||
|
||
# ========== Document Management ==========
|
||
|
||
async def upload_document(
|
||
self,
|
||
dataset_id: str,
|
||
file_content: bytes,
|
||
filename: str,
|
||
mime_type: str = 'application/octet-stream',
|
||
blake3_hash: Optional[str] = None,
|
||
espocrm_id: Optional[str] = None,
|
||
description: Optional[str] = None,
|
||
advoware_art: Optional[str] = None,
|
||
advoware_bemerkung: Optional[str] = None,
|
||
) -> Dict:
|
||
"""
|
||
Laedt ein Dokument in ein Dataset hoch.
|
||
|
||
Ablauf (3 Schritte):
|
||
1. upload_documents() – Datei hochladen
|
||
2. doc.update(meta_fields) – Metadaten setzen inkl. blake3_hash
|
||
3. async_parse_documents() – Parsing mit chunk_method=laws starten
|
||
|
||
Meta-Felder die gesetzt werden:
|
||
- blake3_hash (fuer Change Detection, entspricht xAI BLAKE3)
|
||
- espocrm_id (Rueckreferenz zu EspoCRM CDokument)
|
||
- description (Dokumentbeschreibung)
|
||
- advoware_art (Advoware Dokumenten-Art)
|
||
- advoware_bemerkung (Advoware Bemerkung/Notiz)
|
||
|
||
Returns:
|
||
dict mit 'id', 'name', 'run', 'meta_fields', etc.
|
||
"""
|
||
if mime_type == 'application/octet-stream' and filename.lower().endswith('.pdf'):
|
||
mime_type = 'application/pdf'
|
||
|
||
self._log(
|
||
f"📤 Uploading {len(file_content)} bytes to dataset {dataset_id}: "
|
||
f"{filename} ({mime_type})"
|
||
)
|
||
|
||
def _upload_and_tag():
|
||
rag = self._get_client()
|
||
datasets = rag.list_datasets(id=dataset_id)
|
||
if not datasets:
|
||
raise RuntimeError(f"Dataset not found: {dataset_id}")
|
||
dataset = datasets[0]
|
||
|
||
# Schritt 1: Upload
|
||
dataset.upload_documents([{
|
||
'display_name': filename,
|
||
'blob': file_content,
|
||
}])
|
||
|
||
# Dokument-ID ermitteln (neuestes mit passendem Namen)
|
||
base_name = filename.split('/')[-1]
|
||
docs = dataset.list_documents(keywords=base_name, page_size=10)
|
||
doc = None
|
||
for d in docs:
|
||
if d.name == filename or d.name == base_name:
|
||
doc = d
|
||
break
|
||
if doc is None and docs:
|
||
doc = docs[0] # Fallback
|
||
if doc is None:
|
||
raise RuntimeError(f"Document not found after upload: {filename}")
|
||
|
||
# Schritt 2: Meta-Fields setzen
|
||
meta: Dict[str, str] = {}
|
||
if blake3_hash:
|
||
meta['blake3_hash'] = blake3_hash
|
||
if espocrm_id:
|
||
meta['espocrm_id'] = espocrm_id
|
||
if description:
|
||
meta['description'] = description
|
||
if advoware_art:
|
||
meta['advoware_art'] = advoware_art
|
||
if advoware_bemerkung:
|
||
meta['advoware_bemerkung'] = advoware_bemerkung
|
||
|
||
if meta:
|
||
doc.update({'meta_fields': meta})
|
||
|
||
# Schritt 3: Parsing starten
|
||
dataset.async_parse_documents([doc.id])
|
||
|
||
return self._document_to_dict(doc)
|
||
|
||
result = await self._run(_upload_and_tag)
|
||
self._log(
|
||
f"✅ Document uploaded & parsing started: {result.get('id')} ({filename})"
|
||
)
|
||
return result
|
||
|
||
async def update_document_meta(
|
||
self,
|
||
dataset_id: str,
|
||
doc_id: str,
|
||
blake3_hash: Optional[str] = None,
|
||
description: Optional[str] = None,
|
||
advoware_art: Optional[str] = None,
|
||
advoware_bemerkung: Optional[str] = None,
|
||
) -> None:
|
||
"""
|
||
Aktualisiert nur die Metadaten eines Dokuments (ohne Re-Upload).
|
||
Entspricht xAI PATCH-Metadata-Only.
|
||
Startet Parsing neu, da Chunk-Injection von meta_fields abhaengt.
|
||
"""
|
||
self._log(f"✏️ Updating metadata for document {doc_id}")
|
||
|
||
def _update():
|
||
rag = self._get_client()
|
||
datasets = rag.list_datasets(id=dataset_id)
|
||
if not datasets:
|
||
raise RuntimeError(f"Dataset not found: {dataset_id}")
|
||
dataset = datasets[0]
|
||
docs = dataset.list_documents(id=doc_id)
|
||
if not docs:
|
||
raise RuntimeError(f"Document not found: {doc_id}")
|
||
doc = docs[0]
|
||
|
||
# Bestehende meta_fields lesen und mergen
|
||
existing_meta = _base_to_dict(doc.meta_fields) or {}
|
||
if blake3_hash is not None:
|
||
existing_meta['blake3_hash'] = blake3_hash
|
||
if description is not None:
|
||
existing_meta['description'] = description
|
||
if advoware_art is not None:
|
||
existing_meta['advoware_art'] = advoware_art
|
||
if advoware_bemerkung is not None:
|
||
existing_meta['advoware_bemerkung'] = advoware_bemerkung
|
||
|
||
doc.update({'meta_fields': existing_meta})
|
||
# Re-parsing noetig damit Chunks aktualisierte Metadata enthalten
|
||
dataset.async_parse_documents([doc.id])
|
||
|
||
await self._run(_update)
|
||
self._log(f"✅ Metadata updated and re-parsing started: {doc_id}")
|
||
|
||
async def remove_document(self, dataset_id: str, doc_id: str) -> None:
|
||
"""
|
||
Loescht ein Dokument aus einem Dataset.
|
||
Entspricht xAI remove_from_collection.
|
||
"""
|
||
self._log(f"🗑️ Removing document {doc_id} from dataset {dataset_id}")
|
||
|
||
def _delete():
|
||
rag = self._get_client()
|
||
datasets = rag.list_datasets(id=dataset_id)
|
||
if not datasets:
|
||
raise RuntimeError(f"Dataset not found: {dataset_id}")
|
||
datasets[0].delete_documents(ids=[doc_id])
|
||
|
||
await self._run(_delete)
|
||
self._log(f"✅ Document removed: {doc_id}")
|
||
|
||
async def list_documents(self, dataset_id: str) -> List[Dict]:
|
||
"""
|
||
Listet alle Dokumente in einem Dataset auf (paginiert).
|
||
Entspricht xAI list_collection_documents.
|
||
"""
|
||
self._log(f"📋 Listing documents in dataset {dataset_id}")
|
||
|
||
def _list():
|
||
rag = self._get_client()
|
||
datasets = rag.list_datasets(id=dataset_id)
|
||
if not datasets:
|
||
raise RuntimeError(f"Dataset not found: {dataset_id}")
|
||
dataset = datasets[0]
|
||
docs = []
|
||
page = 1
|
||
while True:
|
||
batch = dataset.list_documents(page=page, page_size=100)
|
||
if not batch:
|
||
break
|
||
docs.extend(batch)
|
||
if len(batch) < 100:
|
||
break
|
||
page += 1
|
||
return [self._document_to_dict(d) for d in docs]
|
||
|
||
result = await self._run(_list)
|
||
self._log(f"✅ Listed {len(result)} documents")
|
||
return result
|
||
|
||
async def get_document(self, dataset_id: str, doc_id: str) -> Optional[Dict]:
|
||
"""Holt ein einzelnes Dokument by ID. None wenn nicht gefunden."""
|
||
def _get():
|
||
rag = self._get_client()
|
||
datasets = rag.list_datasets(id=dataset_id)
|
||
if not datasets:
|
||
return None
|
||
docs = datasets[0].list_documents(id=doc_id)
|
||
if not docs:
|
||
return None
|
||
return self._document_to_dict(docs[0])
|
||
|
||
result = await self._run(_get)
|
||
if result:
|
||
self._log(f"📄 Document found: {result.get('name')} (run={result.get('run')})")
|
||
return result
|
||
|
||
async def wait_for_parsing(
|
||
self,
|
||
dataset_id: str,
|
||
doc_id: str,
|
||
timeout_seconds: int = 120,
|
||
poll_interval: float = 3.0,
|
||
) -> Dict:
|
||
"""
|
||
Wartet bis das Parsing eines Dokuments abgeschlossen ist.
|
||
|
||
Returns:
|
||
Aktueller Dokument-State als dict.
|
||
|
||
Raises:
|
||
TimeoutError: Wenn Parsing nicht innerhalb timeout_seconds fertig wird.
|
||
RuntimeError: Wenn Parsing fehlschlaegt.
|
||
"""
|
||
self._log(f"⏳ Waiting for parsing: {doc_id} (timeout={timeout_seconds}s)")
|
||
elapsed = 0.0
|
||
|
||
while elapsed < timeout_seconds:
|
||
doc = await self.get_document(dataset_id, doc_id)
|
||
if doc is None:
|
||
raise RuntimeError(f"Document disappeared during parsing: {doc_id}")
|
||
|
||
run_status = doc.get('run', 'UNSTART')
|
||
if run_status == 'DONE':
|
||
self._log(
|
||
f"✅ Parsing done: {doc_id} "
|
||
f"(chunks={doc.get('chunk_count')}, tokens={doc.get('token_count')})"
|
||
)
|
||
return doc
|
||
elif run_status in ('FAIL', 'CANCEL'):
|
||
raise RuntimeError(
|
||
f"Parsing failed for {doc_id}: status={run_status}, "
|
||
f"msg={doc.get('progress_msg', '')}"
|
||
)
|
||
|
||
await asyncio.sleep(poll_interval)
|
||
elapsed += poll_interval
|
||
|
||
raise TimeoutError(
|
||
f"Parsing timeout after {timeout_seconds}s for document {doc_id}"
|
||
)
|
||
|
||
# ========== MIME Type Support ==========
|
||
|
||
def is_mime_type_supported(self, mime_type: str) -> bool:
|
||
"""Prueft ob RAGFlow diesen MIME-Type verarbeiten kann."""
|
||
return mime_type.lower().strip() in self.SUPPORTED_MIME_TYPES
|
||
|
||
# ========== Internal Helpers ==========
|
||
|
||
def _dataset_to_dict(self, dataset) -> Dict:
|
||
"""Konvertiert RAGFlow DataSet Objekt zu dict (inkl. parser_config unwrap)."""
|
||
return {
|
||
'id': getattr(dataset, 'id', None),
|
||
'name': getattr(dataset, 'name', None),
|
||
'chunk_method': getattr(dataset, 'chunk_method', None),
|
||
'embedding_model': getattr(dataset, 'embedding_model', None),
|
||
'description': getattr(dataset, 'description', None),
|
||
'chunk_count': getattr(dataset, 'chunk_count', 0),
|
||
'document_count': getattr(dataset, 'document_count', 0),
|
||
'parser_config': _base_to_dict(getattr(dataset, 'parser_config', {})),
|
||
}
|
||
|
||
def _document_to_dict(self, doc) -> Dict:
|
||
"""
|
||
Konvertiert RAGFlow Document Objekt zu dict.
|
||
|
||
meta_fields wird via _base_to_dict() zu einem plain dict unwrapped.
|
||
Enthaelt blake3_hash, espocrm_id, description, advoware_art,
|
||
advoware_bemerkung sofern gesetzt.
|
||
"""
|
||
raw_meta = getattr(doc, 'meta_fields', None)
|
||
meta_dict = _base_to_dict(raw_meta) if raw_meta is not None else {}
|
||
|
||
return {
|
||
'id': getattr(doc, 'id', None),
|
||
'name': getattr(doc, 'name', None),
|
||
'dataset_id': getattr(doc, 'dataset_id', None),
|
||
'chunk_method': getattr(doc, 'chunk_method', None),
|
||
'size': getattr(doc, 'size', 0),
|
||
'token_count': getattr(doc, 'token_count', 0),
|
||
'chunk_count': getattr(doc, 'chunk_count', 0),
|
||
'run': getattr(doc, 'run', 'UNSTART'),
|
||
'progress': getattr(doc, 'progress', 0.0),
|
||
'progress_msg': getattr(doc, 'progress_msg', ''),
|
||
'source_type': getattr(doc, 'source_type', 'local'),
|
||
'created_by': getattr(doc, 'created_by', ''),
|
||
'process_duration': getattr(doc, 'process_duration', 0.0),
|
||
# Metadaten (blake3_hash hier drin wenn gesetzt)
|
||
'meta_fields': meta_dict,
|
||
'blake3_hash': meta_dict.get('blake3_hash'),
|
||
'espocrm_id': meta_dict.get('espocrm_id'),
|
||
'parser_config': _base_to_dict(getattr(doc, 'parser_config', None)),
|
||
}
|