Files
motia-iii/services/xai_upload_utils.py
bsiggel 9b2fb5ae4a feat: Implement AI Knowledge Sync Utilities and RAGFlow Service
- Added `aiknowledge_sync_utils.py` for provider-agnostic synchronization logic for CAIKnowledge entities, supporting both xAI and RAGFlow.
- Introduced lifecycle management for CAIKnowledge entities including states: new, active, paused, and deactivated.
- Implemented change detection using Blake3 hash for efficient document synchronization.
- Created `ragflow_service.py` to handle dataset and document management with RAGFlow API.
- Added daily cron job in `aiknowledge_daily_cron_step.py` to synchronize active CAIKnowledge entities with unclean or failed statuses.
- Developed `aiknowledge_sync_event_step.py` to process synchronization events from webhooks and cron jobs.
2026-03-26 21:38:42 +00:00

315 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
xAI Upload Utilities
Shared logic for uploading documents from EspoCRM to xAI Collections.
Used by all sync flows (Advoware + direct xAI sync).
Handles:
- Blake3 hash-based change detection
- Upload to xAI with correct filename/MIME
- Collection management (create/verify)
- EspoCRM metadata update after sync
"""
from typing import Optional, Dict, Any
from datetime import datetime
class XAIUploadUtils:
    """
    Stateless utility class for document upload operations to xAI.

    Shared by all sync flows (Advoware + direct xAI sync). All methods take
    explicit service instances (``xai``, ``espocrm``) to remain reusable
    across different sync contexts.
    """

    def __init__(self, ctx):
        # Imported lazily so importing this module itself has no side effects.
        from services.logging_utils import get_service_logger
        self._log = get_service_logger(__name__, ctx)

    async def ensure_collection(
        self,
        akte: Dict[str, Any],
        xai,
        espocrm,
    ) -> Optional[str]:
        """
        Ensure an xAI collection exists for this Akte.

        Creates one if missing, verifies it if present. On creation the new
        collection id is written back to the CAkten entity and aiSyncStatus
        is set to 'unclean' so a full document sync is triggered.

        Args:
            akte: CAkten entity dict; must contain 'id', may contain 'name',
                'aktennummer' and 'aiCollectionId'.
            xai: xAI service (provides get_collection / create_collection).
            espocrm: EspoCRM service (provides update_entity).

        Returns:
            collection_id or None on failure.
        """
        akte_id = akte['id']
        akte_name = akte.get('name', f"Akte {akte.get('aktennummer', akte_id)}")
        collection_id = akte.get('aiCollectionId')

        if collection_id:
            # Verify the stored collection still exists in xAI; fall through
            # to re-creation when it is gone or verification errors out.
            try:
                col = await xai.get_collection(collection_id)
                if col:
                    self._log.debug(f"Collection {collection_id} verified for '{akte_name}'")
                    return collection_id
                self._log.warn(f"Collection {collection_id} not found in xAI, recreating...")
            except Exception as e:
                self._log.warn(f"Could not verify collection {collection_id}: {e}, recreating...")

        # Create new collection
        try:
            self._log.info(f"Creating xAI collection for '{akte_name}'...")
            col = await xai.create_collection(
                name=akte_name,
            )
            collection_id = col.get('collection_id') or col.get('id')
            if not collection_id:
                # BUGFIX: previously a response without an id was logged as
                # success and aiCollectionId=None was persisted to EspoCRM.
                self._log.error("❌ Failed to create xAI collection: no id in response")
                return None
            self._log.info(f"✅ Collection created: {collection_id}")
            # Save back to EspoCRM
            await espocrm.update_entity('CAkten', akte_id, {
                'aiCollectionId': collection_id,
                'aiSyncStatus': 'unclean',  # Trigger full doc sync
            })
            return collection_id
        except Exception as e:
            self._log.error(f"❌ Failed to create xAI collection: {e}")
            return None

    async def sync_document_to_xai(
        self,
        doc: Dict[str, Any],
        collection_id: str,
        xai,
        espocrm,
    ) -> bool:
        """
        Sync a single CDokumente entity to an xAI collection.

        Decision logic (Blake3-based):
        - aiSyncStatus in ['new', 'unclean', 'failed'] -> always sync
        - aiSyncStatus == 'synced' AND aiSyncHash == blake3hash -> skip (no change)
        - aiSyncStatus == 'synced' AND aiSyncHash != blake3hash -> re-upload (changed)
        - No attachment -> mark unsupported

        Args:
            doc: CDokumente entity dict; must contain 'id'.
            collection_id: Target xAI collection id.
            xai: xAI service instance.
            espocrm: EspoCRM service instance.

        Returns:
            True if synced/skipped successfully, False on error.
        """
        doc_id = doc['id']
        doc_name = doc.get('name', doc_id)
        ai_status = doc.get('aiSyncStatus', 'new')
        ai_sync_hash = doc.get('aiSyncHash')
        blake3_hash = doc.get('blake3hash')
        ai_file_id = doc.get('aiFileId')
        self._log.info(f" 📄 {doc_name}")
        self._log.info(f" aiSyncStatus={ai_status}, aiSyncHash={ai_sync_hash[:12] if ai_sync_hash else 'N/A'}..., blake3={blake3_hash[:12] if blake3_hash else 'N/A'}...")

        # File content unchanged (hash match) -> no re-upload needed
        if ai_status == 'synced' and ai_sync_hash and blake3_hash and ai_sync_hash == blake3_hash:
            if ai_file_id:
                self._log.info(f" ✅ Unverändert kein Re-Upload (hash match)")
            else:
                self._log.info(f" ⏭️ Skipped (hash match, kein aiFileId)")
            return True

        # Get attachment info
        attachment_id = doc.get('dokumentId')
        if not attachment_id:
            self._log.warn(f" ⚠️ No attachment (dokumentId missing) - marking unsupported")
            await espocrm.update_entity('CDokumente', doc_id, {
                'aiSyncStatus': 'unsupported',
                'aiLastSync': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            })
            return True  # Not an error, just unsupported

        try:
            # Download from EspoCRM
            self._log.info(f" 📥 Downloading attachment {attachment_id}...")
            file_content = await espocrm.download_attachment(attachment_id)
            self._log.info(f" Downloaded {len(file_content)} bytes")

            # Determine filename + MIME type (filename may be URL-encoded)
            filename = doc.get('dokumentName') or doc.get('name', 'document.bin')
            from urllib.parse import unquote
            filename = unquote(filename)
            import mimetypes
            mime_type, _ = mimetypes.guess_type(filename)
            if not mime_type:
                mime_type = 'application/octet-stream'

            # Remove old file from collection if updating
            if ai_file_id and ai_status != 'new':
                try:
                    await xai.remove_from_collection(collection_id, ai_file_id)
                    self._log.info(f" 🗑️ Removed old xAI file {ai_file_id}")
                except Exception:
                    pass  # Non-fatal - may already be gone

            # Metadata fields are set once at upload time; custom fields
            # CANNOT be updated afterwards. xAI rejects empty strings as
            # field values -> only send populated fields.
            fields_raw = {
                'document_name': doc.get('name', filename),
                'description': str(doc.get('beschreibung', '') or ''),
                'advoware_art': str(doc.get('advowareArt', '') or ''),
                'advoware_bemerkung': str(doc.get('advowareBemerkung', '') or ''),
                'espocrm_id': doc['id'],
                'created_at': str(doc.get('createdAt', '') or ''),
                'modified_at': str(doc.get('modifiedAt', '') or ''),
            }
            fields = {k: v for k, v in fields_raw.items() if v}

            # Single-request upload directly to collection incl. metadata fields
            # BUGFIX: log the actual filename (was the literal '(unknown)')
            self._log.info(f" 📤 Uploading '{filename}' ({mime_type}) with metadata...")
            new_xai_file_id = await xai.upload_to_collection(
                collection_id, file_content, filename, mime_type, fields=fields
            )
            self._log.info(f" ✅ Uploaded + metadata set: {new_xai_file_id}")

            # Update CDokumente with sync result
            now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            await espocrm.update_entity('CDokumente', doc_id, {
                'aiFileId': new_xai_file_id,
                'aiCollectionId': collection_id,
                'aiSyncHash': blake3_hash or doc.get('syncedHash'),
                'aiSyncStatus': 'synced',
                'aiLastSync': now,
            })
            self._log.info(f" ✅ EspoCRM updated")
            return True
        except Exception as e:
            self._log.error(f" ❌ Failed: {e}")
            await espocrm.update_entity('CDokumente', doc_id, {
                'aiSyncStatus': 'failed',
                'aiLastSync': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            })
            return False

    async def remove_document_from_xai(
        self,
        doc: Dict[str, Any],
        collection_id: str,
        xai,
        espocrm,
    ) -> None:
        """
        Remove a CDokumente from its xAI collection (called on DELETE).

        Resets aiFileId and aiSyncStatus on the entity so a later re-sync
        treats the document as new. No-op when no aiFileId is stored.
        """
        doc_id = doc['id']
        ai_file_id = doc.get('aiFileId')
        if not ai_file_id:
            return
        try:
            await xai.remove_from_collection(collection_id, ai_file_id)
            self._log.info(f" 🗑️ Removed {doc.get('name')} from xAI collection")
            await espocrm.update_entity('CDokumente', doc_id, {
                'aiFileId': None,
                'aiSyncStatus': 'new',
                'aiLastSync': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            })
        except Exception as e:
            # Best effort: removal failure is logged, not raised.
            self._log.warn(f" ⚠️ Could not remove from xAI: {e}")
class XAIProviderAdapter:
    """
    Adapter mapping XAIService onto the provider interface expected by
    AIKnowledgeSyncUtils.

    Interface (identical to RAGFlowService):
        ensure_dataset(name, description) -> dict with 'id'
        list_documents(dataset_id) -> list[dict] with 'id', 'name'
        upload_document(dataset_id, file_content, filename, mime_type,
                        blake3_hash, espocrm_id, description,
                        advoware_art, advoware_bemerkung) -> dict with 'id'
        update_document_meta(dataset_id, doc_id, ...) -> None
        remove_document(dataset_id, doc_id) -> None
        delete_dataset(dataset_id) -> None
        is_mime_type_supported(mime_type) -> bool
    """

    def __init__(self, ctx=None):
        from services.xai_service import XAIService
        from services.logging_utils import get_service_logger
        self._xai = XAIService(ctx)
        self._log = get_service_logger('xai_adapter', ctx)

    @staticmethod
    def _extract_id(payload: dict):
        # xAI responses carry the id either under 'collection_id' or 'id'.
        return payload.get('collection_id') or payload.get('id')

    async def ensure_dataset(self, name: str, description: str = '') -> dict:
        """Create or verify an xAI collection; returns {'id': ..., 'name': ...}."""
        found = await self._xai.get_collection_by_name(name)
        payload = found if found else await self._xai.create_collection(name=name)
        return {'id': self._extract_id(payload), 'name': name}

    async def list_documents(self, dataset_id: str) -> list:
        """List all documents in an xAI collection as [{'id', 'name'}, ...]."""
        entries = await self._xai.list_collection_documents(dataset_id)
        documents = []
        for entry in entries:
            documents.append({'id': entry.get('file_id'), 'name': entry.get('filename')})
        return documents

    async def upload_document(
        self,
        dataset_id: str,
        file_content: bytes,
        filename: str,
        mime_type: str = 'application/octet-stream',
        blake3_hash=None,
        espocrm_id=None,
        description=None,
        advoware_art=None,
        advoware_bemerkung=None,
    ) -> dict:
        """Upload a document into an xAI collection with metadata fields."""
        candidates = [
            ('document_name', filename),
            ('espocrm_id', espocrm_id or ''),
            ('description', description or ''),
            ('advoware_art', advoware_art or ''),
            ('advoware_bemerkung', advoware_bemerkung or ''),
        ]
        if blake3_hash:
            candidates.append(('blake3_hash', blake3_hash))
        # xAI rejects empty field values, so only populated entries are sent.
        meta = {key: value for key, value in candidates if value}
        uploaded_id = await self._xai.upload_to_collection(
            collection_id=dataset_id,
            file_content=file_content,
            filename=filename,
            mime_type=mime_type,
            fields=meta,
        )
        return {'id': uploaded_id, 'name': filename}

    async def update_document_meta(
        self,
        dataset_id: str,
        doc_id: str,
        blake3_hash=None,
        description=None,
        advoware_art=None,
        advoware_bemerkung=None,
    ) -> None:
        """
        Intentional no-op: xAI does not support PATCHing metadata.

        The caller drives re-upload instead (a changed syncedMetadataHash
        routes the document through the full upload path).
        """
        self._log.warn(
            "XAIProviderAdapter.update_document_meta: xAI unterstuetzt kein "
            "Metadaten-PATCH kein-op. Naechster Sync loest Re-Upload aus."
        )

    async def remove_document(self, dataset_id: str, doc_id: str) -> None:
        """Delete a document from the xAI collection (file stays in xAI Files API)."""
        await self._xai.remove_from_collection(dataset_id, doc_id)

    async def delete_dataset(self, dataset_id: str) -> None:
        """Delete the xAI collection itself."""
        await self._xai.delete_collection(dataset_id)

    def is_mime_type_supported(self, mime_type: str) -> bool:
        """Delegate MIME-type support check to the underlying XAIService."""
        return self._xai.is_mime_type_supported(mime_type)