feat: Implement AI Knowledge Sync Utilities and Event Handlers
- Added AIKnowledgeActivationStatus and AIKnowledgeSyncStatus enums to models.py for managing activation and sync states. - Introduced AIKnowledgeSync class in aiknowledge_sync_utils.py for synchronizing CAIKnowledge entities with XAI Collections, including collection lifecycle management, document synchronization, and metadata updates. - Created a daily cron job (aiknowledge_full_sync_cron_step.py) to perform a full sync of CAIKnowledge entities. - Developed an event handler (aiknowledge_sync_event_step.py) to synchronize CAIKnowledge entities with XAI Collections triggered by webhooks and cron jobs. - Implemented a webhook handler (aiknowledge_update_api_step.py) to receive updates from EspoCRM for CAIKnowledge entities and enqueue sync events. - Enhanced xai_service.py with methods for collection management, document listing, and metadata updates.
This commit is contained in:
622
services/aiknowledge_sync_utils.py
Normal file
622
services/aiknowledge_sync_utils.py
Normal file
@@ -0,0 +1,622 @@
|
||||
"""
|
||||
AI Knowledge Sync Utilities
|
||||
|
||||
Utility functions for synchronizing CAIKnowledge entities with XAI Collections:
|
||||
- Collection lifecycle management (create, delete)
|
||||
- Document synchronization with BLAKE3 hash verification
|
||||
- Metadata-only updates via PATCH
|
||||
- Orphan detection and cleanup
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
from datetime import datetime
|
||||
|
||||
from services.sync_utils_base import BaseSyncUtils
|
||||
from services.models import (
|
||||
AIKnowledgeActivationStatus,
|
||||
AIKnowledgeSyncStatus,
|
||||
JunctionSyncStatus
|
||||
)
|
||||
|
||||
|
||||
class AIKnowledgeSync(BaseSyncUtils):
|
||||
"""Utility class for AI Knowledge ↔ XAI Collections synchronization"""
|
||||
|
||||
def _get_lock_key(self, entity_id: str) -> str:
|
||||
"""Redis lock key for AI Knowledge entities"""
|
||||
return f"sync_lock:aiknowledge:{entity_id}"
|
||||
|
||||
async def acquire_sync_lock(self, knowledge_id: str) -> bool:
|
||||
"""
|
||||
Acquire distributed lock via Redis + update EspoCRM syncStatus.
|
||||
|
||||
Args:
|
||||
knowledge_id: CAIKnowledge entity ID
|
||||
|
||||
Returns:
|
||||
True if lock acquired, False if already locked
|
||||
"""
|
||||
try:
|
||||
# STEP 1: Atomic Redis lock
|
||||
lock_key = self._get_lock_key(knowledge_id)
|
||||
if not self._acquire_redis_lock(lock_key):
|
||||
self._log(f"Redis lock already active for {knowledge_id}", level='warn')
|
||||
return False
|
||||
|
||||
# STEP 2: Update syncStatus to pending_sync
|
||||
try:
|
||||
await self.espocrm.update_entity('CAIKnowledge', knowledge_id, {
|
||||
'syncStatus': AIKnowledgeSyncStatus.PENDING_SYNC.value
|
||||
})
|
||||
except Exception as e:
|
||||
self._log(f"Could not set syncStatus: {e}", level='debug')
|
||||
|
||||
self._log(f"Sync lock acquired for {knowledge_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self._log(f"Error acquiring lock: {e}", level='error')
|
||||
# Clean up Redis lock on error
|
||||
lock_key = self._get_lock_key(knowledge_id)
|
||||
self._release_redis_lock(lock_key)
|
||||
return False
|
||||
|
||||
async def release_sync_lock(
|
||||
self,
|
||||
knowledge_id: str,
|
||||
success: bool = True,
|
||||
error_message: Optional[str] = None
|
||||
) -> None:
|
||||
"""
|
||||
Release sync lock and set final status.
|
||||
|
||||
Args:
|
||||
knowledge_id: CAIKnowledge entity ID
|
||||
success: Whether sync succeeded
|
||||
error_message: Optional error message
|
||||
"""
|
||||
try:
|
||||
update_data = {
|
||||
'syncStatus': AIKnowledgeSyncStatus.SYNCED.value if success else AIKnowledgeSyncStatus.FAILED.value
|
||||
}
|
||||
|
||||
if success:
|
||||
update_data['lastSync'] = datetime.now().isoformat()
|
||||
update_data['syncError'] = None
|
||||
elif error_message:
|
||||
update_data['syncError'] = error_message[:2000]
|
||||
|
||||
await self.espocrm.update_entity('CAIKnowledge', knowledge_id, update_data)
|
||||
|
||||
self._log(f"Sync lock released: {knowledge_id} → {'success' if success else 'failed'}")
|
||||
|
||||
# Release Redis lock
|
||||
lock_key = self._get_lock_key(knowledge_id)
|
||||
self._release_redis_lock(lock_key)
|
||||
|
||||
except Exception as e:
|
||||
self._log(f"Error releasing lock: {e}", level='error')
|
||||
# Ensure Redis lock is released
|
||||
lock_key = self._get_lock_key(knowledge_id)
|
||||
self._release_redis_lock(lock_key)
|
||||
|
||||
    async def sync_knowledge_to_xai(self, knowledge_id: str, ctx) -> None:
        """
        Main sync orchestrator with activation status handling.

        Dispatches on the entity's activationStatus:
          - 'new':         create the XAI collection, flip entity to 'active'
          - 'deactivated': delete the collection and reset junction rows
          - 'paused':      no-op
          - 'active':      verify/recreate the collection, then sync documents

        Args:
            knowledge_id: CAIKnowledge entity ID
            ctx: Motia context for logging

        Raises:
            RuntimeError: status is 'active' but no collection ID is set.
            ValueError: unknown activationStatus value.
        """
        # Local imports avoid a circular import at module load time.
        from services.espocrm import EspoCRMAPI
        from services.xai_service import XAIService

        espocrm = EspoCRMAPI(ctx)
        xai = XAIService(ctx)
        # NOTE(review): only `xai` is closed in the finally block — confirm
        # whether EspoCRMAPI holds resources that need explicit cleanup too.

        try:
            # 1. Load knowledge entity
            knowledge = await espocrm.get_entity('CAIKnowledge', knowledge_id)

            activation_status = knowledge.get('activationStatus')
            # datenbankId stores the XAI collection ID on the EspoCRM side.
            collection_id = knowledge.get('datenbankId')

            ctx.logger.info("=" * 80)
            ctx.logger.info(f"📋 Processing: {knowledge['name']}")
            ctx.logger.info(f" activationStatus: {activation_status}")
            ctx.logger.info(f" datenbankId: {collection_id or 'NONE'}")
            ctx.logger.info("=" * 80)

            # ═══════════════════════════════════════════════════════════
            # CASE 1: NEW → Create Collection
            # ═══════════════════════════════════════════════════════════
            if activation_status == AIKnowledgeActivationStatus.NEW.value:
                ctx.logger.info("🆕 Status 'new' → Creating XAI Collection")

                collection = await xai.create_collection(
                    name=knowledge['name'],
                    metadata={
                        'espocrm_entity_type': 'CAIKnowledge',
                        'espocrm_entity_id': knowledge_id,
                        'created_at': datetime.now().isoformat()
                    }
                )

                collection_id = collection['id']

                # Update EspoCRM: Set datenbankId + change status to 'active'.
                # syncStatus 'unclean' makes the next sync pick up documents.
                await espocrm.update_entity('CAIKnowledge', knowledge_id, {
                    'datenbankId': collection_id,
                    'activationStatus': AIKnowledgeActivationStatus.ACTIVE.value,
                    'syncStatus': AIKnowledgeSyncStatus.UNCLEAN.value
                })

                ctx.logger.info(f"✅ Collection created: {collection_id}")
                ctx.logger.info(" Status changed to 'active', next webhook will sync documents")
                return

            # ═══════════════════════════════════════════════════════════
            # CASE 2: DEACTIVATED → Delete Collection from XAI
            # ═══════════════════════════════════════════════════════════
            elif activation_status == AIKnowledgeActivationStatus.DEACTIVATED.value:
                ctx.logger.info("🗑️ Status 'deactivated' → Deleting XAI Collection")

                if collection_id:
                    try:
                        await xai.delete_collection(collection_id)
                        ctx.logger.info(f"✅ Collection deleted from XAI: {collection_id}")
                    except Exception as e:
                        # Deliberately non-fatal: the junction reset below still runs.
                        ctx.logger.error(f"❌ Failed to delete collection: {e}")
                else:
                    ctx.logger.info("⏭️ No collection ID, nothing to delete")

                # Update junction entries: back to 'new' with no XAI document ID,
                # so a future re-activation re-uploads everything from scratch.
                junction_entries = await espocrm.get_junction_entries(
                    'CAIKnowledgeCDokumente',
                    'cAIKnowledgeId',
                    knowledge_id
                )

                for junction in junction_entries:
                    await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction['id'], {
                        'syncstatus': JunctionSyncStatus.NEW.value,
                        'aiDocumentId': None
                    })

                ctx.logger.info(f"✅ Deactivation complete, {len(junction_entries)} junction entries reset")
                return

            # ═══════════════════════════════════════════════════════════
            # CASE 3: PAUSED → Skip Sync
            # ═══════════════════════════════════════════════════════════
            elif activation_status == AIKnowledgeActivationStatus.PAUSED.value:
                ctx.logger.info("⏸️ Status 'paused' → No sync performed")
                return

            # ═══════════════════════════════════════════════════════════
            # CASE 4: ACTIVE → Normal Sync
            # ═══════════════════════════════════════════════════════════
            elif activation_status == AIKnowledgeActivationStatus.ACTIVE.value:
                if not collection_id:
                    ctx.logger.error("❌ Status 'active' but no datenbankId!")
                    raise RuntimeError("Active knowledge without collection ID")

                ctx.logger.info(f"🔄 Status 'active' → Syncing documents to {collection_id}")

                # Verify collection exists (it may have been deleted out-of-band).
                collection = await xai.get_collection(collection_id)
                if not collection:
                    ctx.logger.warn(f"⚠️ Collection {collection_id} not found, recreating")
                    collection = await xai.create_collection(
                        name=knowledge['name'],
                        metadata={
                            'espocrm_entity_type': 'CAIKnowledge',
                            'espocrm_entity_id': knowledge_id
                        }
                    )
                    collection_id = collection['id']
                    await espocrm.update_entity('CAIKnowledge', knowledge_id, {
                        'datenbankId': collection_id
                    })

                # Sync documents
                await self._sync_knowledge_documents(knowledge_id, collection_id, ctx)

            else:
                ctx.logger.error(f"❌ Unknown activationStatus: {activation_status}")
                raise ValueError(f"Invalid activationStatus: {activation_status}")

        finally:
            await xai.close()
|
||||
|
||||
async def _sync_knowledge_documents(
|
||||
self,
|
||||
knowledge_id: str,
|
||||
collection_id: str,
|
||||
ctx
|
||||
) -> None:
|
||||
"""
|
||||
Sync all documents of a knowledge base to XAI collection.
|
||||
|
||||
Args:
|
||||
knowledge_id: CAIKnowledge entity ID
|
||||
collection_id: XAI Collection ID
|
||||
ctx: Motia context
|
||||
"""
|
||||
from services.espocrm import EspoCRMAPI
|
||||
from services.xai_service import XAIService
|
||||
|
||||
espocrm = EspoCRMAPI(ctx)
|
||||
xai = XAIService(ctx)
|
||||
|
||||
# Load junction entries
|
||||
junction_entries = await espocrm.get_junction_entries(
|
||||
'CAIKnowledgeCDokumente',
|
||||
'cAIKnowledgeId',
|
||||
knowledge_id
|
||||
)
|
||||
|
||||
ctx.logger.info(f"📊 Found {len(junction_entries)} junction entries")
|
||||
|
||||
if not junction_entries:
|
||||
ctx.logger.info("✅ No documents to sync")
|
||||
return
|
||||
|
||||
# Load documents
|
||||
documents = {}
|
||||
for junction in junction_entries:
|
||||
doc_id = junction['cDokumenteId']
|
||||
try:
|
||||
doc = await espocrm.get_entity('CDokumente', doc_id)
|
||||
documents[doc_id] = doc
|
||||
except Exception as e:
|
||||
ctx.logger.error(f"❌ Failed to load document {doc_id}: {e}")
|
||||
|
||||
ctx.logger.info(f"📊 Loaded {len(documents)}/{len(junction_entries)} documents")
|
||||
|
||||
# Sync each document
|
||||
successful = 0
|
||||
failed = 0
|
||||
skipped = 0
|
||||
|
||||
for junction in junction_entries:
|
||||
doc_id = junction['cDokumenteId']
|
||||
document = documents.get(doc_id)
|
||||
|
||||
if not document:
|
||||
failed += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
synced = await self._sync_single_document(junction, document, collection_id, ctx)
|
||||
if synced:
|
||||
successful += 1
|
||||
else:
|
||||
skipped += 1
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
ctx.logger.error(f"❌ Failed to sync document {doc_id}: {e}")
|
||||
|
||||
# Mark as failed
|
||||
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction['id'], {
|
||||
'syncstatus': JunctionSyncStatus.FAILED.value
|
||||
})
|
||||
|
||||
# Remove orphans
|
||||
try:
|
||||
await self._remove_orphaned_documents(collection_id, junction_entries, ctx)
|
||||
except Exception as e:
|
||||
ctx.logger.warn(f"⚠️ Failed to remove orphans: {e}")
|
||||
|
||||
# Summary
|
||||
ctx.logger.info("=" * 80)
|
||||
ctx.logger.info(f"📊 Sync Statistics:")
|
||||
ctx.logger.info(f" ✅ Synced: {successful}")
|
||||
ctx.logger.info(f" ⏭️ Skipped: {skipped}")
|
||||
ctx.logger.info(f" ❌ Failed: {failed}")
|
||||
ctx.logger.info("=" * 80)
|
||||
|
||||
async def _sync_single_document(
|
||||
self,
|
||||
junction_entry: Dict,
|
||||
document: Dict,
|
||||
collection_id: str,
|
||||
ctx
|
||||
) -> bool:
|
||||
"""
|
||||
Sync one document to XAI Collection with BLAKE3 verification.
|
||||
|
||||
Args:
|
||||
junction_entry: Junction table entry
|
||||
document: CDokumente entity
|
||||
collection_id: XAI Collection ID
|
||||
ctx: Motia context
|
||||
|
||||
Returns:
|
||||
True if synced, False if skipped
|
||||
"""
|
||||
from services.espocrm import EspoCRMAPI
|
||||
from services.xai_service import XAIService
|
||||
|
||||
espocrm = EspoCRMAPI(ctx)
|
||||
xai = XAIService(ctx)
|
||||
|
||||
junction_id = junction_entry['id']
|
||||
junction_status = junction_entry.get('syncstatus')
|
||||
junction_ai_doc_id = junction_entry.get('aiDocumentId')
|
||||
|
||||
# 1. Check MIME type support
|
||||
mime_type = document.get('mimeType') or 'application/octet-stream'
|
||||
if not xai.is_mime_type_supported(mime_type):
|
||||
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
|
||||
'syncstatus': JunctionSyncStatus.UNSUPPORTED.value
|
||||
})
|
||||
ctx.logger.info(f"⏭️ Unsupported MIME: {document['name']}")
|
||||
return False
|
||||
|
||||
# 2. Calculate hashes
|
||||
current_file_hash = document.get('md5') or document.get('sha256')
|
||||
if not current_file_hash:
|
||||
ctx.logger.error(f"❌ No hash for document {document['id']}")
|
||||
return False
|
||||
|
||||
current_metadata_hash = self._calculate_metadata_hash(document)
|
||||
|
||||
synced_file_hash = junction_entry.get('syncedHash')
|
||||
synced_metadata_hash = junction_entry.get('syncedMetadataHash')
|
||||
xai_blake3_hash = junction_entry.get('xaiBlake3Hash')
|
||||
|
||||
# 3. Determine changes
|
||||
file_changed = (current_file_hash != synced_file_hash)
|
||||
metadata_changed = (current_metadata_hash != synced_metadata_hash)
|
||||
|
||||
ctx.logger.info(f"📋 {document['name']}")
|
||||
ctx.logger.info(f" File changed: {file_changed}, Metadata changed: {metadata_changed}")
|
||||
|
||||
# 4. Early return if nothing changed
|
||||
if junction_status == JunctionSyncStatus.SYNCED.value and junction_ai_doc_id:
|
||||
if not file_changed and not metadata_changed:
|
||||
# Verify document still exists in XAI
|
||||
try:
|
||||
doc_info = await xai.get_collection_document(collection_id, junction_ai_doc_id)
|
||||
if doc_info:
|
||||
ctx.logger.info(f" ✅ Already synced (verified)")
|
||||
return False
|
||||
else:
|
||||
ctx.logger.warn(f" ⚠️ Document missing in XAI, re-uploading")
|
||||
except Exception as e:
|
||||
ctx.logger.warn(f" ⚠️ Could not verify: {e}")
|
||||
|
||||
# 5. Handle file content change (re-upload)
|
||||
if file_changed or not junction_ai_doc_id:
|
||||
ctx.logger.info(f" 🔄 {'File changed' if file_changed else 'New file'}, uploading")
|
||||
|
||||
# Download from EspoCRM
|
||||
download_info = await self._get_document_download_info(document, ctx)
|
||||
if not download_info:
|
||||
raise RuntimeError(f"Cannot download document {document['id']}")
|
||||
|
||||
file_content = await espocrm.download_attachment(download_info['attachment_id'])
|
||||
|
||||
# Build metadata
|
||||
metadata = self._build_xai_metadata(document)
|
||||
|
||||
# Upload to XAI
|
||||
xai_file_id = await xai.upload_document_with_metadata(
|
||||
collection_id=collection_id,
|
||||
file_content=file_content,
|
||||
filename=download_info['filename'],
|
||||
mime_type=download_info['mime_type'],
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
ctx.logger.info(f" ✅ Uploaded → {xai_file_id}")
|
||||
|
||||
# Verify upload
|
||||
ctx.logger.info(f" 🔍 Verifying upload...")
|
||||
success, blake3_hash = await xai.verify_upload_integrity(
|
||||
collection_id=collection_id,
|
||||
file_id=xai_file_id
|
||||
)
|
||||
|
||||
if not success:
|
||||
ctx.logger.error(f" ❌ Upload verification failed!")
|
||||
raise RuntimeError("Upload verification failed")
|
||||
|
||||
ctx.logger.info(f" ✅ Verified: {blake3_hash[:32]}...")
|
||||
|
||||
# Update junction
|
||||
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
|
||||
'aiDocumentId': xai_file_id,
|
||||
'syncstatus': JunctionSyncStatus.SYNCED.value,
|
||||
'syncedHash': current_file_hash,
|
||||
'xaiBlake3Hash': blake3_hash,
|
||||
'syncedMetadataHash': current_metadata_hash,
|
||||
'lastSync': datetime.now().isoformat()
|
||||
})
|
||||
|
||||
return True
|
||||
|
||||
# 6. Handle metadata-only change
|
||||
elif metadata_changed:
|
||||
ctx.logger.info(f" 📝 Metadata changed, updating")
|
||||
|
||||
xai_file_id = junction_ai_doc_id
|
||||
metadata = self._build_xai_metadata(document)
|
||||
|
||||
try:
|
||||
# Try PATCH
|
||||
await xai.update_document_metadata(collection_id, xai_file_id, metadata)
|
||||
ctx.logger.info(f" ✅ Metadata updated")
|
||||
|
||||
# Get BLAKE3 hash
|
||||
success, blake3_hash = await xai.verify_upload_integrity(
|
||||
collection_id, xai_file_id
|
||||
)
|
||||
|
||||
# Update junction
|
||||
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
|
||||
'syncstatus': JunctionSyncStatus.SYNCED.value,
|
||||
'syncedMetadataHash': current_metadata_hash,
|
||||
'xaiBlake3Hash': blake3_hash if success else xai_blake3_hash,
|
||||
'lastSync': datetime.now().isoformat()
|
||||
})
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
ctx.logger.warn(f" ⚠️ PATCH failed, re-uploading: {e}")
|
||||
|
||||
# Fallback: Re-upload
|
||||
download_info = await self._get_document_download_info(document, ctx)
|
||||
file_content = await espocrm.download_attachment(download_info['attachment_id'])
|
||||
|
||||
await xai.remove_from_collection(collection_id, xai_file_id)
|
||||
|
||||
xai_file_id = await xai.upload_document_with_metadata(
|
||||
collection_id=collection_id,
|
||||
file_content=file_content,
|
||||
filename=download_info['filename'],
|
||||
mime_type=download_info['mime_type'],
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
success, blake3_hash = await xai.verify_upload_integrity(
|
||||
collection_id, xai_file_id
|
||||
)
|
||||
|
||||
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
|
||||
'aiDocumentId': xai_file_id,
|
||||
'syncstatus': JunctionSyncStatus.SYNCED.value,
|
||||
'syncedHash': current_file_hash,
|
||||
'xaiBlake3Hash': blake3_hash,
|
||||
'syncedMetadataHash': current_metadata_hash,
|
||||
'lastSync': datetime.now().isoformat()
|
||||
})
|
||||
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
async def _remove_orphaned_documents(
|
||||
self,
|
||||
collection_id: str,
|
||||
junction_entries: List[Dict],
|
||||
ctx
|
||||
) -> None:
|
||||
"""
|
||||
Remove documents from XAI that are no longer in junction table.
|
||||
|
||||
Args:
|
||||
collection_id: XAI Collection ID
|
||||
junction_entries: List of junction entries
|
||||
ctx: Motia context
|
||||
"""
|
||||
from services.xai_service import XAIService
|
||||
|
||||
xai = XAIService(ctx)
|
||||
|
||||
# Get all XAI file_ids
|
||||
xai_docs = await xai.list_collection_documents(collection_id)
|
||||
xai_file_ids = {doc.get('file_id') or doc.get('id') for doc in xai_docs if doc.get('file_id') or doc.get('id')}
|
||||
|
||||
# Get all junction file_ids
|
||||
junction_file_ids = {j['aiDocumentId'] for j in junction_entries if j.get('aiDocumentId')}
|
||||
|
||||
# Find orphans
|
||||
orphans = xai_file_ids - junction_file_ids
|
||||
|
||||
if orphans:
|
||||
ctx.logger.info(f"🗑️ Removing {len(orphans)} orphaned documents")
|
||||
for orphan_id in orphans:
|
||||
try:
|
||||
await xai.remove_from_collection(collection_id, orphan_id)
|
||||
ctx.logger.info(f" ✅ Removed orphan: {orphan_id}")
|
||||
except Exception as e:
|
||||
ctx.logger.warn(f" ⚠️ Failed to remove {orphan_id}: {e}")
|
||||
else:
|
||||
ctx.logger.info("✅ No orphaned documents found")
|
||||
|
||||
def _calculate_metadata_hash(self, document: Dict) -> str:
|
||||
"""
|
||||
Calculate hash of sync-relevant metadata.
|
||||
|
||||
Args:
|
||||
document: CDokumente entity
|
||||
|
||||
Returns:
|
||||
MD5 hash (32 chars)
|
||||
"""
|
||||
metadata = {
|
||||
'name': document.get('name', ''),
|
||||
'description': document.get('description', ''),
|
||||
}
|
||||
|
||||
metadata_str = json.dumps(metadata, sort_keys=True)
|
||||
return hashlib.md5(metadata_str.encode()).hexdigest()
|
||||
|
||||
def _build_xai_metadata(self, document: Dict) -> Dict[str, str]:
|
||||
"""
|
||||
Build XAI metadata from CDokumente entity.
|
||||
|
||||
Args:
|
||||
document: CDokumente entity
|
||||
|
||||
Returns:
|
||||
Metadata dict for XAI
|
||||
"""
|
||||
return {
|
||||
'document_name': document.get('name', ''),
|
||||
'description': document.get('description', ''),
|
||||
'created_at': document.get('createdAt', ''),
|
||||
'modified_at': document.get('modifiedAt', ''),
|
||||
'espocrm_id': document.get('id', '')
|
||||
}
|
||||
|
||||
async def _get_document_download_info(
|
||||
self,
|
||||
document: Dict,
|
||||
ctx
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get download info for CDokumente entity.
|
||||
|
||||
Args:
|
||||
document: CDokumente entity
|
||||
ctx: Motia context
|
||||
|
||||
Returns:
|
||||
Dict with attachment_id, filename, mime_type
|
||||
"""
|
||||
from services.espocrm import EspoCRMAPI
|
||||
|
||||
espocrm = EspoCRMAPI(ctx)
|
||||
|
||||
# Check for dokumentId (CDokumente custom field)
|
||||
attachment_id = None
|
||||
filename = None
|
||||
|
||||
if document.get('dokumentId'):
|
||||
attachment_id = document.get('dokumentId')
|
||||
filename = document.get('dokumentName')
|
||||
elif document.get('fileId'):
|
||||
attachment_id = document.get('fileId')
|
||||
filename = document.get('fileName')
|
||||
|
||||
if not attachment_id:
|
||||
ctx.logger.error(f"❌ No attachment ID for document {document['id']}")
|
||||
return None
|
||||
|
||||
# Get attachment details
|
||||
try:
|
||||
attachment = await espocrm.get_entity('Attachment', attachment_id)
|
||||
return {
|
||||
'attachment_id': attachment_id,
|
||||
'filename': filename or attachment.get('name', 'unknown'),
|
||||
'mime_type': attachment.get('type', 'application/octet-stream')
|
||||
}
|
||||
except Exception as e:
|
||||
ctx.logger.error(f"❌ Failed to get attachment {attachment_id}: {e}")
|
||||
return None
|
||||
@@ -473,3 +473,70 @@ class EspoCRMAPI:
|
||||
except aiohttp.ClientError as e:
|
||||
self._log(f"Download failed: {e}", level='error')
|
||||
raise EspoCRMError(f"Download request failed: {e}") from e
|
||||
|
||||
# ========== Junction Table Operations ==========
|
||||
|
||||
async def get_junction_entries(
|
||||
self,
|
||||
junction_entity: str,
|
||||
filter_field: str,
|
||||
filter_value: str,
|
||||
max_size: int = 1000
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Load junction table entries with filtering.
|
||||
|
||||
Args:
|
||||
junction_entity: Junction entity name (e.g., 'CAIKnowledgeCDokumente')
|
||||
filter_field: Field to filter on (e.g., 'cAIKnowledgeId')
|
||||
filter_value: Value to match
|
||||
max_size: Maximum entries to return
|
||||
|
||||
Returns:
|
||||
List of junction records with ALL additionalColumns
|
||||
|
||||
Example:
|
||||
entries = await espocrm.get_junction_entries(
|
||||
'CAIKnowledgeCDokumente',
|
||||
'cAIKnowledgeId',
|
||||
'kb-123'
|
||||
)
|
||||
"""
|
||||
self._log(f"Loading junction entries: {junction_entity} where {filter_field}={filter_value}")
|
||||
|
||||
result = await self.list_entities(
|
||||
junction_entity,
|
||||
where=[{
|
||||
'type': 'equals',
|
||||
'attribute': filter_field,
|
||||
'value': filter_value
|
||||
}],
|
||||
max_size=max_size
|
||||
)
|
||||
|
||||
entries = result.get('list', [])
|
||||
self._log(f"✅ Loaded {len(entries)} junction entries")
|
||||
return entries
|
||||
|
||||
async def update_junction_entry(
|
||||
self,
|
||||
junction_entity: str,
|
||||
junction_id: str,
|
||||
fields: Dict[str, Any]
|
||||
) -> None:
|
||||
"""
|
||||
Update junction table entry.
|
||||
|
||||
Args:
|
||||
junction_entity: Junction entity name
|
||||
junction_id: Junction entry ID
|
||||
fields: Fields to update
|
||||
|
||||
Example:
|
||||
await espocrm.update_junction_entry(
|
||||
'CAIKnowledgeCDokumente',
|
||||
'jct-123',
|
||||
{'syncstatus': 'synced', 'lastSync': '2026-03-11T20:00:00Z'}
|
||||
)
|
||||
"""
|
||||
await self.update_entity(junction_entity, junction_id, fields)
|
||||
|
||||
@@ -68,6 +68,40 @@ class SalutationType(str, Enum):
|
||||
FIRMA = ""
|
||||
|
||||
|
||||
class AIKnowledgeActivationStatus(str, Enum):
    """Activation lifecycle of a CAIKnowledge collection."""

    NEW = "new"                  # collection not yet created in XAI
    ACTIVE = "active"            # collection live, sync running
    PAUSED = "paused"            # collection exists but sync is suspended
    DEACTIVATED = "deactivated"  # collection deleted from XAI

    def __str__(self) -> str:
        # Render as the raw value so the enum can be dropped into payloads/logs.
        return self.value
|
||||
|
||||
|
||||
class AIKnowledgeSyncStatus(str, Enum):
    """Sync state of a CAIKnowledge entity."""

    UNCLEAN = "unclean"            # changes pending, sync required
    PENDING_SYNC = "pending_sync"  # sync in progress (entity is locked)
    SYNCED = "synced"              # everything synchronized
    FAILED = "failed"              # last sync attempt failed

    def __str__(self) -> str:
        # Render as the raw value so the enum can be dropped into payloads/logs.
        return self.value
|
||||
|
||||
|
||||
class JunctionSyncStatus(str, Enum):
    """Per-document sync state in junction tables (CAIKnowledgeCDokumente)."""

    NEW = "new"                  # never uploaded to XAI
    UNCLEAN = "unclean"          # changed since last sync
    SYNCED = "synced"            # uploaded and verified
    FAILED = "failed"            # last sync attempt failed
    UNSUPPORTED = "unsupported"  # MIME type not accepted by XAI

    def __str__(self) -> str:
        # Render as the raw value so the enum can be dropped into payloads/logs.
        return self.value
|
||||
|
||||
|
||||
# ========== Advoware Models ==========
|
||||
|
||||
class AdvowareBeteiligteBase(BaseModel):
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
"""xAI Files & Collections Service"""
|
||||
import os
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from services.logging_utils import get_service_logger
|
||||
|
||||
XAI_FILES_URL = "https://api.x.ai"
|
||||
@@ -173,3 +174,392 @@ class XAIService:
|
||||
f"⚠️ Fehler beim Entfernen aus Collection {collection_id}: {e}",
|
||||
level='warn'
|
||||
)
|
||||
|
||||
# ========== Collection Management ==========
|
||||
|
||||
async def create_collection(
|
||||
self,
|
||||
name: str,
|
||||
metadata: Optional[Dict[str, str]] = None,
|
||||
field_definitions: Optional[List[Dict]] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Erstellt eine neue xAI Collection.
|
||||
|
||||
POST https://management-api.x.ai/v1/collections
|
||||
|
||||
Args:
|
||||
name: Collection name
|
||||
metadata: Optional metadata dict
|
||||
field_definitions: Optional field definitions for metadata fields
|
||||
|
||||
Returns:
|
||||
Collection object mit 'id' field
|
||||
|
||||
Raises:
|
||||
RuntimeError: bei HTTP-Fehler
|
||||
"""
|
||||
self._log(f"📚 Creating collection: {name}")
|
||||
|
||||
# Standard field definitions für document metadata
|
||||
if field_definitions is None:
|
||||
field_definitions = [
|
||||
{"key": "document_name", "inject_into_chunk": True},
|
||||
{"key": "description", "inject_into_chunk": True},
|
||||
{"key": "created_at", "inject_into_chunk": False},
|
||||
{"key": "modified_at", "inject_into_chunk": False},
|
||||
{"key": "espocrm_id", "inject_into_chunk": False}
|
||||
]
|
||||
|
||||
session = await self._get_session()
|
||||
url = f"{XAI_MANAGEMENT_URL}/v1/collections"
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self.management_key}",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
body = {
|
||||
"collection_name": name,
|
||||
"field_definitions": field_definitions
|
||||
}
|
||||
|
||||
# Add metadata if provided
|
||||
if metadata:
|
||||
body["metadata"] = metadata
|
||||
|
||||
async with session.post(url, json=body, headers=headers) as response:
|
||||
if response.status not in (200, 201):
|
||||
raw = await response.text()
|
||||
raise RuntimeError(
|
||||
f"Failed to create collection ({response.status}): {raw}"
|
||||
)
|
||||
|
||||
data = await response.json()
|
||||
|
||||
collection_id = data.get('id')
|
||||
self._log(f"✅ Collection created: {collection_id}")
|
||||
return data
|
||||
|
||||
async def get_collection(self, collection_id: str) -> Optional[Dict]:
|
||||
"""
|
||||
Holt Collection-Details.
|
||||
|
||||
GET https://management-api.x.ai/v1/collections/{collection_id}
|
||||
|
||||
Returns:
|
||||
Collection object or None if not found
|
||||
|
||||
Raises:
|
||||
RuntimeError: bei HTTP-Fehler (außer 404)
|
||||
"""
|
||||
self._log(f"📄 Getting collection: {collection_id}")
|
||||
|
||||
session = await self._get_session()
|
||||
url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}"
|
||||
headers = {"Authorization": f"Bearer {self.management_key}"}
|
||||
|
||||
async with session.get(url, headers=headers) as response:
|
||||
if response.status == 404:
|
||||
self._log(f"⚠️ Collection not found: {collection_id}", level='warn')
|
||||
return None
|
||||
|
||||
if response.status not in (200,):
|
||||
raw = await response.text()
|
||||
raise RuntimeError(
|
||||
f"Failed to get collection ({response.status}): {raw}"
|
||||
)
|
||||
|
||||
data = await response.json()
|
||||
|
||||
self._log(f"✅ Collection retrieved: {data.get('collection_name', 'N/A')}")
|
||||
return data
|
||||
|
||||
async def delete_collection(self, collection_id: str) -> None:
|
||||
"""
|
||||
Löscht eine XAI Collection.
|
||||
|
||||
DELETE https://management-api.x.ai/v1/collections/{collection_id}
|
||||
|
||||
NOTE: Documents in der Collection werden NICHT gelöscht!
|
||||
Sie können noch in anderen Collections sein.
|
||||
|
||||
Raises:
|
||||
RuntimeError: bei HTTP-Fehler
|
||||
"""
|
||||
self._log(f"🗑️ Deleting collection {collection_id}")
|
||||
|
||||
session = await self._get_session()
|
||||
url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}"
|
||||
headers = {"Authorization": f"Bearer {self.management_key}"}
|
||||
|
||||
async with session.delete(url, headers=headers) as response:
|
||||
if response.status not in (200, 204):
|
||||
raw = await response.text()
|
||||
raise RuntimeError(
|
||||
f"Failed to delete collection {collection_id} ({response.status}): {raw}"
|
||||
)
|
||||
|
||||
self._log(f"✅ Collection deleted: {collection_id}")
|
||||
|
||||
async def list_collection_documents(self, collection_id: str) -> List[Dict]:
|
||||
"""
|
||||
Listet alle Dokumente in einer Collection.
|
||||
|
||||
GET https://management-api.x.ai/v1/collections/{collection_id}/documents
|
||||
|
||||
Returns:
|
||||
List von document objects mit file_id, filename, hash, fields
|
||||
|
||||
Raises:
|
||||
RuntimeError: bei HTTP-Fehler
|
||||
"""
|
||||
self._log(f"📋 Listing documents in collection {collection_id}")
|
||||
|
||||
session = await self._get_session()
|
||||
url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents"
|
||||
headers = {"Authorization": f"Bearer {self.management_key}"}
|
||||
|
||||
async with session.get(url, headers=headers) as response:
|
||||
if response.status not in (200,):
|
||||
raw = await response.text()
|
||||
raise RuntimeError(
|
||||
f"Failed to list documents ({response.status}): {raw}"
|
||||
)
|
||||
|
||||
data = await response.json()
|
||||
|
||||
# API sollte eine Liste zurückgeben oder ein dict mit 'documents' key
|
||||
if isinstance(data, list):
|
||||
documents = data
|
||||
elif isinstance(data, dict) and 'documents' in data:
|
||||
documents = data['documents']
|
||||
else:
|
||||
documents = []
|
||||
|
||||
self._log(f"✅ Listed {len(documents)} documents")
|
||||
return documents
|
||||
|
||||
async def get_collection_document(self, collection_id: str, file_id: str) -> Optional[Dict]:
    """
    Fetch details of a single document in an XAI collection.

    GET https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id}

    Args:
        collection_id: XAI Collection ID
        file_id: XAI file id

    Returns:
        Document info dict, e.g.::

            {
                'file_id': 'file_xyz',
                'filename': 'document.pdf',
                'hash': 'blake3:abcd1234...',  # BLAKE3 hash
                'fields': {...}                # metadata
            }

        or None when the document does not exist (HTTP 404).

    Raises:
        RuntimeError: on any HTTP status other than 200/404
    """
    self._log(f"📄 Getting document {file_id} from collection {collection_id}")

    session = await self._get_session()
    url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents/{file_id}"
    headers = {"Authorization": f"Bearer {self.management_key}"}

    async with session.get(url, headers=headers) as response:
        # A missing document is a normal outcome, not an error.
        if response.status == 404:
            return None

        if response.status != 200:
            raw = await response.text()
            raise RuntimeError(
                f"Failed to get document from collection ({response.status}): {raw}"
            )

        data = await response.json()

    self._log(f"✅ Document info retrieved: {data.get('filename', 'N/A')}")
    return data
async def update_document_metadata(
    self,
    collection_id: str,
    file_id: str,
    metadata: Dict[str, str]
) -> None:
    """
    Update only a document's metadata fields (no file upload involved).

    PATCH https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id}

    Args:
        collection_id: XAI Collection ID
        file_id: XAI file id
        metadata: updated metadata fields

    Raises:
        RuntimeError: on any HTTP status other than 200/204
    """
    self._log(f"📝 Updating metadata for document {file_id}")

    session = await self._get_session()
    url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents/{file_id}"
    headers = {
        "Authorization": f"Bearer {self.management_key}",
        "Content-Type": "application/json"
    }

    async with session.patch(url, json={"fields": metadata}, headers=headers) as response:
        if response.status not in (200, 204):
            raw = await response.text()
            raise RuntimeError(
                f"Failed to update document metadata ({response.status}): {raw}"
            )

    self._log(f"✅ Metadata updated for {file_id}")
async def upload_document_with_metadata(
    self,
    collection_id: str,
    file_content: bytes,
    filename: str,
    mime_type: str,
    metadata: Dict[str, str]
) -> str:
    """
    Upload a file and attach it to a collection with metadata in one call.

    Args:
        collection_id: XAI Collection ID
        file_content: raw file bytes
        filename: filename
        mime_type: MIME type
        metadata: metadata fields

    Returns:
        XAI file_id

    Raises:
        RuntimeError: if the upload or the collection attach fails
    """
    # Step 1: upload the raw file to XAI.
    file_id = await self.upload_file(file_content, filename, mime_type)

    try:
        # Step 2: attach the uploaded file to the collection.
        # Note: metadata fields must be supplied with this POST.
        session = await self._get_session()
        url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents/{file_id}"
        headers = {
            "Authorization": f"Bearer {self.management_key}",
            "Content-Type": "application/json"
        }

        async with session.post(url, json={"fields": metadata}, headers=headers) as response:
            if response.status not in (200, 201):
                raw = await response.text()
                raise RuntimeError(
                    f"Failed to add file to collection with metadata ({response.status}): {raw}"
                )

        self._log(f"✅ File {file_id} added to collection {collection_id} with metadata")
        return file_id

    except Exception:
        # The file was uploaded but never attached to the collection,
        # so it may now linger as an orphan on the XAI side.
        self._log(f"⚠️ Failed to add to collection, file {file_id} may be orphaned", level='warn')
        raise
async def verify_upload_integrity(
    self,
    collection_id: str,
    file_id: str,
    retry_attempts: int = 3
) -> Tuple[bool, Optional[str]]:
    """
    Verify upload integrity via the BLAKE3 hash reported by XAI.

    Args:
        collection_id: XAI Collection ID
        file_id: XAI file id
        retry_attempts: number of attempts on transient errors
            (exponential backoff: 2s, 4s, 8s, ...)

    Returns:
        Tuple ``(success, blake3_hash)``; the hash is None on failure.
    """
    # BUGFIX: asyncio was referenced below but never imported by this
    # module (header imports only hashlib/json/typing/datetime), so the
    # retry path raised NameError. Function-scope import keeps the fix
    # local to the only user.
    import asyncio

    for attempt in range(1, retry_attempts + 1):
        try:
            doc_info = await self.get_collection_document(collection_id, file_id)

            if not doc_info:
                self._log(f"⚠️ Document {file_id} not found in collection", level='warn')
                return (False, None)

            blake3_hash = doc_info.get('hash')

            if not blake3_hash:
                self._log("⚠️ No hash returned by XAI API", level='warn')
                return (False, None)

            self._log(f"✅ Upload verified, BLAKE3: {blake3_hash[:32]}...")
            return (True, blake3_hash)

        except Exception as e:
            if attempt < retry_attempts:
                delay = 2 ** attempt  # exponential backoff
                self._log(f"⚠️ Verification failed (attempt {attempt}), retry in {delay}s", level='warn')
                await asyncio.sleep(delay)
            else:
                self._log(f"❌ Verification failed after {retry_attempts} attempts: {e}", level='error')
                return (False, None)

    # Defensive fallback; normally unreachable because every loop path
    # either returns or retries.
    return (False, None)
def is_mime_type_supported(self, mime_type: str) -> bool:
    """
    Check whether XAI supports the given MIME type.

    Args:
        mime_type: MIME type string (matched case-insensitively,
            surrounding whitespace is ignored)

    Returns:
        True if supported, False otherwise
    """
    # Supported MIME types based on the XAI documentation.
    supported = frozenset({
        # Documents
        'application/pdf',
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.ms-excel',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.oasis.opendocument.text',
        'application/epub+zip',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        # Text
        'text/plain',
        'text/html',
        'text/markdown',
        'text/csv',
        'text/xml',
        # Code
        'text/javascript',
        'application/json',
        'application/xml',
        'text/x-python',
        'text/x-java-source',
        'text/x-c',
        'text/x-c++src',
        # Other
        'application/zip',
    })

    # Normalize (lowercase, strip whitespace) before lookup.
    return mime_type.strip().lower() in supported
|
||||
|
||||
Reference in New Issue
Block a user