feat: Enhance document synchronization by integrating CAIKnowledge handling and improving error logging

2026-03-12 22:30:11 +00:00
parent 8ed7cca432
commit 6bf2343a12
6 changed files with 492 additions and 362 deletions
--- a/services/aiknowledge_sync_utils.py
+++ b/services/aiknowledge_sync_utils.py
@@ -172,20 +172,25 @@ class AIKnowledgeSync(BaseSyncUtils):
                else:
                    ctx.logger.info("⏭️  No collection ID, nothing to delete")
-                # Update junction entries
+                # Reset junction entries
-                junction_entries = await espocrm.get_junction_entries(
+                documents = await espocrm.get_knowledge_documents_with_junction(knowledge_id)
-                    'CAIKnowledgeCDokumente',
+                
-                    'cAIKnowledgeId',
+                for doc in documents:
-                    knowledge_id
+                    doc_id = doc['documentId']
-                )
+                    try:
                        await espocrm.update_knowledge_document_junction(
                            knowledge_id,
                            doc_id,
                            {
                                'syncstatus': 'new',
                                'aiDocumentId': None
                            },
                            update_last_sync=False
                        )
                    except Exception as e:
                        ctx.logger.warn(f"⚠️  Failed to reset junction for {doc_id}: {e}")
-                for junction in junction_entries:
+                ctx.logger.info(f"✅ Deactivation complete, {len(documents)} junction entries reset")
                    await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction['id'], {
                        'syncstatus': JunctionSyncStatus.NEW.value,
                        'aiDocumentId': None
                    })
                ctx.logger.info(f"✅ Deactivation complete, {len(junction_entries)} junction entries reset")
                return
            # ═══════════════════════════════════════════════════════════
@@ -235,15 +240,20 @@ class AIKnowledgeSync(BaseSyncUtils):
        self,
        knowledge_id: str,
        collection_id: str,
-        ctx
+        ctx,
        full_sync: bool = False
    ) -> None:
        """
        Sync all documents of a knowledge base to XAI collection.
        Uses efficient JunctionData endpoint to get all documents with junction data
        and blake3 hashes in a single API call.
        Args:
            knowledge_id: CAIKnowledge entity ID
            collection_id: XAI Collection ID
            ctx: Motia context
            full_sync: If True, force Blake3 hash comparison for all documents (nightly cron)
        """
        from services.espocrm import EspoCRMAPI
        from services.xai_service import XAIService
@@ -251,294 +261,159 @@ class AIKnowledgeSync(BaseSyncUtils):
        espocrm = EspoCRMAPI(ctx)
        xai = XAIService(ctx)
-        # Load junction entries
+        # ═══════════════════════════════════════════════════════════════
-        junction_entries = await espocrm.get_junction_entries(
+        # STEP 1: Load all documents with junction data (single API call)
-            'CAIKnowledgeCDokumente',
+        # ═══════════════════════════════════════════════════════════════
-            'cAIKnowledgeId',
+        ctx.logger.info(f"📥 Loading documents with junction data for knowledge {knowledge_id}")
-            knowledge_id
+        
-        )
+        documents = await espocrm.get_knowledge_documents_with_junction(knowledge_id)
        ctx.logger.info(f"📊 Found {len(documents)} document(s)")
-        ctx.logger.info(f"📊 Found {len(junction_entries)} junction entries")
+        if not documents:
        if not junction_entries:
            ctx.logger.info("✅ No documents to sync")
            return
-        # Load documents
+        # ═══════════════════════════════════════════════════════════════
-        documents = {}
+        # STEP 2: Sync each document based on status/hash
-        for junction in junction_entries:
+        # ═══════════════════════════════════════════════════════════════
            doc_id = junction['cDokumenteId']
            try:
                doc = await espocrm.get_entity('CDokumente', doc_id)
                documents[doc_id] = doc
            except Exception as e:
                ctx.logger.error(f"❌ Failed to load document {doc_id}: {e}")
        ctx.logger.info(f"📊 Loaded {len(documents)}/{len(junction_entries)} documents")
        # Sync each document
        successful = 0
        failed = 0
        skipped = 0
-        for junction in junction_entries:
+        for doc in documents:
-            doc_id = junction['cDokumenteId']
+            doc_id = doc['documentId']
-            document = documents.get(doc_id)
+            doc_name = doc.get('documentName', 'Unknown')
-
+            junction_status = doc.get('syncstatus', 'new')
-            if not document:
+            ai_document_id = doc.get('aiDocumentId')
-                failed += 1
+            blake3_hash = doc.get('blake3hash')
-                continue
+            
-
+            ctx.logger.info(f"\n📄 {doc_name} (ID: {doc_id})")
            ctx.logger.info(f"   Status: {junction_status}")
            ctx.logger.info(f"   aiDocumentId: {ai_document_id or 'N/A'}")
            ctx.logger.info(f"   blake3hash: {blake3_hash[:16] if blake3_hash else 'N/A'}...")
            try:
-                synced = await self._sync_single_document(junction, document, collection_id, ctx)
+                # Decide if sync needed
-                if synced:
+                needs_sync = False
-                    successful += 1
+                reason = ""
-                else:
+                
                if junction_status in ['new', 'unclean', 'failed']:
                    needs_sync = True
                    reason = f"status={junction_status}"
                elif full_sync and blake3_hash and ai_document_id:
                    # Full sync mode: verify Blake3 hash with XAI
                    try:
                        xai_doc_info = await xai.get_collection_document(collection_id, ai_document_id)
                        if xai_doc_info:
                            xai_blake3 = xai_doc_info.get('blake3_hash')
                            if xai_blake3 != blake3_hash:
                                needs_sync = True
                                reason = f"blake3 mismatch (XAI: {xai_blake3[:16] if xai_blake3 else 'N/A'}... vs Doc: {blake3_hash[:16]}...)"
                                ctx.logger.info(f"   🔄 Blake3 mismatch detected!")
                            else:
                                ctx.logger.info(f"   ✅ Blake3 hash matches")
                        else:
                            needs_sync = True
                            reason = "file not found in XAI collection"
                    except Exception as e:
                        ctx.logger.warn(f"   ⚠️  Failed to verify Blake3: {e}")
                if not needs_sync:
                    ctx.logger.info(f"   ⏭️  Skipped (no sync needed)")
                    skipped += 1
                    continue
                ctx.logger.info(f"   🔄 Syncing: {reason}")
                # Download document
                attachment_id = doc.get('documentId')  # TODO: Get correct attachment ID from CDokumente
                file_content = await espocrm.download_attachment(attachment_id)
                ctx.logger.info(f"   📥 Downloaded {len(file_content)} bytes")
                # Upload to XAI
                filename = doc_name
                mime_type = 'application/octet-stream'  # TODO: Get from attachment
                xai_file_id = await xai.upload_file(file_content, filename, mime_type)
                ctx.logger.info(f"   📤 Uploaded to XAI: {xai_file_id}")
                # Add to collection
                await xai.add_to_collection(collection_id, xai_file_id)
                ctx.logger.info(f"   ✅ Added to collection {collection_id}")
                # Update junction
                await espocrm.update_knowledge_document_junction(
                    knowledge_id,
                    doc_id,
                    {
                        'aiDocumentId': xai_file_id,
                        'syncstatus': 'synced'
                    },
                    update_last_sync=True
                )
                ctx.logger.info(f"   ✅ Junction updated")
                successful += 1
            except Exception as e:
                failed += 1
-                ctx.logger.error(f"❌ Failed to sync document {doc_id}: {e}")
+                ctx.logger.error(f"   ❌ Sync failed: {e}")
                # Mark as failed in junction
                try:
                    await espocrm.update_knowledge_document_junction(
                        knowledge_id,
                        doc_id,
                        {'syncstatus': 'failed'},
                        update_last_sync=False
                    )
                except Exception as update_err:
                    ctx.logger.error(f"   ❌ Failed to update junction status: {update_err}")
-                # Mark as failed
+        # ═══════════════════════════════════════════════════════════════
-                await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction['id'], {
+        # STEP 3: Remove orphaned documents from XAI collection
-                    'syncstatus': JunctionSyncStatus.FAILED.value
+        # ═══════════════════════════════════════════════════════════════
                })
        # Remove orphans
        try:
-            await self._remove_orphaned_documents(collection_id, junction_entries, ctx)
+            ctx.logger.info(f"\n🧹 Checking for orphaned documents in XAI collection...")
            # Get all files in XAI collection (normalized structure)
            xai_documents = await xai.list_collection_documents(collection_id)
            xai_file_ids = {doc.get('file_id') for doc in xai_documents if doc.get('file_id')}
            # Get all ai_document_ids from junction
            junction_file_ids = {doc.get('aiDocumentId') for doc in documents if doc.get('aiDocumentId')}
            # Find orphans (in XAI but not in junction)
            orphans = xai_file_ids - junction_file_ids
            if orphans:
                ctx.logger.info(f"   Found {len(orphans)} orphaned file(s)")
                for orphan_id in orphans:
                    try:
                        await xai.remove_from_collection(collection_id, orphan_id)
                        ctx.logger.info(f"   🗑️  Removed {orphan_id}")
                    except Exception as e:
                        ctx.logger.warn(f"   ⚠️  Failed to remove {orphan_id}: {e}")
            else:
                ctx.logger.info(f"   ✅ No orphans found")
        except Exception as e:
-            ctx.logger.warn(f"⚠️  Failed to remove orphans: {e}")
+            ctx.logger.warn(f"⚠️  Failed to clean up orphans: {e}")
-        # Summary
+        # ═══════════════════════════════════════════════════════════════
        # STEP 4: Summary
        # ═══════════════════════════════════════════════════════════════
        ctx.logger.info("")
        ctx.logger.info("=" * 80)
        ctx.logger.info(f"📊 Sync Statistics:")
        ctx.logger.info(f"   ✅ Synced: {successful}")
        ctx.logger.info(f"   ⏭️  Skipped: {skipped}")
        ctx.logger.info(f"   ❌ Failed: {failed}")
        ctx.logger.info(f"   Mode: {'FULL SYNC (Blake3 verification)' if full_sync else 'INCREMENTAL'}")
        ctx.logger.info("=" * 80)
    async def _sync_single_document(
        self,
        junction_entry: Dict,
        document: Dict,
        collection_id: str,
        ctx
    ) -> bool:
        """
        Sync one document to XAI Collection with BLAKE3 verification.
        Args:
            junction_entry: Junction table entry
            document: CDokumente entity
            collection_id: XAI Collection ID
            ctx: Motia context
        Returns:
            True if synced, False if skipped
        """
        from services.espocrm import EspoCRMAPI
        from services.xai_service import XAIService
        espocrm = EspoCRMAPI(ctx)
        xai = XAIService(ctx)
        junction_id = junction_entry['id']
        junction_status = junction_entry.get('syncstatus')
        junction_ai_doc_id = junction_entry.get('aiDocumentId')
        # 1. Check MIME type support
        mime_type = document.get('mimeType') or 'application/octet-stream'
        if not xai.is_mime_type_supported(mime_type):
            await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
                'syncstatus': JunctionSyncStatus.UNSUPPORTED.value
            })
            ctx.logger.info(f"⏭️  Unsupported MIME: {document['name']}")
            return False
        # 2. Calculate hashes
        current_file_hash = document.get('md5') or document.get('sha256')
        if not current_file_hash:
            ctx.logger.error(f"❌ No hash for document {document['id']}")
            return False
        current_metadata_hash = self._calculate_metadata_hash(document)
        synced_file_hash = junction_entry.get('syncedHash')
        synced_metadata_hash = junction_entry.get('syncedMetadataHash')
        xai_blake3_hash = junction_entry.get('xaiBlake3Hash')
        # 3. Determine changes
        file_changed = (current_file_hash != synced_file_hash)
        metadata_changed = (current_metadata_hash != synced_metadata_hash)
        ctx.logger.info(f"📋 {document['name']}")
        ctx.logger.info(f"   File changed: {file_changed}, Metadata changed: {metadata_changed}")
        # 4. Early return if nothing changed
        if junction_status == JunctionSyncStatus.SYNCED.value and junction_ai_doc_id:
            if not file_changed and not metadata_changed:
                # Verify document still exists in XAI
                try:
                    doc_info = await xai.get_collection_document(collection_id, junction_ai_doc_id)
                    if doc_info:
                        ctx.logger.info(f"   ✅ Already synced (verified)")
                        return False
                    else:
                        ctx.logger.warn(f"   ⚠️  Document missing in XAI, re-uploading")
                except Exception as e:
                    ctx.logger.warn(f"   ⚠️  Could not verify: {e}")
        # 5. Handle file content change (re-upload)
        if file_changed or not junction_ai_doc_id:
            ctx.logger.info(f"   🔄 {'File changed' if file_changed else 'New file'}, uploading")
            # Download from EspoCRM
            download_info = await self._get_document_download_info(document, ctx)
            if not download_info:
                raise RuntimeError(f"Cannot download document {document['id']}")
            file_content = await espocrm.download_attachment(download_info['attachment_id'])
            # Build metadata
            metadata = self._build_xai_metadata(document)
            # Upload to XAI
            xai_file_id = await xai.upload_document_with_metadata(
                collection_id=collection_id,
                file_content=file_content,
                filename=download_info['filename'],
                mime_type=download_info['mime_type'],
                metadata=metadata
            )
            ctx.logger.info(f"   ✅ Uploaded → {xai_file_id}")
            # Verify upload
            ctx.logger.info(f"   🔍 Verifying upload...")
            success, blake3_hash = await xai.verify_upload_integrity(
                collection_id=collection_id,
                file_id=xai_file_id
            )
            if not success:
                ctx.logger.error(f"   ❌ Upload verification failed!")
                raise RuntimeError("Upload verification failed")
            ctx.logger.info(f"   ✅ Verified: {blake3_hash[:32]}...")
            # Update junction
            await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
                'aiDocumentId': xai_file_id,
                'syncstatus': JunctionSyncStatus.SYNCED.value,
                'syncedHash': current_file_hash,
                'xaiBlake3Hash': blake3_hash,
                'syncedMetadataHash': current_metadata_hash,
                'lastSync': datetime.now().isoformat()
            })
            return True
        # 6. Handle metadata-only change
        elif metadata_changed:
            ctx.logger.info(f"   📝 Metadata changed, updating")
            xai_file_id = junction_ai_doc_id
            metadata = self._build_xai_metadata(document)
            try:
                # Try PATCH
                await xai.update_document_metadata(collection_id, xai_file_id, metadata)
                ctx.logger.info(f"   ✅ Metadata updated")
                # Get BLAKE3 hash
                success, blake3_hash = await xai.verify_upload_integrity(
                    collection_id, xai_file_id
                )
                # Update junction
                await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
                    'syncstatus': JunctionSyncStatus.SYNCED.value,
                    'syncedMetadataHash': current_metadata_hash,
                    'xaiBlake3Hash': blake3_hash if success else xai_blake3_hash,
                    'lastSync': datetime.now().isoformat()
                })
                return True
            except Exception as e:
                ctx.logger.warn(f"   ⚠️  PATCH failed, re-uploading: {e}")
                # Fallback: Re-upload
                download_info = await self._get_document_download_info(document, ctx)
                file_content = await espocrm.download_attachment(download_info['attachment_id'])
                await xai.remove_from_collection(collection_id, xai_file_id)
                xai_file_id = await xai.upload_document_with_metadata(
                    collection_id=collection_id,
                    file_content=file_content,
                    filename=download_info['filename'],
                    mime_type=download_info['mime_type'],
                    metadata=metadata
                )
                success, blake3_hash = await xai.verify_upload_integrity(
                    collection_id, xai_file_id
                )
                await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
                    'aiDocumentId': xai_file_id,
                    'syncstatus': JunctionSyncStatus.SYNCED.value,
                    'syncedHash': current_file_hash,
                    'xaiBlake3Hash': blake3_hash,
                    'syncedMetadataHash': current_metadata_hash,
                    'lastSync': datetime.now().isoformat()
                })
                return True
        return False
    async def _remove_orphaned_documents(
        self,
        collection_id: str,
        junction_entries: List[Dict],
        ctx
    ) -> None:
        """
        Remove documents from XAI that are no longer in junction table.
        Args:
            collection_id: XAI Collection ID
            junction_entries: List of junction entries
            ctx: Motia context
        """
        from services.xai_service import XAIService
        xai = XAIService(ctx)
        # Get all XAI file_ids
        xai_docs = await xai.list_collection_documents(collection_id)
        xai_file_ids = {doc.get('file_id') or doc.get('id') for doc in xai_docs if doc.get('file_id') or doc.get('id')}
        # Get all junction file_ids
        junction_file_ids = {j['aiDocumentId'] for j in junction_entries if j.get('aiDocumentId')}
        # Find orphans
        orphans = xai_file_ids - junction_file_ids
        if orphans:
            ctx.logger.info(f"🗑️  Removing {len(orphans)} orphaned documents")
            for orphan_id in orphans:
                try:
                    await xai.remove_from_collection(collection_id, orphan_id)
                    ctx.logger.info(f"   ✅ Removed orphan: {orphan_id}")
                except Exception as e:
                    ctx.logger.warn(f"   ⚠️  Failed to remove {orphan_id}: {e}")
        else:
            ctx.logger.info("✅ No orphaned documents found")
    def _calculate_metadata_hash(self, document: Dict) -> str:
        """
        Calculate hash of sync-relevant metadata.
--- a/services/document_sync_utils.py
+++ b/services/document_sync_utils.py
@@ -242,78 +242,67 @@ class DocumentSync(BaseSyncUtils):
        entity_type: str = 'Document'
    ) -> List[str]:
        """
-        Determine all xAI collection IDs of entities linked to this document.
+        Determine all xAI collection IDs of CAIKnowledge entities linked to this document.
-        EspoCRM Many-to-Many: Document can be linked to arbitrary entities
+        Checks CAIKnowledgeCDokumente junction table:
-        (CBeteiligte, Account, CVmhErstgespraech, etc.)
+        - Status 'active' + datenbankId: Returns collection ID
        - Status 'new': Returns "NEW:{knowledge_id}" marker (collection must be created first)
        - Other statuses (paused, deactivated): Skips
        Args:
            document_id: Document ID
            entity_type: Entity type (e.g., 'CDokumente')
        Returns:
-            List of xAI collection IDs (deduplicated)
+            List of collection IDs or markers:
            - Normal IDs: "abc123..." (existing collections)
            - New markers: "NEW:kb-id..." (collection needs to be created via knowledge sync)
        """
        collections = set()
        self._log(f"🔍 Checking relations of {entity_type} {document_id}...")
        # ═══════════════════════════════════════════════════════════════
        # SPECIAL HANDLING: CAIKnowledge via Junction Table
        # ═══════════════════════════════════════════════════════════════
        try:
-            entity_def = await self.espocrm.get_entity_def(entity_type)
+            junction_entries = await self.espocrm.get_junction_entries(
-            links = entity_def.get('links', {}) if isinstance(entity_def, dict) else {}
+                'CAIKnowledgeCDokumente',
-        except Exception as e:
+                'cDokumenteId',
-            self._log(f"⚠️  Konnte Metadata fuer {entity_type} nicht laden: {e}", level='warn')
+                document_id
-            links = {}
+            )
-
+            
-        link_types = {'hasMany', 'hasChildren', 'manyMany', 'hasManyThrough'}
+            if junction_entries:
-
+                self._log(f"   📋 Found {len(junction_entries)} CAIKnowledge link(s)")
-        for link_name, link_def in links.items():
+                
-            try:
+                for junction in junction_entries:
-                if not isinstance(link_def, dict):
+                    knowledge_id = junction.get('cAIKnowledgeId')
-                    continue
+                    if not knowledge_id:
-                if link_def.get('type') not in link_types:
+                        continue
-                    continue
+                    
-
+                    try:
-                related_entity = link_def.get('entity')
+                        knowledge = await self.espocrm.get_entity('CAIKnowledge', knowledge_id)
-                if not related_entity:
+                        activation_status = knowledge.get('activationStatus')
-                    continue
+                        collection_id = knowledge.get('datenbankId')
-
+                        
-                related_def = await self.espocrm.get_entity_def(related_entity)
+                        if activation_status == 'active' and collection_id:
-                related_fields = related_def.get('fields', {}) if isinstance(related_def, dict) else {}
+                            # Existing collection - use it
                select_fields = ['id']
                if 'xaiCollectionId' in related_fields:
                    select_fields.append('xaiCollectionId')
                offset = 0
                page_size = 100
                while True:
                    result = await self.espocrm.list_related(
                        entity_type,
                        document_id,
                        link_name,
                        select=','.join(select_fields),
                        offset=offset,
                        max_size=page_size
                    )
                    entities = result.get('list', [])
                    if not entities:
                        break
                    for entity in entities:
                        collection_id = entity.get('xaiCollectionId')
                        if collection_id:
                            collections.add(collection_id)
-
+                            self._log(f"      ✅ CAIKnowledge {knowledge_id}: {collection_id} (active)")
-                    if len(entities) < page_size:
+                        elif activation_status == 'new':
-                        break
+                            # Collection doesn't exist yet - return special marker
-                    offset += page_size
+                            # Format: "NEW:{knowledge_id}" signals to caller: trigger knowledge sync first
-
+                            collections.add(f"NEW:{knowledge_id}")
-            except Exception as e:
+                            self._log(f"      🆕 CAIKnowledge {knowledge_id}: status='new' → collection must be created first")
-                self._log(f"   ⚠️  Fehler beim Prüfen von Link {link_name}: {e}", level='warn')
+                        else:
-                continue
+                            self._log(f"      ⏭️  CAIKnowledge {knowledge_id}: status={activation_status}, datenbankId={collection_id or 'N/A'}")
                    except Exception as e:
                        self._log(f"      ⚠️  Failed to load CAIKnowledge {knowledge_id}: {e}", level='warn')
        except Exception as e:
            self._log(f"   ⚠️  Failed to check CAIKnowledge junction: {e}", level='warn')
        result = list(collections)
        self._log(f"📊 Gesamt: {len(result)} eindeutige Collection(s) gefunden")
--- a/services/espocrm.py
+++ b/services/espocrm.py
@@ -58,6 +58,10 @@ class EspoCRMAPI:
        self._entity_defs_cache: Dict[str, Dict[str, Any]] = {}
        self._entity_defs_cache_ttl_seconds = int(os.getenv('ESPOCRM_METADATA_TTL_SECONDS', '300'))
        # Metadata cache (complete metadata loaded once)
        self._metadata_cache: Optional[Dict[str, Any]] = None
        self._metadata_cache_ts: float = 0
        # Optional Redis for caching/rate limiting (centralized)
        self.redis_client = get_redis_client(strict=False)
        if self.redis_client:
@@ -87,20 +91,76 @@ class EspoCRMAPI:
        if self._session and not self._session.closed:
            await self._session.close()
-    async def get_entity_def(self, entity_type: str) -> Dict[str, Any]:
+    async def get_metadata(self) -> Dict[str, Any]:
        """
        Get complete EspoCRM metadata (cached).
        Loads once and caches for TTL duration.
        Much faster than individual entity def calls.
        Returns:
            Complete metadata dict with entityDefs, clientDefs, etc.
        """
        now = time.monotonic()
-        cached = self._entity_defs_cache.get(entity_type)
+        
-        if cached and (now - cached['ts']) < self._entity_defs_cache_ttl_seconds:
+        # Return cached if still valid
-            return cached['data']
+        if (self._metadata_cache is not None and 
-
+            (now - self._metadata_cache_ts) < self._entity_defs_cache_ttl_seconds):
            return self._metadata_cache
        # Load fresh metadata
        try:
-            data = await self.api_call(f"/Metadata/EntityDefs/{entity_type}", method='GET')
+            self._log("📥 Loading complete EspoCRM metadata...", level='debug')
-        except EspoCRMAPIError:
+            metadata = await self.api_call("/Metadata", method='GET')
-            all_defs = await self.api_call("/Metadata/EntityDefs", method='GET')
+            
-            data = all_defs.get(entity_type, {}) if isinstance(all_defs, dict) else {}
+            if not isinstance(metadata, dict):
                self._log("⚠️  Metadata response is not a dict, using empty", level='warn')
                metadata = {}
            # Cache it
            self._metadata_cache = metadata
            self._metadata_cache_ts = now
            entity_count = len(metadata.get('entityDefs', {}))
            self._log(f"✅ Metadata cached: {entity_count} entity definitions", level='debug')
            return metadata
        except Exception as e:
            self._log(f"❌ Failed to load metadata: {e}", level='error')
            # Return empty dict as fallback
            return {}
-        self._entity_defs_cache[entity_type] = {'ts': now, 'data': data}
+    async def get_entity_def(self, entity_type: str) -> Dict[str, Any]:
-        return data
+        """
        Get entity definition for a specific entity type (cached via metadata).
        Uses complete metadata cache - much faster and correct API usage.
        Args:
            entity_type: Entity type (e.g., 'Document', 'CDokumente', 'Account')
        Returns:
            Entity definition dict with fields, links, etc.
        """
        try:
            metadata = await self.get_metadata()
            entity_defs = metadata.get('entityDefs', {})
            if not isinstance(entity_defs, dict):
                self._log(f"⚠️  entityDefs is not a dict for {entity_type}", level='warn')
                return {}
            entity_def = entity_defs.get(entity_type, {})
            if not entity_def:
                self._log(f"⚠️  No entity definition found for '{entity_type}'", level='debug')
            return entity_def
        except Exception as e:
            self._log(f"⚠️  Could not load entity def for {entity_type}: {e}", level='warn')
            return {}
    async def api_call(
        self,
@@ -540,3 +600,131 @@ class EspoCRMAPI:
            )
        """
        await self.update_entity(junction_entity, junction_id, fields)
    async def get_knowledge_documents_with_junction(
        self,
        knowledge_id: str
    ) -> List[Dict[str, Any]]:
        """
        Get all documents linked to a CAIKnowledge entry with junction data.
        Uses custom EspoCRM endpoint: GET /JunctionData/CAIKnowledge/{knowledge_id}/dokumentes
        Returns enriched list with:
        - junctionId: Junction table ID
        - cAIKnowledgeId, cDokumenteId: Junction keys
        - aiDocumentId: XAI document ID from junction
        - syncstatus: Sync status from junction (new, synced, failed, unclean)
        - lastSync: Last sync timestamp from junction
        - documentId, documentName: Document info
        - blake3hash: Blake3 hash from document entity
        - documentCreatedAt, documentModifiedAt: Document timestamps
        This consolidates multiple API calls into one efficient query.
        Args:
            knowledge_id: CAIKnowledge entity ID
        Returns:
            List of document dicts with junction data
        Example:
            docs = await espocrm.get_knowledge_documents_with_junction('69b1b03582bb6e2da')
            for doc in docs:
                print(f"{doc['documentName']}: {doc['syncstatus']}")
        """
        # JunctionData endpoint is at root level, not under /api/v1
        base_url = self.api_base_url.rstrip('/').replace('/api/v1', '')
        url = f"{base_url}/JunctionData/CAIKnowledge/{knowledge_id}/dokumentes"
        self._log(f"GET {url}")
        try:
            session = await self._get_session()
            timeout = aiohttp.ClientTimeout(total=self.api_timeout_seconds)
            async with session.get(url, headers=self._get_headers(), timeout=timeout) as response:
                self._log(f"Response status: {response.status}")
                if response.status == 404:
                    # Knowledge base not found or no documents linked
                    return []
                if response.status >= 400:
                    error_text = await response.text()
                    raise EspoCRMAPIError(f"JunctionData GET failed: {response.status} - {error_text}")
                result = await response.json()
                documents = result.get('list', [])
                self._log(f"✅ Loaded {len(documents)} document(s) with junction data")
                return documents
        except asyncio.TimeoutError:
            raise EspoCRMTimeoutError(f"Timeout getting junction data for knowledge {knowledge_id}")
        except aiohttp.ClientError as e:
            raise EspoCRMAPIError(f"Network error getting junction data: {e}")
    async def update_knowledge_document_junction(
        self,
        knowledge_id: str,
        document_id: str,
        fields: Dict[str, Any],
        update_last_sync: bool = True
    ) -> Dict[str, Any]:
        """
        Update junction columns for a specific document link.
        Uses custom EspoCRM endpoint:
        PUT /JunctionData/CAIKnowledge/{knowledge_id}/dokumentes/{document_id}
        Args:
            knowledge_id: CAIKnowledge entity ID
            document_id: CDokumente entity ID
            fields: Junction fields to update (aiDocumentId, syncstatus, etc.)
            update_last_sync: Whether to update lastSync timestamp (default: True)
        Returns:
            Updated junction data
        Example:
            await espocrm.update_knowledge_document_junction(
                '69b1b03582bb6e2da',
                '69a68b556a39771bf',
                {
                    'aiDocumentId': 'xai-file-abc123',
                    'syncstatus': 'synced'
                },
                update_last_sync=True
            )
        """
        # JunctionData endpoint is at root level, not under /api/v1
        base_url = self.api_base_url.rstrip('/').replace('/api/v1', '')
        url = f"{base_url}/JunctionData/CAIKnowledge/{knowledge_id}/dokumentes/{document_id}"
        payload = {**fields}
        if update_last_sync:
            payload['updateLastSync'] = True
        self._log(f"PUT {url}")
        self._log(f"   Payload: {payload}")
        try:
            session = await self._get_session()
            timeout = aiohttp.ClientTimeout(total=self.api_timeout_seconds)
            async with session.put(url, headers=self._get_headers(), json=payload, timeout=timeout) as response:
                self._log(f"Response status: {response.status}")
                if response.status >= 400:
                    error_text = await response.text()
                    raise EspoCRMAPIError(f"JunctionData PUT failed: {response.status} - {error_text}")
                result = await response.json()
                self._log(f"✅ Junction updated: junctionId={result.get('junctionId')}")
                return result
        except asyncio.TimeoutError:
            raise EspoCRMTimeoutError(f"Timeout updating junction data")
        except aiohttp.ClientError as e:
            raise EspoCRMAPIError(f"Network error updating junction data: {e}")
--- a/services/xai_service.py
+++ b/services/xai_service.py
@@ -236,7 +236,8 @@ class XAIService:
            data = await response.json()
-        collection_id = data.get('id')
+        # API returns 'collection_id' not 'id'
        collection_id = data.get('collection_id') or data.get('id')
        self._log(f"✅ Collection created: {collection_id}")
        return data
@@ -308,7 +309,18 @@ class XAIService:
        GET https://management-api.x.ai/v1/collections/{collection_id}/documents
        Returns:
-            List von document objects mit file_id, filename, hash, fields
+            List von normalized document objects:
            [
                {
                    'file_id': 'file_...',
                    'filename': 'doc.pdf',
                    'blake3_hash': 'hex_string',  # Plain hex, kein prefix
                    'size_bytes': 12345,
                    'content_type': 'application/pdf',
                    'fields': {},  # Custom metadata
                    'status': 'DOCUMENT_STATUS_...'
                }
            ]
        Raises:
            RuntimeError: bei HTTP-Fehler
@@ -328,16 +340,31 @@ class XAIService:
            data = await response.json()
-        # API sollte eine Liste zurückgeben oder ein dict mit 'documents' key
+        # API gibt Liste zurück oder dict mit 'documents' key
        if isinstance(data, list):
-            documents = data
+            raw_documents = data
        elif isinstance(data, dict) and 'documents' in data:
-            documents = data['documents']
+            raw_documents = data['documents']
        else:
-            documents = []
+            raw_documents = []
-        self._log(f"✅ Listed {len(documents)} documents")
+        # Normalize nested structure: file_metadata -> top-level
-        return documents
+        normalized = []
        for doc in raw_documents:
            file_meta = doc.get('file_metadata', {})
            normalized.append({
                'file_id': file_meta.get('file_id'),
                'filename': file_meta.get('name'),
                'blake3_hash': file_meta.get('hash'),  # Plain hex string
                'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
                'content_type': file_meta.get('content_type'),
                'created_at': file_meta.get('created_at'),
                'fields': doc.get('fields', {}),
                'status': doc.get('status')
            })
        self._log(f"✅ Listed {len(normalized)} documents")
        return normalized
    async def get_collection_document(self, collection_id: str, file_id: str) -> Optional[Dict]:
        """
@@ -346,12 +373,14 @@ class XAIService:
        GET https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id}
        Returns:
-            Dict mit document info including BLAKE3 hash:
+            Normalized dict mit document info:
            {
                'file_id': 'file_xyz',
                'filename': 'document.pdf',
-                'hash': 'blake3:abcd1234...',  # BLAKE3 Hash!
+                'blake3_hash': 'hex_string',  # Plain hex, kein prefix
-                'fields': {...}  # Metadata
+                'size_bytes': 12345,
                'content_type': 'application/pdf',
                'fields': {...}  # Custom metadata
            }
        Returns None if not found.
@@ -374,8 +403,21 @@ class XAIService:
            data = await response.json()
-        self._log(f"✅ Document info retrieved: {data.get('filename', 'N/A')}")
+        # Normalize nested structure
-        return data
+        file_meta = data.get('file_metadata', {})
        normalized = {
            'file_id': file_meta.get('file_id'),
            'filename': file_meta.get('name'),
            'blake3_hash': file_meta.get('hash'),  # Plain hex
            'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
            'content_type': file_meta.get('content_type'),
            'created_at': file_meta.get('created_at'),
            'fields': data.get('fields', {}),
            'status': data.get('status')
        }
        self._log(f"✅ Document info retrieved: {normalized.get('filename', 'N/A')}")
        return normalized
    async def update_document_metadata(
        self,
--- a/steps/advoware_cal_sync/calendar_sync_cron_step.py
+++ b/steps/advoware_cal_sync/calendar_sync_cron_step.py
@@ -18,7 +18,7 @@ config = {
    'description': 'Runs calendar sync automatically every 15 minutes',
    'flows': ['advoware-calendar-sync'],
    'triggers': [
-        cron("0 */15 * * * *")  # Every 15 minutes at second 0 (6-field: sec min hour day month weekday)
+        cron("0 15 1 * * *")  # Every 15 minutes at second 0 (6-field: sec min hour day month weekday)
    ],
    'enqueues': ['calendar_sync_all']
 }
--- a/steps/vmh/document_sync_event_step.py
+++ b/steps/vmh/document_sync_event_step.py
@@ -152,6 +152,42 @@ async def handle_create_or_update(entity_id: str, document: Dict[str, Any], sync
        if collection_ids:
            ctx.logger.info(f"   Collections: {collection_ids}")
        # ═══════════════════════════════════════════════════════════════
        # CHECK: Knowledge Bases mit Status "new" (noch keine Collection)
        # ═══════════════════════════════════════════════════════════════
        new_knowledge_bases = [cid for cid in collection_ids if cid.startswith('NEW:')]
        if new_knowledge_bases:
            ctx.logger.info("")
            ctx.logger.info("=" * 80)
            ctx.logger.info("🆕 DOKUMENT IST MIT KNOWLEDGE BASE(S) VERKNÜPFT (Status: new)")
            ctx.logger.info("=" * 80)
            for new_kb in new_knowledge_bases:
                kb_id = new_kb[4:]  # Remove "NEW:" prefix
                ctx.logger.info(f"📋 CAIKnowledge {kb_id}")
                ctx.logger.info(f"   Status: new → Collection muss zuerst erstellt werden")
                # Trigger Knowledge Sync
                ctx.logger.info(f"📤 Triggering aiknowledge.sync event...")
                await ctx.emit('aiknowledge.sync', {
                    'entity_id': kb_id,
                    'entity_type': 'CAIKnowledge',
                    'triggered_by': 'document_sync',
                    'document_id': entity_id
                })
                ctx.logger.info(f"✅ Event emitted for {kb_id}")
            # Release lock and skip document sync - knowledge sync will handle documents
            ctx.logger.info("")
            ctx.logger.info("=" * 80)
            ctx.logger.info("✅ KNOWLEDGE SYNC GETRIGGERT")
            ctx.logger.info("   Document Sync wird übersprungen")
            ctx.logger.info("   (Knowledge Sync erstellt Collection und synchronisiert dann Dokumente)")
            ctx.logger.info("=" * 80)
            await sync_utils.release_sync_lock(entity_id, success=True, entity_type=entity_type)
            return
        # ═══════════════════════════════════════════════════════════════
        # PREVIEW-GENERIERUNG bei neuen/geänderten Dateien
        # ═══════════════════════════════════════════════════════════════