feat: Enhance document synchronization by integrating CAIKnowledge handling and improving error logging

This commit is contained in:
bsiggel
2026-03-12 22:30:11 +00:00
parent 8ed7cca432
commit 6bf2343a12
6 changed files with 492 additions and 362 deletions

View File

@@ -172,20 +172,25 @@ class AIKnowledgeSync(BaseSyncUtils):
else:
ctx.logger.info("⏭️ No collection ID, nothing to delete")
# Update junction entries
junction_entries = await espocrm.get_junction_entries(
'CAIKnowledgeCDokumente',
'cAIKnowledgeId',
knowledge_id
)
# Reset junction entries
documents = await espocrm.get_knowledge_documents_with_junction(knowledge_id)
for junction in junction_entries:
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction['id'], {
'syncstatus': JunctionSyncStatus.NEW.value,
'aiDocumentId': None
})
for doc in documents:
doc_id = doc['documentId']
try:
await espocrm.update_knowledge_document_junction(
knowledge_id,
doc_id,
{
'syncstatus': 'new',
'aiDocumentId': None
},
update_last_sync=False
)
except Exception as e:
ctx.logger.warn(f"⚠️ Failed to reset junction for {doc_id}: {e}")
ctx.logger.info(f"✅ Deactivation complete, {len(junction_entries)} junction entries reset")
ctx.logger.info(f"✅ Deactivation complete, {len(documents)} junction entries reset")
return
# ═══════════════════════════════════════════════════════════
@@ -235,15 +240,20 @@ class AIKnowledgeSync(BaseSyncUtils):
self,
knowledge_id: str,
collection_id: str,
ctx
ctx,
full_sync: bool = False
) -> None:
"""
Sync all documents of a knowledge base to XAI collection.
Uses efficient JunctionData endpoint to get all documents with junction data
and blake3 hashes in a single API call.
Args:
knowledge_id: CAIKnowledge entity ID
collection_id: XAI Collection ID
ctx: Motia context
full_sync: If True, force Blake3 hash comparison for all documents (nightly cron)
"""
from services.espocrm import EspoCRMAPI
from services.xai_service import XAIService
@@ -251,294 +261,159 @@ class AIKnowledgeSync(BaseSyncUtils):
espocrm = EspoCRMAPI(ctx)
xai = XAIService(ctx)
# Load junction entries
junction_entries = await espocrm.get_junction_entries(
'CAIKnowledgeCDokumente',
'cAIKnowledgeId',
knowledge_id
)
# ═══════════════════════════════════════════════════════════════
# STEP 1: Load all documents with junction data (single API call)
# ═══════════════════════════════════════════════════════════════
ctx.logger.info(f"📥 Loading documents with junction data for knowledge {knowledge_id}")
ctx.logger.info(f"📊 Found {len(junction_entries)} junction entries")
documents = await espocrm.get_knowledge_documents_with_junction(knowledge_id)
if not junction_entries:
ctx.logger.info(f"📊 Found {len(documents)} document(s)")
if not documents:
ctx.logger.info("✅ No documents to sync")
return
# Load documents
documents = {}
for junction in junction_entries:
doc_id = junction['cDokumenteId']
try:
doc = await espocrm.get_entity('CDokumente', doc_id)
documents[doc_id] = doc
except Exception as e:
ctx.logger.error(f"❌ Failed to load document {doc_id}: {e}")
ctx.logger.info(f"📊 Loaded {len(documents)}/{len(junction_entries)} documents")
# Sync each document
# ═══════════════════════════════════════════════════════════════
# STEP 2: Sync each document based on status/hash
# ═══════════════════════════════════════════════════════════════
successful = 0
failed = 0
skipped = 0
for junction in junction_entries:
doc_id = junction['cDokumenteId']
document = documents.get(doc_id)
for doc in documents:
doc_id = doc['documentId']
doc_name = doc.get('documentName', 'Unknown')
junction_status = doc.get('syncstatus', 'new')
ai_document_id = doc.get('aiDocumentId')
blake3_hash = doc.get('blake3hash')
if not document:
failed += 1
continue
ctx.logger.info(f"\n📄 {doc_name} (ID: {doc_id})")
ctx.logger.info(f" Status: {junction_status}")
ctx.logger.info(f" aiDocumentId: {ai_document_id or 'N/A'}")
ctx.logger.info(f" blake3hash: {blake3_hash[:16] if blake3_hash else 'N/A'}...")
try:
synced = await self._sync_single_document(junction, document, collection_id, ctx)
if synced:
successful += 1
else:
# Decide if sync needed
needs_sync = False
reason = ""
if junction_status in ['new', 'unclean', 'failed']:
needs_sync = True
reason = f"status={junction_status}"
elif full_sync and blake3_hash and ai_document_id:
# Full sync mode: verify Blake3 hash with XAI
try:
xai_doc_info = await xai.get_collection_document(collection_id, ai_document_id)
if xai_doc_info:
xai_blake3 = xai_doc_info.get('blake3_hash')
if xai_blake3 != blake3_hash:
needs_sync = True
reason = f"blake3 mismatch (XAI: {xai_blake3[:16] if xai_blake3 else 'N/A'}... vs Doc: {blake3_hash[:16]}...)"
ctx.logger.info(f" 🔄 Blake3 mismatch detected!")
else:
ctx.logger.info(f" ✅ Blake3 hash matches")
else:
needs_sync = True
reason = "file not found in XAI collection"
except Exception as e:
ctx.logger.warn(f" ⚠️ Failed to verify Blake3: {e}")
if not needs_sync:
ctx.logger.info(f" ⏭️ Skipped (no sync needed)")
skipped += 1
continue
ctx.logger.info(f" 🔄 Syncing: {reason}")
# Download document
attachment_id = doc.get('documentId') # TODO: Get correct attachment ID from CDokumente
file_content = await espocrm.download_attachment(attachment_id)
ctx.logger.info(f" 📥 Downloaded {len(file_content)} bytes")
# Upload to XAI
filename = doc_name
mime_type = 'application/octet-stream' # TODO: Get from attachment
xai_file_id = await xai.upload_file(file_content, filename, mime_type)
ctx.logger.info(f" 📤 Uploaded to XAI: {xai_file_id}")
# Add to collection
await xai.add_to_collection(collection_id, xai_file_id)
ctx.logger.info(f" ✅ Added to collection {collection_id}")
# Update junction
await espocrm.update_knowledge_document_junction(
knowledge_id,
doc_id,
{
'aiDocumentId': xai_file_id,
'syncstatus': 'synced'
},
update_last_sync=True
)
ctx.logger.info(f" ✅ Junction updated")
successful += 1
except Exception as e:
failed += 1
ctx.logger.error(f"❌ Failed to sync document {doc_id}: {e}")
ctx.logger.error(f" ❌ Sync failed: {e}")
# Mark as failed
await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction['id'], {
'syncstatus': JunctionSyncStatus.FAILED.value
})
# Mark as failed in junction
try:
await espocrm.update_knowledge_document_junction(
knowledge_id,
doc_id,
{'syncstatus': 'failed'},
update_last_sync=False
)
except Exception as update_err:
ctx.logger.error(f" ❌ Failed to update junction status: {update_err}")
# Remove orphans
# ═══════════════════════════════════════════════════════════════
# STEP 3: Remove orphaned documents from XAI collection
# ═══════════════════════════════════════════════════════════════
try:
await self._remove_orphaned_documents(collection_id, junction_entries, ctx)
except Exception as e:
ctx.logger.warn(f"⚠️ Failed to remove orphans: {e}")
ctx.logger.info(f"\n🧹 Checking for orphaned documents in XAI collection...")
# Summary
# Get all files in XAI collection (normalized structure)
xai_documents = await xai.list_collection_documents(collection_id)
xai_file_ids = {doc.get('file_id') for doc in xai_documents if doc.get('file_id')}
# Get all ai_document_ids from junction
junction_file_ids = {doc.get('aiDocumentId') for doc in documents if doc.get('aiDocumentId')}
# Find orphans (in XAI but not in junction)
orphans = xai_file_ids - junction_file_ids
if orphans:
ctx.logger.info(f" Found {len(orphans)} orphaned file(s)")
for orphan_id in orphans:
try:
await xai.remove_from_collection(collection_id, orphan_id)
ctx.logger.info(f" 🗑️ Removed {orphan_id}")
except Exception as e:
ctx.logger.warn(f" ⚠️ Failed to remove {orphan_id}: {e}")
else:
ctx.logger.info(f" ✅ No orphans found")
except Exception as e:
ctx.logger.warn(f"⚠️ Failed to clean up orphans: {e}")
# ═══════════════════════════════════════════════════════════════
# STEP 4: Summary
# ═══════════════════════════════════════════════════════════════
ctx.logger.info("")
ctx.logger.info("=" * 80)
ctx.logger.info(f"📊 Sync Statistics:")
ctx.logger.info(f" ✅ Synced: {successful}")
ctx.logger.info(f" ⏭️ Skipped: {skipped}")
ctx.logger.info(f" ❌ Failed: {failed}")
ctx.logger.info(f" Mode: {'FULL SYNC (Blake3 verification)' if full_sync else 'INCREMENTAL'}")
ctx.logger.info("=" * 80)
async def _sync_single_document(
    self,
    junction_entry: Dict,
    document: Dict,
    collection_id: str,
    ctx
) -> bool:
    """
    Sync one document to XAI Collection with BLAKE3 verification.

    Decision flow:
      1. Unsupported MIME type -> mark junction UNSUPPORTED, skip.
      2. Compute current file/metadata hashes and compare against the
         hashes stored on the junction entry.
      3. Already synced and unchanged -> verify the file still exists in
         XAI; if it does, skip.
      4. File changed (or never uploaded) -> download from EspoCRM,
         upload to XAI, verify integrity, persist new hashes on junction.
      5. Metadata-only change -> PATCH metadata in XAI; on failure fall
         back to remove + full re-upload.

    Args:
        junction_entry: CAIKnowledgeCDokumente junction table entry
        document: CDokumente entity
        collection_id: XAI Collection ID
        ctx: Motia context

    Returns:
        True if synced, False if skipped

    Raises:
        RuntimeError: if the document cannot be downloaded or the upload
            fails integrity verification.
    """
    from services.espocrm import EspoCRMAPI
    from services.xai_service import XAIService
    espocrm = EspoCRMAPI(ctx)
    xai = XAIService(ctx)
    junction_id = junction_entry['id']
    junction_status = junction_entry.get('syncstatus')
    junction_ai_doc_id = junction_entry.get('aiDocumentId')
    # 1. Check MIME type support
    mime_type = document.get('mimeType') or 'application/octet-stream'
    if not xai.is_mime_type_supported(mime_type):
        # Persist UNSUPPORTED so future runs skip without re-checking.
        await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
            'syncstatus': JunctionSyncStatus.UNSUPPORTED.value
        })
        ctx.logger.info(f"⏭️ Unsupported MIME: {document['name']}")
        return False
    # 2. Calculate hashes
    # File content hash comes from EspoCRM (md5 preferred, sha256 fallback).
    current_file_hash = document.get('md5') or document.get('sha256')
    if not current_file_hash:
        # Without a source hash we cannot detect changes; skip rather than guess.
        ctx.logger.error(f"❌ No hash for document {document['id']}")
        return False
    current_metadata_hash = self._calculate_metadata_hash(document)
    synced_file_hash = junction_entry.get('syncedHash')
    synced_metadata_hash = junction_entry.get('syncedMetadataHash')
    xai_blake3_hash = junction_entry.get('xaiBlake3Hash')
    # 3. Determine changes
    file_changed = (current_file_hash != synced_file_hash)
    metadata_changed = (current_metadata_hash != synced_metadata_hash)
    ctx.logger.info(f"📋 {document['name']}")
    ctx.logger.info(f" File changed: {file_changed}, Metadata changed: {metadata_changed}")
    # 4. Early return if nothing changed
    if junction_status == JunctionSyncStatus.SYNCED.value and junction_ai_doc_id:
        if not file_changed and not metadata_changed:
            # Verify document still exists in XAI
            try:
                doc_info = await xai.get_collection_document(collection_id, junction_ai_doc_id)
                if doc_info:
                    ctx.logger.info(f" ✅ Already synced (verified)")
                    return False
                else:
                    # NOTE(review): log claims "re-uploading", but with
                    # file_changed/metadata_changed both False neither branch
                    # below re-uploads — control falls through to the final
                    # `return False`. TODO confirm intended behavior.
                    ctx.logger.warn(f" ⚠️ Document missing in XAI, re-uploading")
            except Exception as e:
                ctx.logger.warn(f" ⚠️ Could not verify: {e}")
    # 5. Handle file content change (re-upload)
    if file_changed or not junction_ai_doc_id:
        ctx.logger.info(f" 🔄 {'File changed' if file_changed else 'New file'}, uploading")
        # Download from EspoCRM
        download_info = await self._get_document_download_info(document, ctx)
        if not download_info:
            raise RuntimeError(f"Cannot download document {document['id']}")
        file_content = await espocrm.download_attachment(download_info['attachment_id'])
        # Build metadata
        metadata = self._build_xai_metadata(document)
        # Upload to XAI
        xai_file_id = await xai.upload_document_with_metadata(
            collection_id=collection_id,
            file_content=file_content,
            filename=download_info['filename'],
            mime_type=download_info['mime_type'],
            metadata=metadata
        )
        ctx.logger.info(f" ✅ Uploaded → {xai_file_id}")
        # Verify upload: XAI returns the BLAKE3 hash it computed server-side.
        ctx.logger.info(f" 🔍 Verifying upload...")
        success, blake3_hash = await xai.verify_upload_integrity(
            collection_id=collection_id,
            file_id=xai_file_id
        )
        if not success:
            ctx.logger.error(f" ❌ Upload verification failed!")
            raise RuntimeError("Upload verification failed")
        ctx.logger.info(f" ✅ Verified: {blake3_hash[:32]}...")
        # Update junction with the new file id and all three hashes so the
        # next run can short-circuit on "nothing changed".
        await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
            'aiDocumentId': xai_file_id,
            'syncstatus': JunctionSyncStatus.SYNCED.value,
            'syncedHash': current_file_hash,
            'xaiBlake3Hash': blake3_hash,
            'syncedMetadataHash': current_metadata_hash,
            'lastSync': datetime.now().isoformat()
        })
        return True
    # 6. Handle metadata-only change
    elif metadata_changed:
        ctx.logger.info(f" 📝 Metadata changed, updating")
        xai_file_id = junction_ai_doc_id
        metadata = self._build_xai_metadata(document)
        try:
            # Try PATCH first: cheaper than re-uploading the file content.
            await xai.update_document_metadata(collection_id, xai_file_id, metadata)
            ctx.logger.info(f" ✅ Metadata updated")
            # Get BLAKE3 hash (best effort; keep the old one on failure)
            success, blake3_hash = await xai.verify_upload_integrity(
                collection_id, xai_file_id
            )
            # Update junction
            await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
                'syncstatus': JunctionSyncStatus.SYNCED.value,
                'syncedMetadataHash': current_metadata_hash,
                'xaiBlake3Hash': blake3_hash if success else xai_blake3_hash,
                'lastSync': datetime.now().isoformat()
            })
            return True
        except Exception as e:
            ctx.logger.warn(f" ⚠️ PATCH failed, re-uploading: {e}")
            # Fallback: remove the old file and re-upload with fresh metadata.
            download_info = await self._get_document_download_info(document, ctx)
            file_content = await espocrm.download_attachment(download_info['attachment_id'])
            await xai.remove_from_collection(collection_id, xai_file_id)
            xai_file_id = await xai.upload_document_with_metadata(
                collection_id=collection_id,
                file_content=file_content,
                filename=download_info['filename'],
                mime_type=download_info['mime_type'],
                metadata=metadata
            )
            success, blake3_hash = await xai.verify_upload_integrity(
                collection_id, xai_file_id
            )
            await espocrm.update_junction_entry('CAIKnowledgeCDokumente', junction_id, {
                'aiDocumentId': xai_file_id,
                'syncstatus': JunctionSyncStatus.SYNCED.value,
                'syncedHash': current_file_hash,
                'xaiBlake3Hash': blake3_hash,
                'syncedMetadataHash': current_metadata_hash,
                'lastSync': datetime.now().isoformat()
            })
            return True
    # Nothing to do for this document.
    return False
async def _remove_orphaned_documents(
    self,
    collection_id: str,
    junction_entries: List[Dict],
    ctx
) -> None:
    """
    Remove documents from XAI that are no longer in junction table.

    Builds the set of file ids present in the XAI collection and the set
    of aiDocumentId values referenced by the junction entries; any XAI
    file without a junction reference is an orphan and gets removed.
    Individual removal failures are logged as warnings and do not abort
    the loop.

    Args:
        collection_id: XAI Collection ID
        junction_entries: List of junction entries
        ctx: Motia context
    """
    from services.xai_service import XAIService
    xai = XAIService(ctx)
    # Get all XAI file_ids ('file_id' for normalized docs, 'id' as fallback)
    xai_docs = await xai.list_collection_documents(collection_id)
    xai_file_ids = {doc.get('file_id') or doc.get('id') for doc in xai_docs if doc.get('file_id') or doc.get('id')}
    # Get all junction file_ids (entries never synced have no aiDocumentId)
    junction_file_ids = {j['aiDocumentId'] for j in junction_entries if j.get('aiDocumentId')}
    # Find orphans: present in XAI but not referenced by any junction entry
    orphans = xai_file_ids - junction_file_ids
    if orphans:
        ctx.logger.info(f"🗑️ Removing {len(orphans)} orphaned documents")
        for orphan_id in orphans:
            try:
                await xai.remove_from_collection(collection_id, orphan_id)
                ctx.logger.info(f" ✅ Removed orphan: {orphan_id}")
            except Exception as e:
                # Best effort: keep removing the remaining orphans.
                ctx.logger.warn(f" ⚠️ Failed to remove {orphan_id}: {e}")
    else:
        ctx.logger.info("✅ No orphaned documents found")
def _calculate_metadata_hash(self, document: Dict) -> str:
"""
Calculate hash of sync-relevant metadata.

View File

@@ -242,77 +242,66 @@ class DocumentSync(BaseSyncUtils):
entity_type: str = 'Document'
) -> List[str]:
"""
Determine all xAI collection IDs of entities linked to this document.
Determine all xAI collection IDs of CAIKnowledge entities linked to this document.
EspoCRM Many-to-Many: Document can be linked to arbitrary entities
(CBeteiligte, Account, CVmhErstgespraech, etc.)
Checks CAIKnowledgeCDokumente junction table:
- Status 'active' + datenbankId: Returns collection ID
- Status 'new': Returns "NEW:{knowledge_id}" marker (collection must be created first)
- Other statuses (paused, deactivated): Skips
Args:
document_id: Document ID
entity_type: Entity type (e.g., 'CDokumente')
Returns:
List of xAI collection IDs (deduplicated)
List of collection IDs or markers:
- Normal IDs: "abc123..." (existing collections)
- New markers: "NEW:kb-id..." (collection needs to be created via knowledge sync)
"""
collections = set()
self._log(f"🔍 Checking relations of {entity_type} {document_id}...")
# ═══════════════════════════════════════════════════════════════
# SPECIAL HANDLING: CAIKnowledge via Junction Table
# ═══════════════════════════════════════════════════════════════
try:
entity_def = await self.espocrm.get_entity_def(entity_type)
links = entity_def.get('links', {}) if isinstance(entity_def, dict) else {}
except Exception as e:
self._log(f"⚠️ Konnte Metadata fuer {entity_type} nicht laden: {e}", level='warn')
links = {}
junction_entries = await self.espocrm.get_junction_entries(
'CAIKnowledgeCDokumente',
'cDokumenteId',
document_id
)
link_types = {'hasMany', 'hasChildren', 'manyMany', 'hasManyThrough'}
if junction_entries:
self._log(f" 📋 Found {len(junction_entries)} CAIKnowledge link(s)")
for link_name, link_def in links.items():
try:
if not isinstance(link_def, dict):
continue
if link_def.get('type') not in link_types:
continue
for junction in junction_entries:
knowledge_id = junction.get('cAIKnowledgeId')
if not knowledge_id:
continue
related_entity = link_def.get('entity')
if not related_entity:
continue
try:
knowledge = await self.espocrm.get_entity('CAIKnowledge', knowledge_id)
activation_status = knowledge.get('activationStatus')
collection_id = knowledge.get('datenbankId')
related_def = await self.espocrm.get_entity_def(related_entity)
related_fields = related_def.get('fields', {}) if isinstance(related_def, dict) else {}
select_fields = ['id']
if 'xaiCollectionId' in related_fields:
select_fields.append('xaiCollectionId')
offset = 0
page_size = 100
while True:
result = await self.espocrm.list_related(
entity_type,
document_id,
link_name,
select=','.join(select_fields),
offset=offset,
max_size=page_size
)
entities = result.get('list', [])
if not entities:
break
for entity in entities:
collection_id = entity.get('xaiCollectionId')
if collection_id:
if activation_status == 'active' and collection_id:
# Existing collection - use it
collections.add(collection_id)
self._log(f" ✅ CAIKnowledge {knowledge_id}: {collection_id} (active)")
elif activation_status == 'new':
# Collection doesn't exist yet - return special marker
# Format: "NEW:{knowledge_id}" signals to caller: trigger knowledge sync first
collections.add(f"NEW:{knowledge_id}")
self._log(f" 🆕 CAIKnowledge {knowledge_id}: status='new' → collection must be created first")
else:
self._log(f" ⏭️ CAIKnowledge {knowledge_id}: status={activation_status}, datenbankId={collection_id or 'N/A'}")
if len(entities) < page_size:
break
offset += page_size
except Exception as e:
self._log(f" ⚠️ Failed to load CAIKnowledge {knowledge_id}: {e}", level='warn')
except Exception as e:
self._log(f" ⚠️ Fehler beim Prüfen von Link {link_name}: {e}", level='warn')
continue
except Exception as e:
self._log(f" ⚠️ Failed to check CAIKnowledge junction: {e}", level='warn')
result = list(collections)
self._log(f"📊 Gesamt: {len(result)} eindeutige Collection(s) gefunden")

View File

@@ -58,6 +58,10 @@ class EspoCRMAPI:
self._entity_defs_cache: Dict[str, Dict[str, Any]] = {}
self._entity_defs_cache_ttl_seconds = int(os.getenv('ESPOCRM_METADATA_TTL_SECONDS', '300'))
# Metadata cache (complete metadata loaded once)
self._metadata_cache: Optional[Dict[str, Any]] = None
self._metadata_cache_ts: float = 0
# Optional Redis for caching/rate limiting (centralized)
self.redis_client = get_redis_client(strict=False)
if self.redis_client:
@@ -87,20 +91,76 @@ class EspoCRMAPI:
if self._session and not self._session.closed:
await self._session.close()
async def get_entity_def(self, entity_type: str) -> Dict[str, Any]:
async def get_metadata(self) -> Dict[str, Any]:
"""
Get complete EspoCRM metadata (cached).
Loads once and caches for TTL duration.
Much faster than individual entity def calls.
Returns:
Complete metadata dict with entityDefs, clientDefs, etc.
"""
now = time.monotonic()
cached = self._entity_defs_cache.get(entity_type)
if cached and (now - cached['ts']) < self._entity_defs_cache_ttl_seconds:
return cached['data']
# Return cached if still valid
if (self._metadata_cache is not None and
(now - self._metadata_cache_ts) < self._entity_defs_cache_ttl_seconds):
return self._metadata_cache
# Load fresh metadata
try:
data = await self.api_call(f"/Metadata/EntityDefs/{entity_type}", method='GET')
except EspoCRMAPIError:
all_defs = await self.api_call("/Metadata/EntityDefs", method='GET')
data = all_defs.get(entity_type, {}) if isinstance(all_defs, dict) else {}
self._log("📥 Loading complete EspoCRM metadata...", level='debug')
metadata = await self.api_call("/Metadata", method='GET')
self._entity_defs_cache[entity_type] = {'ts': now, 'data': data}
return data
if not isinstance(metadata, dict):
self._log("⚠️ Metadata response is not a dict, using empty", level='warn')
metadata = {}
# Cache it
self._metadata_cache = metadata
self._metadata_cache_ts = now
entity_count = len(metadata.get('entityDefs', {}))
self._log(f"✅ Metadata cached: {entity_count} entity definitions", level='debug')
return metadata
except Exception as e:
self._log(f"❌ Failed to load metadata: {e}", level='error')
# Return empty dict as fallback
return {}
async def get_entity_def(self, entity_type: str) -> Dict[str, Any]:
"""
Get entity definition for a specific entity type (cached via metadata).
Uses complete metadata cache - much faster and correct API usage.
Args:
entity_type: Entity type (e.g., 'Document', 'CDokumente', 'Account')
Returns:
Entity definition dict with fields, links, etc.
"""
try:
metadata = await self.get_metadata()
entity_defs = metadata.get('entityDefs', {})
if not isinstance(entity_defs, dict):
self._log(f"⚠️ entityDefs is not a dict for {entity_type}", level='warn')
return {}
entity_def = entity_defs.get(entity_type, {})
if not entity_def:
self._log(f"⚠️ No entity definition found for '{entity_type}'", level='debug')
return entity_def
except Exception as e:
self._log(f"⚠️ Could not load entity def for {entity_type}: {e}", level='warn')
return {}
async def api_call(
self,
@@ -540,3 +600,131 @@ class EspoCRMAPI:
)
"""
await self.update_entity(junction_entity, junction_id, fields)
async def get_knowledge_documents_with_junction(
    self,
    knowledge_id: str
) -> List[Dict[str, Any]]:
    """
    Get all documents linked to a CAIKnowledge entry with junction data.

    Uses custom EspoCRM endpoint: GET /JunctionData/CAIKnowledge/{knowledge_id}/dokumentes

    Returns enriched list with:
    - junctionId: Junction table ID
    - cAIKnowledgeId, cDokumenteId: Junction keys
    - aiDocumentId: XAI document ID from junction
    - syncstatus: Sync status from junction (new, synced, failed, unclean)
    - lastSync: Last sync timestamp from junction
    - documentId, documentName: Document info
    - blake3hash: Blake3 hash from document entity
    - documentCreatedAt, documentModifiedAt: Document timestamps

    This consolidates multiple API calls into one efficient query.

    Args:
        knowledge_id: CAIKnowledge entity ID

    Returns:
        List of document dicts with junction data. Empty list when the
        endpoint returns 404 (knowledge base not found / no documents).

    Raises:
        EspoCRMTimeoutError: on request timeout.
        EspoCRMAPIError: on HTTP >= 400 (other than 404) or network error.

    Example:
        docs = await espocrm.get_knowledge_documents_with_junction('69b1b03582bb6e2da')
        for doc in docs:
            print(f"{doc['documentName']}: {doc['syncstatus']}")
    """
    # JunctionData endpoint is at root level, not under /api/v1
    base_url = self.api_base_url.rstrip('/').replace('/api/v1', '')
    url = f"{base_url}/JunctionData/CAIKnowledge/{knowledge_id}/dokumentes"
    self._log(f"GET {url}")
    try:
        session = await self._get_session()
        timeout = aiohttp.ClientTimeout(total=self.api_timeout_seconds)
        async with session.get(url, headers=self._get_headers(), timeout=timeout) as response:
            self._log(f"Response status: {response.status}")
            if response.status == 404:
                # Knowledge base not found or no documents linked
                return []
            if response.status >= 400:
                error_text = await response.text()
                raise EspoCRMAPIError(f"JunctionData GET failed: {response.status} - {error_text}")
            result = await response.json()
            # Endpoint wraps the documents in a 'list' key.
            documents = result.get('list', [])
            self._log(f"✅ Loaded {len(documents)} document(s) with junction data")
            return documents
    except asyncio.TimeoutError:
        raise EspoCRMTimeoutError(f"Timeout getting junction data for knowledge {knowledge_id}")
    except aiohttp.ClientError as e:
        raise EspoCRMAPIError(f"Network error getting junction data: {e}")
async def update_knowledge_document_junction(
    self,
    knowledge_id: str,
    document_id: str,
    fields: Dict[str, Any],
    update_last_sync: bool = True
) -> Dict[str, Any]:
    """
    Update junction columns for a specific document link.

    Uses custom EspoCRM endpoint:
    PUT /JunctionData/CAIKnowledge/{knowledge_id}/dokumentes/{document_id}

    Args:
        knowledge_id: CAIKnowledge entity ID
        document_id: CDokumente entity ID
        fields: Junction fields to update (aiDocumentId, syncstatus, etc.)
        update_last_sync: Whether to update lastSync timestamp (default: True).
            When True, an 'updateLastSync' flag is added to the payload so
            the server stamps lastSync itself.

    Returns:
        Updated junction data (server response, includes 'junctionId')

    Raises:
        EspoCRMTimeoutError: on request timeout.
        EspoCRMAPIError: on HTTP >= 400 or network error.

    Example:
        await espocrm.update_knowledge_document_junction(
            '69b1b03582bb6e2da',
            '69a68b556a39771bf',
            {
                'aiDocumentId': 'xai-file-abc123',
                'syncstatus': 'synced'
            },
            update_last_sync=True
        )
    """
    # JunctionData endpoint is at root level, not under /api/v1
    base_url = self.api_base_url.rstrip('/').replace('/api/v1', '')
    url = f"{base_url}/JunctionData/CAIKnowledge/{knowledge_id}/dokumentes/{document_id}"
    # Copy fields so the caller's dict is not mutated by the flag below.
    payload = {**fields}
    if update_last_sync:
        payload['updateLastSync'] = True
    self._log(f"PUT {url}")
    self._log(f" Payload: {payload}")
    try:
        session = await self._get_session()
        timeout = aiohttp.ClientTimeout(total=self.api_timeout_seconds)
        async with session.put(url, headers=self._get_headers(), json=payload, timeout=timeout) as response:
            self._log(f"Response status: {response.status}")
            if response.status >= 400:
                error_text = await response.text()
                raise EspoCRMAPIError(f"JunctionData PUT failed: {response.status} - {error_text}")
            result = await response.json()
            self._log(f"✅ Junction updated: junctionId={result.get('junctionId')}")
            return result
    except asyncio.TimeoutError:
        raise EspoCRMTimeoutError(f"Timeout updating junction data")
    except aiohttp.ClientError as e:
        raise EspoCRMAPIError(f"Network error updating junction data: {e}")

View File

@@ -236,7 +236,8 @@ class XAIService:
data = await response.json()
collection_id = data.get('id')
# API returns 'collection_id' not 'id'
collection_id = data.get('collection_id') or data.get('id')
self._log(f"✅ Collection created: {collection_id}")
return data
@@ -308,7 +309,18 @@ class XAIService:
GET https://management-api.x.ai/v1/collections/{collection_id}/documents
Returns:
List von document objects mit file_id, filename, hash, fields
List von normalized document objects:
[
{
'file_id': 'file_...',
'filename': 'doc.pdf',
'blake3_hash': 'hex_string', # Plain hex, kein prefix
'size_bytes': 12345,
'content_type': 'application/pdf',
'fields': {}, # Custom metadata
'status': 'DOCUMENT_STATUS_...'
}
]
Raises:
RuntimeError: bei HTTP-Fehler
@@ -328,16 +340,31 @@ class XAIService:
data = await response.json()
# API sollte eine Liste zurückgeben oder ein dict mit 'documents' key
# API gibt Liste zurück oder dict mit 'documents' key
if isinstance(data, list):
documents = data
raw_documents = data
elif isinstance(data, dict) and 'documents' in data:
documents = data['documents']
raw_documents = data['documents']
else:
documents = []
raw_documents = []
self._log(f"✅ Listed {len(documents)} documents")
return documents
# Normalize nested structure: file_metadata -> top-level
normalized = []
for doc in raw_documents:
file_meta = doc.get('file_metadata', {})
normalized.append({
'file_id': file_meta.get('file_id'),
'filename': file_meta.get('name'),
'blake3_hash': file_meta.get('hash'), # Plain hex string
'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
'content_type': file_meta.get('content_type'),
'created_at': file_meta.get('created_at'),
'fields': doc.get('fields', {}),
'status': doc.get('status')
})
self._log(f"✅ Listed {len(normalized)} documents")
return normalized
async def get_collection_document(self, collection_id: str, file_id: str) -> Optional[Dict]:
"""
@@ -346,12 +373,14 @@ class XAIService:
GET https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id}
Returns:
Dict mit document info including BLAKE3 hash:
Normalized dict mit document info:
{
'file_id': 'file_xyz',
'filename': 'document.pdf',
'hash': 'blake3:abcd1234...', # BLAKE3 Hash!
'fields': {...} # Metadata
'blake3_hash': 'hex_string', # Plain hex, kein prefix
'size_bytes': 12345,
'content_type': 'application/pdf',
'fields': {...} # Custom metadata
}
Returns None if not found.
@@ -374,8 +403,21 @@ class XAIService:
data = await response.json()
self._log(f"✅ Document info retrieved: {data.get('filename', 'N/A')}")
return data
# Normalize nested structure
file_meta = data.get('file_metadata', {})
normalized = {
'file_id': file_meta.get('file_id'),
'filename': file_meta.get('name'),
'blake3_hash': file_meta.get('hash'), # Plain hex
'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
'content_type': file_meta.get('content_type'),
'created_at': file_meta.get('created_at'),
'fields': data.get('fields', {}),
'status': data.get('status')
}
self._log(f"✅ Document info retrieved: {normalized.get('filename', 'N/A')}")
return normalized
async def update_document_metadata(
self,

View File

@@ -18,7 +18,7 @@ config = {
'description': 'Runs calendar sync automatically every 15 minutes',
'flows': ['advoware-calendar-sync'],
'triggers': [
cron("0 */15 * * * *") # Every 15 minutes at second 0 (6-field: sec min hour day month weekday)
cron("0 15 1 * * *") # Daily at 01:15:00 (6-field: sec min hour day month weekday) — comment updated: schedule is no longer every 15 minutes
],
'enqueues': ['calendar_sync_all']
}

View File

@@ -152,6 +152,42 @@ async def handle_create_or_update(entity_id: str, document: Dict[str, Any], sync
if collection_ids:
ctx.logger.info(f" Collections: {collection_ids}")
# ═══════════════════════════════════════════════════════════════
# CHECK: Knowledge Bases mit Status "new" (noch keine Collection)
# ═══════════════════════════════════════════════════════════════
new_knowledge_bases = [cid for cid in collection_ids if cid.startswith('NEW:')]
if new_knowledge_bases:
ctx.logger.info("")
ctx.logger.info("=" * 80)
ctx.logger.info("🆕 DOKUMENT IST MIT KNOWLEDGE BASE(S) VERKNÜPFT (Status: new)")
ctx.logger.info("=" * 80)
for new_kb in new_knowledge_bases:
kb_id = new_kb[4:] # Remove "NEW:" prefix
ctx.logger.info(f"📋 CAIKnowledge {kb_id}")
ctx.logger.info(f" Status: new → Collection muss zuerst erstellt werden")
# Trigger Knowledge Sync
ctx.logger.info(f"📤 Triggering aiknowledge.sync event...")
await ctx.emit('aiknowledge.sync', {
'entity_id': kb_id,
'entity_type': 'CAIKnowledge',
'triggered_by': 'document_sync',
'document_id': entity_id
})
ctx.logger.info(f"✅ Event emitted for {kb_id}")
# Release lock and skip document sync - knowledge sync will handle documents
ctx.logger.info("")
ctx.logger.info("=" * 80)
ctx.logger.info("✅ KNOWLEDGE SYNC GETRIGGERT")
ctx.logger.info(" Document Sync wird übersprungen")
ctx.logger.info(" (Knowledge Sync erstellt Collection und synchronisiert dann Dokumente)")
ctx.logger.info("=" * 80)
await sync_utils.release_sync_lock(entity_id, success=True, entity_type=entity_type)
return
# ═══════════════════════════════════════════════════════════════
# PREVIEW-GENERIERUNG bei neuen/geänderten Dateien
# ═══════════════════════════════════════════════════════════════