fix: Improve orphan detection and Blake3 hash verification in document synchronization

This commit is contained in:
bsiggel
2026-03-13 08:40:20 +00:00
parent b0fceef4e2
commit bb13d59ddb

View File

@@ -281,8 +281,9 @@ class AIKnowledgeSync(BaseSyncUtils):
# ═══════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════
successful = 0 successful = 0
failed = 0 failed = 0
skipped = 0 skipped = 0
# Track aiDocumentIds for orphan detection (collected during sync)
synced_file_ids: set = set()
for doc in documents: for doc in documents:
doc_id = doc['documentId'] doc_id = doc['documentId']
doc_name = doc.get('documentName', 'Unknown') doc_name = doc.get('documentName', 'Unknown')
@@ -303,27 +304,43 @@ class AIKnowledgeSync(BaseSyncUtils):
if junction_status in ['new', 'unclean', 'failed']: if junction_status in ['new', 'unclean', 'failed']:
needs_sync = True needs_sync = True
reason = f"status={junction_status}" reason = f"status={junction_status}"
elif junction_status == 'synced' and blake3_hash and ai_document_id: elif junction_status == 'synced':
# Verify Blake3 hash with XAI (always, since hash from JunctionData API is free) # Synced status should have both blake3_hash and ai_document_id
try: if not blake3_hash:
xai_doc_info = await xai.get_collection_document(collection_id, ai_document_id) needs_sync = True
if xai_doc_info: reason = "inconsistency: synced but no blake3 hash"
xai_blake3 = xai_doc_info.get('blake3_hash') ctx.logger.warn(f" ⚠️ Synced document missing blake3 hash!")
elif not ai_document_id:
if xai_blake3 != blake3_hash: needs_sync = True
needs_sync = True reason = "inconsistency: synced but no aiDocumentId"
reason = f"blake3 mismatch (XAI: {xai_blake3[:16] if xai_blake3 else 'N/A'}... vs EspoCRM: {blake3_hash[:16]}...)" ctx.logger.warn(f" ⚠️ Synced document missing aiDocumentId!")
ctx.logger.info(f" 🔄 Blake3 mismatch detected!") else:
# Verify Blake3 hash with XAI (always, since hash from JunctionData API is free)
try:
xai_doc_info = await xai.get_collection_document(collection_id, ai_document_id)
if xai_doc_info:
xai_blake3 = xai_doc_info.get('blake3_hash')
if xai_blake3 != blake3_hash:
needs_sync = True
reason = f"blake3 mismatch (XAI: {xai_blake3[:16] if xai_blake3 else 'N/A'}... vs EspoCRM: {blake3_hash[:16]}...)"
ctx.logger.info(f" 🔄 Blake3 mismatch detected!")
else:
ctx.logger.info(f" ✅ Blake3 hash matches")
else: else:
ctx.logger.info(f" ✅ Blake3 hash matches") needs_sync = True
else: reason = "file not found in XAI collection"
ctx.logger.warn(f" ⚠️ Document marked synced but not in XAI!")
except Exception as e:
needs_sync = True needs_sync = True
reason = "file not found in XAI collection" reason = f"verification failed: {e}"
except Exception as e: ctx.logger.warn(f" ⚠️ Failed to verify Blake3, will re-sync: {e}")
ctx.logger.warn(f" ⚠️ Failed to verify Blake3: {e}")
if not needs_sync: if not needs_sync:
ctx.logger.info(f" ⏭️ Skipped (no sync needed)") ctx.logger.info(f" ⏭️ Skipped (no sync needed)")
# Document is already synced, track its aiDocumentId
if ai_document_id:
synced_file_ids.add(ai_document_id)
skipped += 1 skipped += 1
continue continue
@@ -338,24 +355,27 @@ class AIKnowledgeSync(BaseSyncUtils):
failed += 1 failed += 1
continue continue
# Get attachment details for MIME type # Get attachment details for MIME type and original filename
try: try:
attachment = await espocrm.get_entity('Attachment', attachment_id) attachment = await espocrm.get_entity('Attachment', attachment_id)
mime_type = attachment.get('type', 'application/octet-stream') mime_type = attachment.get('type', 'application/octet-stream')
file_size = attachment.get('size', 0) file_size = attachment.get('size', 0)
original_filename = attachment.get('name', doc_name) # Original filename with extension
except Exception as e: except Exception as e:
ctx.logger.warn(f" ⚠️ Failed to get attachment details: {e}, using defaults") ctx.logger.warn(f" ⚠️ Failed to get attachment details: {e}, using defaults")
mime_type = 'application/octet-stream' mime_type = 'application/octet-stream'
file_size = 0 file_size = 0
original_filename = doc_name
ctx.logger.info(f" 📎 Attachment: {attachment_id} ({mime_type}, {file_size} bytes)") ctx.logger.info(f" 📎 Attachment: {attachment_id} ({mime_type}, {file_size} bytes)")
ctx.logger.info(f" 📄 Original filename: {original_filename}")
# Download document # Download document
file_content = await espocrm.download_attachment(attachment_id) file_content = await espocrm.download_attachment(attachment_id)
ctx.logger.info(f" 📥 Downloaded {len(file_content)} bytes") ctx.logger.info(f" 📥 Downloaded {len(file_content)} bytes")
# Upload to XAI # Upload to XAI with original filename (includes extension)
filename = doc_name filename = original_filename
xai_file_id = await xai.upload_file(file_content, filename, mime_type) xai_file_id = await xai.upload_file(file_content, filename, mime_type)
ctx.logger.info(f" 📤 Uploaded to XAI: {xai_file_id}") ctx.logger.info(f" 📤 Uploaded to XAI: {xai_file_id}")
@@ -376,6 +396,9 @@ class AIKnowledgeSync(BaseSyncUtils):
) )
ctx.logger.info(f" ✅ Junction updated") ctx.logger.info(f" ✅ Junction updated")
# Track the new aiDocumentId for orphan detection
synced_file_ids.add(xai_file_id)
successful += 1 successful += 1
except Exception as e: except Exception as e:
@@ -403,11 +426,12 @@ class AIKnowledgeSync(BaseSyncUtils):
xai_documents = await xai.list_collection_documents(collection_id) xai_documents = await xai.list_collection_documents(collection_id)
xai_file_ids = {doc.get('file_id') for doc in xai_documents if doc.get('file_id')} xai_file_ids = {doc.get('file_id') for doc in xai_documents if doc.get('file_id')}
# Get all ai_document_ids from junction # Use synced_file_ids (collected during this sync) for orphan detection
junction_file_ids = {doc.get('aiDocumentId') for doc in documents if doc.get('aiDocumentId')} # This includes both pre-existing synced docs and newly uploaded ones
ctx.logger.info(f" XAI has {len(xai_file_ids)} files, we have {len(synced_file_ids)} synced")
# Find orphans (in XAI but not in junction) # Find orphans (in XAI but not in our current sync)
orphans = xai_file_ids - junction_file_ids orphans = xai_file_ids - synced_file_ids
if orphans: if orphans:
ctx.logger.info(f" Found {len(orphans)} orphaned file(s)") ctx.logger.info(f" Found {len(orphans)} orphaned file(s)")