fix: Improve orphan detection and Blake3 hash verification in document synchronization

fix: Update sync mode logging to clarify Blake3 hash verification status
2026-03-13 08:40:20 +00:00 · 2026-03-12 23:09:21 +00:00
1 changed files with 50 additions and 26 deletions
--- a/services/aiknowledge_sync_utils.py
+++ b/services/aiknowledge_sync_utils.py
@@ -282,7 +282,8 @@ class AIKnowledgeSync(BaseSyncUtils):
        successful = 0
        failed = 0
        skipped = 0        
-
+        # Track aiDocumentIds for orphan detection (collected during sync)
        synced_file_ids: set = set()
        for doc in documents:
            doc_id = doc['documentId']
            doc_name = doc.get('documentName', 'Unknown')
@@ -303,7 +304,17 @@ class AIKnowledgeSync(BaseSyncUtils):
                if junction_status in ['new', 'unclean', 'failed']:
                    needs_sync = True
                    reason = f"status={junction_status}"
-                elif junction_status == 'synced' and blake3_hash and ai_document_id:
+                elif junction_status == 'synced':
                    # Synced status should have both blake3_hash and ai_document_id
                    if not blake3_hash:
                        needs_sync = True
                        reason = "inconsistency: synced but no blake3 hash"
                        ctx.logger.warn(f"   ⚠️  Synced document missing blake3 hash!")
                    elif not ai_document_id:
                        needs_sync = True
                        reason = "inconsistency: synced but no aiDocumentId"
                        ctx.logger.warn(f"   ⚠️  Synced document missing aiDocumentId!")
                    else:
                        # Verify Blake3 hash with XAI (always, since hash from JunctionData API is free)
                        try:
                            xai_doc_info = await xai.get_collection_document(collection_id, ai_document_id)
@@ -319,11 +330,17 @@ class AIKnowledgeSync(BaseSyncUtils):
                            else:
                                needs_sync = True
                                reason = "file not found in XAI collection"
                                ctx.logger.warn(f"   ⚠️  Document marked synced but not in XAI!")
                        except Exception as e:
-                        ctx.logger.warn(f"   ⚠️  Failed to verify Blake3: {e}")
+                            needs_sync = True
                            reason = f"verification failed: {e}"
                            ctx.logger.warn(f"   ⚠️  Failed to verify Blake3, will re-sync: {e}")
                if not needs_sync:
                    ctx.logger.info(f"   ⏭️  Skipped (no sync needed)")
                    # Document is already synced, track its aiDocumentId
                    if ai_document_id:
                        synced_file_ids.add(ai_document_id)
                    skipped += 1
                    continue
@@ -338,24 +355,27 @@ class AIKnowledgeSync(BaseSyncUtils):
                    failed += 1
                    continue
-                # Get attachment details for MIME type
+                # Get attachment details for MIME type and original filename
                try:
                    attachment = await espocrm.get_entity('Attachment', attachment_id)
                    mime_type = attachment.get('type', 'application/octet-stream')
                    file_size = attachment.get('size', 0)
                    original_filename = attachment.get('name', doc_name)  # Original filename with extension
                except Exception as e:
                    ctx.logger.warn(f"   ⚠️  Failed to get attachment details: {e}, using defaults")
                    mime_type = 'application/octet-stream'
                    file_size = 0
                    original_filename = doc_name
                ctx.logger.info(f"   📎 Attachment: {attachment_id} ({mime_type}, {file_size} bytes)")
                ctx.logger.info(f"   📄 Original filename: {original_filename}")
                # Download document
                file_content = await espocrm.download_attachment(attachment_id)
                ctx.logger.info(f"   📥 Downloaded {len(file_content)} bytes")
-                # Upload to XAI
+                # Upload to XAI with original filename (includes extension)
-                filename = doc_name
+                filename = original_filename
                xai_file_id = await xai.upload_file(file_content, filename, mime_type)
                ctx.logger.info(f"   📤 Uploaded to XAI: {xai_file_id}")
@@ -376,6 +396,9 @@ class AIKnowledgeSync(BaseSyncUtils):
                )
                ctx.logger.info(f"   ✅ Junction updated")
                # Track the new aiDocumentId for orphan detection
                synced_file_ids.add(xai_file_id)
                successful += 1
            except Exception as e:
@@ -403,11 +426,12 @@ class AIKnowledgeSync(BaseSyncUtils):
            xai_documents = await xai.list_collection_documents(collection_id)
            xai_file_ids = {doc.get('file_id') for doc in xai_documents if doc.get('file_id')}
-            # Get all ai_document_ids from junction
+            # Use synced_file_ids (collected during this sync) for orphan detection
-            junction_file_ids = {doc.get('aiDocumentId') for doc in documents if doc.get('aiDocumentId')}
+            # This includes both pre-existing synced docs and newly uploaded ones
            ctx.logger.info(f"   XAI has {len(xai_file_ids)} files, we have {len(synced_file_ids)} synced")
-            # Find orphans (in XAI but not in junction)
+            # Find orphans (in XAI but not in our current sync)
-            orphans = xai_file_ids - junction_file_ids
+            orphans = xai_file_ids - synced_file_ids
            if orphans:
                ctx.logger.info(f"   Found {len(orphans)} orphaned file(s)")
@@ -432,7 +456,7 @@ class AIKnowledgeSync(BaseSyncUtils):
        ctx.logger.info(f"   ✅ Synced: {successful}")
        ctx.logger.info(f"   ⏭️  Skipped: {skipped}")
        ctx.logger.info(f"   ❌ Failed: {failed}")
-        ctx.logger.info(f"   Mode: {'FULL SYNC (Blake3 verification)' if full_sync else 'INCREMENTAL'}")
+        ctx.logger.info(f"   Mode: Blake3 hash verification enabled")
        ctx.logger.info("=" * 80)
    def _calculate_metadata_hash(self, document: Dict) -> str:
Author	SHA1	Message	Date
bsiggel	bb13d59ddb	fix: Improve orphan detection and Blake3 hash verification in document synchronization	2026-03-13 08:40:20 +00:00
bsiggel	b0fceef4e2	fix: Update sync mode logging to clarify Blake3 hash verification status	2026-03-12 23:09:21 +00:00