motia-iii/services/document_sync_utils.py

"""
Document Sync Utilities

Utility functions for document synchronization with xAI:
- Distributed locking via Redis + syncStatus
- Decision logic: When does a document need xAI sync?
- Related entities determination (Many-to-Many attachments)
- xAI Collection management
"""

from typing import Dict, Any, Optional, List, Tuple
from datetime import datetime, timedelta

from services.sync_utils_base import BaseSyncUtils
from services.models import FileStatus, XAISyncStatus

# Max retry before permanent failure
MAX_SYNC_RETRIES = 5

# Retry backoff: Wartezeit zwischen Retries (in Minuten)
RETRY_BACKOFF_MINUTES = [1, 5, 15, 60, 240]  # 1min, 5min, 15min, 1h, 4h

# Legacy file status values (for backward compatibility)
# These are old German and English status values that may still exist in the database
LEGACY_NEW_STATUS_VALUES = {'neu', 'Neu', 'New'}
LEGACY_CHANGED_STATUS_VALUES = {'geändert', 'Geändert', 'Changed'}
LEGACY_SYNCED_STATUS_VALUES = {'synced', 'Synced', 'synchronized', 'Synchronized'}


class DocumentSync(BaseSyncUtils):
    """Utility class for document synchronization with xAI"""

    def _get_lock_key(self, entity_id: str) -> str:
        """Redis lock key for documents"""
        return f"sync_lock:document:{entity_id}"

    async def acquire_sync_lock(self, entity_id: str, entity_type: str = 'CDokumente') -> bool:
        """
        Atomic distributed lock via Redis + syncStatus update

        Args:
            entity_id: EspoCRM Document ID
            entity_type: Entity-Type (CDokumente oder Document)

        Returns:
            True wenn Lock erfolgreich, False wenn bereits im Sync
        """
        try:
            # STEP 1: Atomic Redis lock (prevents race conditions)
            lock_key = self._get_lock_key(entity_id)
            if not self._acquire_redis_lock(lock_key):
                self._log(f"Redis lock bereits aktiv für {entity_type} {entity_id}", level='warn')
                return False

            # STEP 2: Update xaiSyncStatus to pending_sync
            try:
                await self.espocrm.update_entity(entity_type, entity_id, {
                    'xaiSyncStatus': XAISyncStatus.PENDING_SYNC.value
                })
            except Exception as e:
                self._log(f"Could not set xaiSyncStatus: {e}", level='debug')

            self._log(f"Sync-Lock für {entity_type} {entity_id} erworben")
            return True

        except Exception as e:
            self._log(f"Fehler beim Acquire Lock: {e}", level='error')
            # Clean up Redis lock on error
            lock_key = self._get_lock_key(entity_id)
            self._release_redis_lock(lock_key)
            return False

    async def release_sync_lock(
        self,
        entity_id: str,
        success: bool = True,
        error_message: Optional[str] = None,
        extra_fields: Optional[Dict[str, Any]] = None,
        entity_type: str = 'CDokumente'
    ) -> None:
        """
        Gibt Sync-Lock frei und setzt finalen Status

        Args:
            entity_id: EspoCRM Document ID
            success: Ob Sync erfolgreich war
            error_message: Optional: Fehlermeldung
            extra_fields: Optional: Zusätzliche Felder (z.B. xaiFileId, xaiCollections)
            entity_type: Entity-Type (CDokumente oder Document)
        """
        try:
            update_data = {}

            # Set xaiSyncStatus: clean on success, failed on error
            try:
                update_data['xaiSyncStatus'] = XAISyncStatus.CLEAN.value if success else XAISyncStatus.FAILED.value

                if error_message:
                    update_data['xaiSyncError'] = error_message[:2000]
                else:
                    update_data['xaiSyncError'] = None
            except:
                pass  # Fields may not exist

            # Merge extra fields (z.B. xaiFileId, xaiCollections)
            if extra_fields:
                update_data.update(extra_fields)

            if update_data:
                await self.espocrm.update_entity(entity_type, entity_id, update_data)

            self._log(f"Sync-Lock released: {entity_type} {entity_id} → {'success' if success else 'failed'}")

            # Release Redis lock
            lock_key = self._get_lock_key(entity_id)
            self._release_redis_lock(lock_key)

        except Exception as e:
            self._log(f"Fehler beim Release Lock: {e}", level='error')
            # Ensure Redis lock is released even on error
            lock_key = self._get_lock_key(entity_id)
            self._release_redis_lock(lock_key)

    async def should_sync_to_xai(
        self,
        document: Dict[str, Any],
        entity_type: str = 'CDokumente'
    ) -> Tuple[bool, List[str], str]:
        """
        Decide if a document needs to be synchronized to xAI.

        Checks:
        1. File status field ("new", "changed")
        2. Hash values for change detection
        3. Related entities with xAI collections

        Args:
            document: Complete document entity from EspoCRM

        Returns:
            Tuple[bool, List[str], str]:
                - bool: Whether sync is needed
                - List[str]: List of collection IDs where the document should go
                - str: Reason/description of the decision
        """
        doc_id = document.get('id')
        doc_name = document.get('name', 'Unbenannt')

        # xAI-relevant fields
        xai_file_id = document.get('xaiFileId')
        xai_collections = document.get('xaiCollections') or []
        xai_sync_status = document.get('xaiSyncStatus')

        # File status and hash fields
        datei_status = document.get('dateiStatus') or document.get('fileStatus')
        file_md5 = document.get('md5') or document.get('fileMd5')
        file_sha = document.get('sha') or document.get('fileSha')
        xai_synced_hash = document.get('xaiSyncedHash')  # Hash at last xAI sync

        self._log(f"📋 Document analysis: {doc_name} (ID: {doc_id})")
        self._log(f"   xaiFileId: {xai_file_id or 'N/A'}")
        self._log(f"   xaiCollections: {xai_collections}")
        self._log(f"   xaiSyncStatus: {xai_sync_status or 'N/A'}")
        self._log(f"   fileStatus: {datei_status or 'N/A'}")
        self._log(f"   MD5: {file_md5[:16] if file_md5 else 'N/A'}...")
        self._log(f"   SHA: {file_sha[:16] if file_sha else 'N/A'}...")
        self._log(f"   xaiSyncedHash: {xai_synced_hash[:16] if xai_synced_hash else 'N/A'}...")

        # Determine target collections from relations (CDokumente -> linked entities)
        target_collections = await self._get_required_collections_from_relations(
            doc_id,
            entity_type=entity_type
        )

        # Check xaiSyncStatus="no_sync" -> no sync for this document
        if xai_sync_status == XAISyncStatus.NO_SYNC.value:
            self._log("⏭️  No xAI sync needed: xaiSyncStatus='no_sync'")
            return (False, [], "xaiSyncStatus is 'no_sync'")

        if not target_collections:
            self._log("⏭️  No xAI sync needed: No related entities with xAI collections")
            return (False, [], "No linked entities with xAI collections")

        # ═══════════════════════════════════════════════════════════════
        # PRIORITY CHECK 1: xaiSyncStatus="unclean" -> document was changed
        # ═══════════════════════════════════════════════════════════════
        if xai_sync_status == XAISyncStatus.UNCLEAN.value:
            self._log(f"🆕 xaiSyncStatus='unclean' → xAI sync REQUIRED")
            return (True, target_collections, "xaiSyncStatus='unclean'")

        # ═══════════════════════════════════════════════════════════════
        # PRIORITY CHECK 2: fileStatus "new" or "changed"
        # ═══════════════════════════════════════════════════════════════
        # Check for standard enum values and legacy values
        is_new = (datei_status == FileStatus.NEW.value or datei_status in LEGACY_NEW_STATUS_VALUES)
        is_changed = (datei_status == FileStatus.CHANGED.value or datei_status in LEGACY_CHANGED_STATUS_VALUES)

        if is_new or is_changed:
            self._log(f"🆕 fileStatus: '{datei_status}' → xAI sync REQUIRED")

            if target_collections:
                return (True, target_collections, f"fileStatus: {datei_status}")
            else:
                # File is new/changed but no collections found
                self._log(f"⚠️  fileStatus '{datei_status}' but no collections found - skipping sync")
                return (False, [], f"fileStatus: {datei_status}, but no collections")

        # ═══════════════════════════════════════════════════════════════
        # CASE 1: Document is already in xAI AND collections are set
        # ═══════════════════════════════════════════════════════════════
        if xai_file_id:
            self._log(f"✅ Document already synced to xAI with {len(target_collections)} collection(s)")

            # Check if file content was changed (hash comparison)
            current_hash = file_md5 or file_sha

            if current_hash and xai_synced_hash:
                if current_hash != xai_synced_hash:
                    self._log(f"🔄 Hash change detected! RESYNC required")
                    self._log(f"   Old: {xai_synced_hash[:16]}...")
                    self._log(f"   New: {current_hash[:16]}...")
                    return (True, target_collections, "File content changed (hash mismatch)")
                else:
                    self._log(f"✅ Hash identical - no change")
            else:
                self._log(f"⚠️  No hash values available for comparison")

            return (False, target_collections, "Already synced, no change detected")

        # ═══════════════════════════════════════════════════════════════
        # CASE 2: Document has xaiFileId but collections is empty/None
        # ═══════════════════════════════════════════════════════════════
        # ═══════════════════════════════════════════════════════════════
        # CASE 3: Collections present but no status/hash trigger
        # ═══════════════════════════════════════════════════════════════
        self._log(f"✅ Document is linked to {len(target_collections)} entity/ies with collections")
        return (True, target_collections, "Linked to entities that require collections")

    async def _get_required_collections_from_relations(
        self,
        document_id: str,
        entity_type: str = 'Document'
    ) -> List[str]:
        """
        Determine all xAI collection IDs of CAIKnowledge entities linked to this document.

        Checks CAIKnowledgeCDokumente junction table:
        - Status 'active' + datenbankId: Returns collection ID
        - Status 'new': Returns "NEW:{knowledge_id}" marker (collection must be created first)
        - Other statuses (paused, deactivated): Skips

        Args:
            document_id: Document ID
            entity_type: Entity type (e.g., 'CDokumente')

        Returns:
            List of collection IDs or markers:
            - Normal IDs: "abc123..." (existing collections)
            - New markers: "NEW:kb-id..." (collection needs to be created via knowledge sync)
        """
        collections = set()

        self._log(f"🔍 Checking relations of {entity_type} {document_id}...")

        # ═══════════════════════════════════════════════════════════════
        # SPECIAL HANDLING: CAIKnowledge via Junction Table
        # ═══════════════════════════════════════════════════════════════
        try:
            junction_entries = await self.espocrm.get_junction_entries(
                'CAIKnowledgeCDokumente',
                'cDokumenteId',
                document_id
            )

            if junction_entries:
                self._log(f"   📋 Found {len(junction_entries)} CAIKnowledge link(s)")

                for junction in junction_entries:
                    knowledge_id = junction.get('cAIKnowledgeId')
                    if not knowledge_id:
                        continue

                    try:
                        knowledge = await self.espocrm.get_entity('CAIKnowledge', knowledge_id)
                        activation_status = knowledge.get('activationStatus')
                        collection_id = knowledge.get('datenbankId')

                        if activation_status == 'active' and collection_id:
                            # Existing collection - use it
                            collections.add(collection_id)
                            self._log(f"      ✅ CAIKnowledge {knowledge_id}: {collection_id} (active)")
                        elif activation_status == 'new':
                            # Collection doesn't exist yet - return special marker
                            # Format: "NEW:{knowledge_id}" signals to caller: trigger knowledge sync first
                            collections.add(f"NEW:{knowledge_id}")
                            self._log(f"      🆕 CAIKnowledge {knowledge_id}: status='new' → collection must be created first")
                        else:
                            self._log(f"      ⏭️  CAIKnowledge {knowledge_id}: status={activation_status}, datenbankId={collection_id or 'N/A'}")

                    except Exception as e:
                        self._log(f"      ⚠️  Failed to load CAIKnowledge {knowledge_id}: {e}", level='warn')

        except Exception as e:
            self._log(f"   ⚠️  Failed to check CAIKnowledge junction: {e}", level='warn')

        result = list(collections)
        self._log(f"📊 Gesamt: {len(result)} eindeutige Collection(s) gefunden")

        return result

    async def get_document_download_info(self, document_id: str, entity_type: str = 'CDokumente') -> Optional[Dict[str, Any]]:
        """
        Holt Download-Informationen für ein Document

        Args:
            document_id: ID des Documents
            entity_type: Entity-Type (CDokumente oder Document)

        Returns:
            Dict mit:
                - attachment_id: ID des Attachments
                - download_url: URL zum Download
                - filename: Dateiname
                - mime_type: MIME-Type
                - size: Dateigröße in Bytes
        """
        try:
            # Hole vollständiges Document
            doc = await self.espocrm.get_entity(entity_type, document_id)

            # EspoCRM Documents können Files auf verschiedene Arten speichern:
            # CDokumente: dokumentId/dokumentName (Custom Entity)
            # Document: fileId/fileName ODER attachmentsIds

            attachment_id = None
            filename = None

            # Prüfe zuerst dokumentId (CDokumente Custom Entity)
            if doc.get('dokumentId'):
                attachment_id = doc.get('dokumentId')
                filename = doc.get('dokumentName')
                self._log(f"📎 CDokumente verwendet dokumentId: {attachment_id}")

            # Fallback: fileId (Standard Document Entity)
            elif doc.get('fileId'):
                attachment_id = doc.get('fileId')
                filename = doc.get('fileName')
                self._log(f"📎 Document verwendet fileId: {attachment_id}")

            # Fallback 2: attachmentsIds (z.B. bei zusätzlichen Attachments)
            elif doc.get('attachmentsIds'):
                attachment_ids = doc.get('attachmentsIds')
                if attachment_ids:
                    attachment_id = attachment_ids[0]
                    self._log(f"📎 Document verwendet attachmentsIds: {attachment_id}")

            if not attachment_id:
                self._log(f"⚠️  {entity_type} {document_id} hat weder dokumentId, fileId noch attachmentsIds", level='warn')
                self._log(f"   Verfügbare Felder: {list(doc.keys())}")
                return None

            # Hole Attachment-Details
            attachment = await self.espocrm.get_entity('Attachment', attachment_id)

            # Filename: Nutze dokumentName/fileName falls vorhanden, sonst aus Attachment
            final_filename = filename or attachment.get('name', 'unknown')

            return {
                'attachment_id': attachment_id,
                'download_url': f"/api/v1/Attachment/file/{attachment_id}",
                'filename': final_filename,
                'mime_type': attachment.get('type', 'application/octet-stream'),
                'size': attachment.get('size', 0)
            }

        except Exception as e:
            self._log(f"❌ Fehler beim Laden von Download-Info: {e}", level='error')
            return None

    async def generate_thumbnail(self, file_path: str, mime_type: str, max_width: int = 600, max_height: int = 800) -> Optional[bytes]:
        """
        Generiert Vorschaubild (Preview) für ein Document im WebP-Format

        Unterstützt:
        - PDF: Erste Seite als Bild
        - DOCX/DOC: Konvertierung zu PDF, dann erste Seite
        - Images: Resize auf Preview-Größe
        - Andere: Platzhalter-Icon basierend auf MIME-Type

        Args:
            file_path: Pfad zur Datei (lokal)
            mime_type: MIME-Type des Documents
            max_width: Maximale Breite (default: 600px)
            max_height: Maximale Höhe (default: 800px)

        Returns:
            Preview als WebP bytes oder None bei Fehler
        """
        self._log(f"🖼️  Preview-Generierung für {mime_type} (max: {max_width}x{max_height})")

        try:
            from PIL import Image
            import io

            thumbnail = None

            # PDF-Handling
            if mime_type == 'application/pdf':
                try:
                    from pdf2image import convert_from_path
                    self._log("   Converting PDF page 1 to image...")
                    images = convert_from_path(file_path, first_page=1, last_page=1, dpi=150)
                    if images:
                        thumbnail = images[0]
                except ImportError:
                    self._log("⚠️  pdf2image nicht installiert - überspringe PDF-Preview", level='warn')
                    return None
                except Exception as e:
                    self._log(f"⚠️  PDF-Konvertierung fehlgeschlagen: {e}", level='warn')
                    return None

            # DOCX/DOC-Handling
            elif mime_type in ['application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                              'application/msword']:
                try:
                    import tempfile
                    import os
                    from docx2pdf import convert
                    from pdf2image import convert_from_path

                    self._log("   Converting DOCX → PDF → Image...")

                    # Temporäres PDF erstellen
                    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
                        pdf_path = tmp.name

                    # DOCX → PDF (benötigt LibreOffice)
                    convert(file_path, pdf_path)

                    # PDF → Image
                    images = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=150)
                    if images:
                        thumbnail = images[0]

                    # Cleanup
                    os.remove(pdf_path)

                except ImportError:
                    self._log("⚠️  docx2pdf nicht installiert - überspringe DOCX-Preview", level='warn')
                    return None
                except Exception as e:
                    self._log(f"⚠️  DOCX-Konvertierung fehlgeschlagen: {e}", level='warn')
                    return None

            # Image-Handling
            elif mime_type.startswith('image/'):
                try:
                    self._log("   Processing image file...")
                    thumbnail = Image.open(file_path)
                except Exception as e:
                    self._log(f"⚠️  Image-Laden fehlgeschlagen: {e}", level='warn')
                    return None

            else:
                self._log(f"⚠️  Keine Preview-Generierung für MIME-Type: {mime_type}", level='warn')
                return None

            if not thumbnail:
                return None

            # Resize auf max dimensions (behält Aspect Ratio)
            thumbnail.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)

            # Convert zu WebP bytes
            buffer = io.BytesIO()
            thumbnail.save(buffer, format='WEBP', quality=85)
            webp_bytes = buffer.getvalue()

            self._log(f"✅ Preview generiert: {len(webp_bytes)} bytes WebP")
            return webp_bytes

        except Exception as e:
            self._log(f"❌ Fehler bei Preview-Generierung: {e}", level='error')
            import traceback
            self._log(traceback.format_exc(), level='debug')
            return None

    async def update_sync_metadata(
        self,
        document_id: str,
        xai_file_id: Optional[str] = None,
        collection_ids: Optional[List[str]] = None,
        file_hash: Optional[str] = None,
        preview_data: Optional[bytes] = None,
        reset_file_status: bool = False,
        entity_type: str = 'CDokumente'
    ) -> None:
        """
        Updated Document-Metadaten nach erfolgreichem xAI-Sync oder Preview-Generierung

        Args:
            document_id: EspoCRM Document ID
            xai_file_id: xAI File ID (optional - setzt nur wenn vorhanden)
            collection_ids: Liste der xAI Collection IDs (optional)
            file_hash: MD5/SHA Hash des gesyncten Files
            preview_data: Vorschaubild (WebP) als bytes
            reset_file_status: Ob fileStatus/dateiStatus zurückgesetzt werden soll
            entity_type: Entity-Type (CDokumente oder Document)
        """
        try:
            update_data = {}

            # Nur xAI-Felder updaten wenn vorhanden
            if xai_file_id:
                # CDokumente verwendet xaiId, Document verwendet xaiFileId
                if entity_type == 'CDokumente':
                    update_data['xaiId'] = xai_file_id
                else:
                    update_data['xaiFileId'] = xai_file_id

            if collection_ids is not None:
                update_data['xaiCollections'] = collection_ids

            # fileStatus auf "unchanged" setzen wenn Dokument verarbeitet/clean ist
            if reset_file_status:
                if entity_type == 'CDokumente':
                    update_data['fileStatus'] = 'unchanged'
                else:
                    # Document Entity hat kein fileStatus, nur dateiStatus
                    update_data['dateiStatus'] = 'unchanged'

            # xaiSyncStatus auf "clean" setzen wenn xAI-Sync erfolgreich war
            if xai_file_id:
                update_data['xaiSyncStatus'] = 'clean'

            # Hash speichern für zukünftige Change Detection
            if file_hash:
                update_data['xaiSyncedHash'] = file_hash

            # Preview als Attachment hochladen (falls vorhanden)
            if preview_data:
                await self._upload_preview_to_espocrm(document_id, preview_data, entity_type)

            # Nur updaten wenn es etwas zu updaten gibt
            if update_data:
                await self.espocrm.update_entity(entity_type, document_id, update_data)
                self._log(f"✅ Sync-Metadaten aktualisiert für {entity_type} {document_id}: {list(update_data.keys())}")

        except Exception as e:
            self._log(f"❌ Fehler beim Update von Sync-Metadaten: {e}", level='error')
            raise

    async def _upload_preview_to_espocrm(self, document_id: str, preview_data: bytes, entity_type: str = 'CDokumente') -> None:
        """
        Lädt Preview-Image als Attachment zu EspoCRM hoch

        Args:
            document_id: Document ID
            preview_data: WebP Preview als bytes
            entity_type: Entity-Type (CDokumente oder Document)
        """
        try:
            self._log(f"📤 Uploading preview image to {entity_type} ({len(preview_data)} bytes)...")

            # EspoCRM erwartet base64-encoded file im Format: data:mime/type;base64,xxxxx
            import base64
            import aiohttp

            # Base64-encode preview data
            base64_data = base64.b64encode(preview_data).decode('ascii')
            file_data_uri = f"data:image/webp;base64,{base64_data}"

            # Upload via JSON POST mit base64-encoded file field
            url = self.espocrm.api_base_url.rstrip('/') + '/Attachment'
            headers = {
                'X-Api-Key': self.espocrm.api_key,
                'Content-Type': 'application/json'
            }

            payload = {
                'name': 'preview.webp',
                'type': 'image/webp',
                'role': 'Attachment',
                'field': 'preview',
                'relatedType': entity_type,
                'relatedId': document_id,
                'file': file_data_uri
            }

            self._log(f"📤 Posting to {url} with base64-encoded file ({len(base64_data)} chars)")
            self._log(f"   relatedType={entity_type}, relatedId={document_id}, field=preview")

            timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(url, headers=headers, json=payload) as response:
                    self._log(f"Upload response status: {response.status}")

                    if response.status >= 400:
                        error_text = await response.text()
                        self._log(f"❌ Upload failed: {error_text}", level='error')
                        raise Exception(f"Upload error {response.status}: {error_text}")

                    result = await response.json()
                    attachment_id = result.get('id')
                    self._log(f"✅ Preview Attachment created: {attachment_id}")

            # Update Entity mit previewId
            self._log(f"📝 Updating {entity_type} with previewId...")
            await self.espocrm.update_entity(entity_type, document_id, {
                'previewId': attachment_id,
                'previewName': 'preview.webp'
            })
            self._log(f"✅ {entity_type} previewId/previewName aktualisiert")

        except Exception as e:
            self._log(f"❌ Fehler beim Preview-Upload: {e}", level='error')
            # Don't raise - Preview ist optional, Sync sollte trotzdem erfolgreich sein