From 0e521f22f8614edddc0af3395d1053eb5e283966 Mon Sep 17 00:00:00 2001 From: bsiggel Date: Tue, 3 Mar 2026 09:28:49 +0000 Subject: [PATCH] feat(preview-generation): implement thumbnail generation for documents; add preview upload to EspoCRM --- docs/DOCUMENT_SYNC_XAI_STATUS.md | 115 +++------------ services/document_sync_utils.py | 203 +++++++++++++++++++------- services/espocrm.py | 114 +++++++++++++++ steps/vmh/document_sync_event_step.py | 87 +++++++++++ 4 files changed, 371 insertions(+), 148 deletions(-) diff --git a/docs/DOCUMENT_SYNC_XAI_STATUS.md b/docs/DOCUMENT_SYNC_XAI_STATUS.md index 4821814..38c84d3 100644 --- a/docs/DOCUMENT_SYNC_XAI_STATUS.md +++ b/docs/DOCUMENT_SYNC_XAI_STATUS.md @@ -20,110 +20,41 @@ ## ⏳ In Arbeit -### 4. Thumbnail-Generierung (`generate_thumbnail()`) +### 4. Preview-Generierung (`generate_thumbnail()`) -**Anforderungen:** -- Erste Seite eines PDFs als Vorschaubild -- DOCX/DOC → PDF → Image Konvertierung -- Bild-Dateien: Resize auf Thumbnail-Größe -- Fallback: Generic File-Icons basierend auf MIME-Type +**✅ Implementiert** - Bereit zum Installieren der Dependencies + +**Konfiguration:** +- **Feld in EspoCRM**: `preview` (Attachment) +- **Format**: **WebP** (bessere Kompression als PNG/JPEG) +- **Größe**: **600x800px** (behält Aspect Ratio) +- **Qualität**: 85% (guter Kompromiss zwischen Qualität und Dateigröße) + +**Unterstützte Formate:** +- ✅ PDF: Erste Seite als Preview +- ✅ DOCX/DOC: Konvertierung zu PDF, dann erste Seite +- ✅ Images (JPG, PNG, etc.): Resize auf Preview-Größe +- ❌ Andere: Kein Preview (TODO: Generic File-Icons) **Benötigte Dependencies:** ```bash # Python Packages -pip install pdf2image python-docx Pillow docx2pdf +pip install pdf2image Pillow docx2pdf # System Dependencies (Ubuntu/Debian) apt-get install poppler-utils libreoffice ``` -**Implementierungs-Schritte:** +**Installation:** +```bash +cd /opt/motia-iii/bitbylaw +/opt/bin/uv pip install pdf2image Pillow docx2pdf -1. **PDF Handling** (Priorität 1): -```python -from pdf2image import convert_from_path -from PIL import Image -import io - -def generate_pdf_thumbnail(pdf_path: str) -> bytes: - # Konvertiere erste Seite zu Image - images = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=150) - thumbnail = images[0] - - # Resize auf Thumbnail-Größe (z.B. 200x280) - thumbnail.thumbnail((200, 280), Image.Resampling.LANCZOS) - - # Convert zu bytes - buffer = io.BytesIO() - thumbnail.save(buffer, format='PNG') - return buffer.getvalue() +# System packages +sudo apt-get update +sudo apt-get install -y poppler-utils libreoffice ``` -2. **DOCX Handling** (Priorität 2): -```python -from docx2pdf import convert -import tempfile -import os - -def generate_docx_thumbnail(docx_path: str) -> bytes: - # Temporäres PDF erstellen - with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp: - pdf_path = tmp.name - - # DOCX → PDF Konvertierung (benötigt LibreOffice) - convert(docx_path, pdf_path) - - # PDF-Thumbnail generieren - thumbnail = generate_pdf_thumbnail(pdf_path) - - # Cleanup - os.remove(pdf_path) - - return thumbnail -``` - -3. **Image Handling** (Priorität 3): -```python -from PIL import Image -import io - -def generate_image_thumbnail(image_path: str) -> bytes: - img = Image.open(image_path) - img.thumbnail((200, 280), Image.Resampling.LANCZOS) - - buffer = io.BytesIO() - img.save(buffer, format='PNG') - return buffer.getvalue() -``` - -4. **Thumbnail Upload zu EspoCRM**: -```python -# EspoCRM unterstützt Preview-Images via Attachment API -async def upload_thumbnail_to_espocrm( - document_id: str, - thumbnail_bytes: bytes, - espocrm_api -): - # Create Attachment - attachment_data = { - 'name': 'preview.png', - 'type': 'image/png', - 'role': 'Inline Attachment', - 'parentType': 'Document', - 'parentId': document_id, - 'field': 'previewImage' # Custom field? - } - - # Upload via EspoCRM Attachment API - # POST /api/v1/Attachment mit multipart/form-data - # TODO: espocrm.py muss upload_attachment() Methode bekommen -``` - -**Offene Fragen:** -- Welches Feld in EspoCRM Document für Preview? `previewImage`? `thumbnail`? -- Größe des Thumbnails? (empfohlen: 200x280 oder 300x400) -- Format: PNG oder JPEG? - ## ❌ Noch nicht implementiert ### 5. xAI Service (`xai_service.py`) @@ -202,7 +133,7 @@ class XAIService: - `xaiSyncedHash` (String): Hash beim letzten erfolgreichen Sync - `xaiSyncStatus` (Enum): "syncing", "synced", "failed" - `xaiSyncError` (Text): Fehlermeldung bei Sync-Fehler -- `previewImage` (Attachment?): Vorschaubild +- **`preview` (Attachment)**: Vorschaubild im WebP-Format (600x800px) ## 🚀 Nächste Schritte diff --git a/services/document_sync_utils.py b/services/document_sync_utils.py index fbfdef9..953a164 100644 --- a/services/document_sync_utils.py +++ b/services/document_sync_utils.py @@ -362,96 +362,187 @@ class DocumentSync: self._log(f"❌ Fehler beim Laden von Download-Info: {e}", level='error') return None - async def generate_thumbnail(self, file_path: str, mime_type: str) -> Optional[bytes]: + async def generate_thumbnail(self, file_path: str, mime_type: str, max_width: int = 600, max_height: int = 800) -> Optional[bytes]: """ - Generiert Vorschaubild (Thumbnail) für ein Document + Generiert Vorschaubild (Preview) für ein Document im WebP-Format Unterstützt: - PDF: Erste Seite als Bild - DOCX/DOC: Konvertierung zu PDF, dann erste Seite - - Images: Resize auf Thumbnail-Größe + - Images: Resize auf Preview-Größe - Andere: Platzhalter-Icon basierend auf MIME-Type Args: - file_path: Pfad zur Datei (lokal oder Download-URL) + file_path: Pfad zur Datei (lokal) mime_type: MIME-Type des Documents + max_width: Maximale Breite (default: 600px) + max_height: Maximale Höhe (default: 800px) Returns: - Thumbnail als bytes (PNG/JPEG) oder None bei Fehler + Preview als WebP bytes oder None bei Fehler """ - self._log(f"🖼️ Thumbnail-Generierung für {mime_type}") + self._log(f"🖼️ Preview-Generierung für {mime_type} (max: {max_width}x{max_height})") - # TODO: Implementierung - # - # Benötigte Libraries: - # - pdf2image (für PDF → Image) - # - python-docx + docx2pdf (für DOCX → PDF → Image) - # - Pillow (PIL) für Image-Processing - # - poppler-utils (System-Dependency für pdf2image) - # - # Implementierungs-Schritte: - # - # 1. PDF-Handling: - # from pdf2image import convert_from_path - # images = convert_from_path(file_path, first_page=1, last_page=1) - # thumbnail = images[0].resize((200, 280)) - # return thumbnail_to_bytes(thumbnail) - # - # 2. DOCX-Handling: - # - Konvertiere zu temporärem PDF - # - Dann wie PDF behandeln - # - # 3. Image-Handling: - # from PIL import Image - # img = Image.open(file_path) - # img.thumbnail((200, 280)) - # return image_to_bytes(img) - # - # 4. Fallback: - # - Generic file-type icon basierend auf MIME-Type - - self._log(f"⚠️ Thumbnail-Generierung noch nicht implementiert", level='warn') - return None + try: + from PIL import Image + import io + + thumbnail = None + + # PDF-Handling + if mime_type == 'application/pdf': + try: + from pdf2image import convert_from_path + self._log(" Converting PDF page 1 to image...") + images = convert_from_path(file_path, first_page=1, last_page=1, dpi=150) + if images: + thumbnail = images[0] + except ImportError: + self._log("⚠️ pdf2image nicht installiert - überspringe PDF-Preview", level='warn') + return None + except Exception as e: + self._log(f"⚠️ PDF-Konvertierung fehlgeschlagen: {e}", level='warn') + return None + + # DOCX/DOC-Handling + elif mime_type in ['application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/msword']: + try: + import tempfile + import os + from docx2pdf import convert + from pdf2image import convert_from_path + + self._log(" Converting DOCX → PDF → Image...") + + # Temporäres PDF erstellen + with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp: + pdf_path = tmp.name + + # DOCX → PDF (benötigt LibreOffice) + convert(file_path, pdf_path) + + # PDF → Image + images = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=150) + if images: + thumbnail = images[0] + + # Cleanup + os.remove(pdf_path) + + except ImportError: + self._log("⚠️ docx2pdf nicht installiert - überspringe DOCX-Preview", level='warn') + return None + except Exception as e: + self._log(f"⚠️ DOCX-Konvertierung fehlgeschlagen: {e}", level='warn') + return None + + # Image-Handling + elif mime_type.startswith('image/'): + try: + self._log(" Processing image file...") + thumbnail = Image.open(file_path) + except Exception as e: + self._log(f"⚠️ Image-Laden fehlgeschlagen: {e}", level='warn') + return None + + else: + self._log(f"⚠️ Keine Preview-Generierung für MIME-Type: {mime_type}", level='warn') + return None + + if not thumbnail: + return None + + # Resize auf max dimensions (behält Aspect Ratio) + thumbnail.thumbnail((max_width, max_height), Image.Resampling.LANCZOS) + + # Convert zu WebP bytes + buffer = io.BytesIO() + thumbnail.save(buffer, format='WEBP', quality=85) + webp_bytes = buffer.getvalue() + + self._log(f"✅ Preview generiert: {len(webp_bytes)} bytes WebP") + return webp_bytes + + except Exception as e: + self._log(f"❌ Fehler bei Preview-Generierung: {e}", level='error') + import traceback + self._log(traceback.format_exc(), level='debug') + return None async def update_sync_metadata( self, document_id: str, - xai_file_id: str, - collection_ids: List[str], + xai_file_id: Optional[str] = None, + collection_ids: Optional[List[str]] = None, file_hash: Optional[str] = None, - thumbnail_data: Optional[bytes] = None + preview_data: Optional[bytes] = None ) -> None: """ Updated Document-Metadaten nach erfolgreichem xAI-Sync Args: document_id: EspoCRM Document ID - xai_file_id: xAI File ID - collection_ids: Liste der xAI Collection IDs + xai_file_id: xAI File ID (optional - setzt nur wenn vorhanden) + collection_ids: Liste der xAI Collection IDs (optional) file_hash: MD5/SHA Hash des gesyncten Files - thumbnail_data: Vorschaubild als bytes + preview_data: Vorschaubild (WebP) als bytes """ try: - update_data = { - 'xaiFileId': xai_file_id, - 'xaiCollections': collection_ids, - 'dateiStatus': 'Gesynct', # Status zurücksetzen - } + update_data = {} + + # Nur xAI-Felder updaten wenn vorhanden + if xai_file_id: + update_data['xaiFileId'] = xai_file_id + + if collection_ids is not None: + update_data['xaiCollections'] = collection_ids + + # Nur Status auf "Gesynct" setzen wenn xAI-File-ID vorhanden + if xai_file_id: + update_data['dateiStatus'] = 'Gesynct' # Hash speichern für zukünftige Change Detection if file_hash: update_data['xaiSyncedHash'] = file_hash - # Thumbnail als Attachment hochladen (falls vorhanden) - if thumbnail_data: - # TODO: Implementiere Thumbnail-Upload zu EspoCRM - # EspoCRM unterstützt Preview-Images für Documents - # Muss als separates Attachment hochgeladen werden - self._log(f"⚠️ Thumbnail-Upload noch nicht implementiert", level='warn') + # Preview als Attachment hochladen (falls vorhanden) + if preview_data: + await self._upload_preview_to_espocrm(document_id, preview_data) - await self.espocrm.update_entity('Document', document_id, update_data) - self._log(f"✅ Sync-Metadaten aktualisiert für Document {document_id}") + # Nur updaten wenn es etwas zu updaten gibt + if update_data: + await self.espocrm.update_entity('Document', document_id, update_data) + self._log(f"✅ Sync-Metadaten aktualisiert für Document {document_id}: {list(update_data.keys())}") except Exception as e: self._log(f"❌ Fehler beim Update von Sync-Metadaten: {e}", level='error') raise + + async def _upload_preview_to_espocrm(self, document_id: str, preview_data: bytes) -> None: + """ + Lädt Preview-Image als Attachment zu EspoCRM hoch + + Args: + document_id: Document ID + preview_data: WebP Preview als bytes + """ + try: + self._log(f"📤 Uploading preview image ({len(preview_data)} bytes)...") + + # Upload via EspoCRM Attachment API + await self.espocrm.upload_attachment( + file_content=preview_data, + filename='preview.webp', + parent_type='Document', + parent_id=document_id, + field='preview', + mime_type='image/webp', + role='Attachment' + ) + + self._log(f"✅ Preview erfolgreich hochgeladen") + + except Exception as e: + self._log(f"❌ Fehler beim Preview-Upload: {e}", level='error') + # Don't raise - Preview ist optional, Sync sollte trotzdem erfolgreich sein diff --git a/services/espocrm.py b/services/espocrm.py index 39f2807..79ce100 100644 --- a/services/espocrm.py +++ b/services/espocrm.py @@ -298,3 +298,117 @@ class EspoCRMAPI: result = await self.list_entities(entity_type, where=where) return result.get('list', []) + + async def upload_attachment( + self, + file_content: bytes, + filename: str, + parent_type: str, + parent_id: str, + field: str, + mime_type: str = 'application/octet-stream', + role: str = 'Attachment' + ) -> Dict[str, Any]: + """ + Upload an attachment to EspoCRM. + + Args: + file_content: File content as bytes + filename: Name of the file + parent_type: Parent entity type (e.g., 'Document') + parent_id: Parent entity ID + field: Field name for the attachment (e.g., 'preview') + mime_type: MIME type of the file + role: Attachment role (default: 'Attachment') + + Returns: + Attachment entity data + """ + self._log(f"Uploading attachment: {filename} ({len(file_content)} bytes) to {parent_type}/{parent_id}/{field}") + + url = self.api_base_url.rstrip('/') + '/Attachment' + headers = { + 'X-Api-Key': self.api_key, + # Content-Type wird automatisch von aiohttp gesetzt für FormData + } + + # Erstelle FormData + form_data = aiohttp.FormData() + form_data.add_field('file', file_content, filename=filename, content_type=mime_type) + form_data.add_field('parentType', parent_type) + form_data.add_field('parentId', parent_id) + form_data.add_field('field', field) + form_data.add_field('role', role) + form_data.add_field('name', filename) + + effective_timeout = aiohttp.ClientTimeout(total=self.api_timeout_seconds) + + async with aiohttp.ClientSession(timeout=effective_timeout) as session: + try: + async with session.post(url, headers=headers, data=form_data) as response: + self._log(f"Upload response status: {response.status}", level='debug') + + if response.status == 401: + raise EspoCRMAuthError("Authentication failed - check API key") + elif response.status == 403: + raise EspoCRMError("Access forbidden") + elif response.status == 404: + raise EspoCRMError(f"Attachment endpoint not found") + elif response.status >= 400: + error_text = await response.text() + raise EspoCRMError(f"Upload error {response.status}: {error_text}") + + # Parse response + if response.content_type == 'application/json': + result = await response.json() + attachment_id = result.get('id') + self._log(f"✅ Attachment uploaded successfully: {attachment_id}") + return result + else: + response_text = await response.text() + self._log(f"⚠️ Non-JSON response: {response_text[:200]}", level='warn') + return {'success': True, 'response': response_text} + + except aiohttp.ClientError as e: + self._log(f"Upload failed: {e}", level='error') + raise EspoCRMError(f"Upload request failed: {e}") from e + + async def download_attachment(self, attachment_id: str) -> bytes: + """ + Download an attachment from EspoCRM. + + Args: + attachment_id: Attachment ID + + Returns: + File content as bytes + """ + self._log(f"Downloading attachment: {attachment_id}") + + url = self.api_base_url.rstrip('/') + f'/Attachment/file/{attachment_id}' + headers = { + 'X-Api-Key': self.api_key, + } + + effective_timeout = aiohttp.ClientTimeout(total=self.api_timeout_seconds) + + async with aiohttp.ClientSession(timeout=effective_timeout) as session: + try: + async with session.get(url, headers=headers) as response: + if response.status == 401: + raise EspoCRMAuthError("Authentication failed - check API key") + elif response.status == 403: + raise EspoCRMError("Access forbidden") + elif response.status == 404: + raise EspoCRMError(f"Attachment not found: {attachment_id}") + elif response.status >= 400: + error_text = await response.text() + raise EspoCRMError(f"Download error {response.status}: {error_text}") + + content = await response.read() + self._log(f"✅ Downloaded {len(content)} bytes") + return content + + except aiohttp.ClientError as e: + self._log(f"Download failed: {e}", level='error') + raise EspoCRMError(f"Download request failed: {e}") from e diff --git a/steps/vmh/document_sync_event_step.py b/steps/vmh/document_sync_event_step.py index f66df7e..f94a567 100644 --- a/steps/vmh/document_sync_event_step.py +++ b/steps/vmh/document_sync_event_step.py @@ -146,15 +146,102 @@ async def handle_create_or_update(entity_id: str, document: Dict[str, Any], sync ctx.logger.info("🔍 ANALYSE: Braucht dieses Document xAI-Sync?") ctx.logger.info("=" * 80) + # Datei-Status für Preview-Generierung + datei_status = document.get('dateiStatus') or document.get('fileStatus') + # Entscheidungslogik: Soll dieses Document zu xAI? needs_sync, collection_ids, reason = await sync_utils.should_sync_to_xai(document) ctx.logger.info(f"📊 Entscheidung: {'✅ SYNC NÖTIG' if needs_sync else '⏭️ KEIN SYNC NÖTIG'}") ctx.logger.info(f" Grund: {reason}") + ctx.logger.info(f" Datei-Status: {datei_status or 'N/A'}") if collection_ids: ctx.logger.info(f" Collections: {collection_ids}") + # ═══════════════════════════════════════════════════════════════ + # PREVIEW-GENERIERUNG bei neuen/geänderten Dateien + # ═══════════════════════════════════════════════════════════════ + + if datei_status in ['Neu', 'Geändert', 'neu', 'geändert', 'New', 'Changed']: + ctx.logger.info("") + ctx.logger.info("=" * 80) + ctx.logger.info("🖼️ PREVIEW-GENERIERUNG STARTEN") + ctx.logger.info(f" Datei-Status: {datei_status}") + ctx.logger.info("=" * 80) + + try: + # 1. Hole Download-Informationen + download_info = await sync_utils.get_document_download_info(entity_id) + + if not download_info: + ctx.logger.warn("⚠️ Keine Download-Info verfügbar - überspringe Preview") + else: + ctx.logger.info(f"📥 Datei-Info:") + ctx.logger.info(f" Filename: {download_info['filename']}") + ctx.logger.info(f" MIME-Type: {download_info['mime_type']}") + ctx.logger.info(f" Size: {download_info['size']} bytes") + + # 2. Download File von EspoCRM + ctx.logger.info(f"📥 Downloading file...") + espocrm = sync_utils.espocrm + file_content = await espocrm.download_attachment(download_info['attachment_id']) + ctx.logger.info(f"✅ Downloaded {len(file_content)} bytes") + + # 3. Speichere temporär für Preview-Generierung + import tempfile + import os + + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{download_info['filename']}") as tmp_file: + tmp_file.write(file_content) + tmp_path = tmp_file.name + + try: + # 4. Generiere Preview + ctx.logger.info(f"🖼️ Generating preview (600x800 WebP)...") + preview_data = await sync_utils.generate_thumbnail( + tmp_path, + download_info['mime_type'], + max_width=600, + max_height=800 + ) + + if preview_data: + ctx.logger.info(f"✅ Preview generated: {len(preview_data)} bytes WebP") + + # 5. Upload Preview zu EspoCRM + ctx.logger.info(f"📤 Uploading preview to EspoCRM...") + await sync_utils.update_sync_metadata( + entity_id, + preview_data=preview_data + # Keine xaiFileId/collections - nur Preview update + ) + ctx.logger.info(f"✅ Preview uploaded successfully") + else: + ctx.logger.warn("⚠️ Preview-Generierung lieferte keine Daten") + + finally: + # Cleanup temp file + try: + os.remove(tmp_path) + except: + pass + + except Exception as e: + ctx.logger.error(f"❌ Fehler bei Preview-Generierung: {e}") + import traceback + ctx.logger.error(traceback.format_exc()) + # Continue - Preview ist optional + + ctx.logger.info("") + ctx.logger.info("=" * 80) + ctx.logger.info("✅ PREVIEW-VERARBEITUNG ABGESCHLOSSEN") + ctx.logger.info("=" * 80) + + # ═══════════════════════════════════════════════════════════════ + # xAI SYNC (falls erforderlich) + # ═══════════════════════════════════════════════════════════════ + if not needs_sync: ctx.logger.info("✅ Kein xAI-Sync erforderlich, Lock wird released") await sync_utils.release_sync_lock(entity_id, success=True)