feat(sync): Add EML to TXT conversion for improved document handling in RAGflow sync

2026-03-27 01:23:52 +00:00
parent 61113d8f3d
commit c20baeb21a
1 changed files with 57 additions and 0 deletions
--- a/src/steps/crm/akte/akte_sync_event_step.py
+++ b/src/steps/crm/akte/akte_sync_event_step.py
@@ -674,6 +674,63 @@ async def _run_ragflow_sync(
                    file_content = await espocrm.download_attachment(attachment_id)
                    ctx.logger.info(f"     Downloaded {len(file_content)} bytes")
                    # ── EML → TXT conversion ──────────────────────────────────
                    # A downloaded .eml file is replaced by a plain-text rendering
                    # (subject/sender/date header block followed by the message body).
                    # If anything in the conversion fails, file_content, filename and
                    # mime_type are left untouched and only a warning is logged.
                    if filename.lower().endswith('.eml'):
                        try:
                            import email as _email
                            from email.errors import MessageError
                            from email.header import decode_header, make_header
                            from bs4 import BeautifulSoup

                            msg = _email.message_from_bytes(file_content)

                            # The default (compat32) parser hands headers back exactly as
                            # transmitted, so non-ASCII values arrive as RFC 2047
                            # encoded-words ("=?utf-8?q?...?="). Decode them into readable
                            # text; fall back to the raw value if decoding is impossible.
                            readable = {}
                            for header_name in ('Subject', 'From'):
                                raw_value = msg.get(header_name, '')
                                try:
                                    readable[header_name] = str(
                                        make_header(decode_header(raw_value)))
                                except (LookupError, UnicodeError, ValueError, MessageError):
                                    readable[header_name] = str(raw_value)
                            subject = readable['Subject']
                            from_ = readable['From']
                            date = msg.get('Date', '')

                            # Collect the text bodies. In a multipart message only
                            # text/plain and text/html parts are used; a single-part
                            # message is treated as plain text unless it is text/html.
                            plain_parts, html_parts = [], []
                            multipart = msg.is_multipart()
                            for part in (msg.walk() if multipart else [msg]):
                                ct = part.get_content_type()
                                if ct == 'text/html':
                                    bucket = html_parts
                                elif ct == 'text/plain' or not multipart:
                                    bucket = plain_parts
                                else:
                                    continue
                                # get_payload(decode=True) may return None; treat as empty.
                                raw = part.get_payload(decode=True) or b''
                                charset = part.get_content_charset() or 'utf-8'
                                try:
                                    bucket.append(raw.decode(charset, errors='replace'))
                                except LookupError:
                                    # Unknown charset label: do not abort the whole
                                    # conversion, decode as UTF-8 with replacement instead.
                                    bucket.append(raw.decode('utf-8', errors='replace'))

                            # Plain text wins; HTML is only used when no plain part exists.
                            if plain_parts:
                                body = '\n\n'.join(plain_parts)
                            elif html_parts:
                                soup = BeautifulSoup('\n'.join(html_parts), 'html.parser')
                                # Drop non-content elements before extracting the text.
                                for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
                                    tag.decompose()
                                # Strip every line and discard the empty ones.
                                body = '\n'.join(
                                    line.strip()
                                    for line in soup.get_text(separator='\n').splitlines()
                                    if line.strip()
                                )
                            else:
                                body = ''

                            header = (
                                f"Betreff: {subject}\n"
                                f"Von: {from_}\n"
                                f"Datum: {date}\n"
                                f"{'-' * 80}\n\n"
                            )
                            converted_text = (header + body).strip()
                            # Reassign only after everything above succeeded, so a failed
                            # conversion never leaves a half-updated upload.
                            file_content = converted_text.encode('utf-8')
                            filename = filename[:-4] + '.txt'
                            mime_type = 'text/plain'
                            ctx.logger.info(
                                f"     📧 EML→TXT konvertiert: {len(file_content)} bytes "
                                f"(blake3 des Original-EML bleibt erhalten)"
                            )
                        except Exception as eml_err:
                            # Deliberate best-effort: conversion problems must not stop the
                            # sync, so log the error and continue with the original values.
                            ctx.logger.warn(f"     ⚠️  EML-Konvertierung fehlgeschlagen, lade roh hoch: {eml_err}")
                    ctx.logger.info(f"     📤 Uploading '{filename}' ({mime_type})…")
                    result = await ragflow.upload_document(
                        dataset_id=dataset_id,