feat(sync): Add EML to TXT conversion for improved document handling in RAGflow sync

This commit is contained in:
bsiggel
2026-03-27 01:23:52 +00:00
parent 61113d8f3d
commit c20baeb21a

View File

@@ -674,6 +674,63 @@ async def _run_ragflow_sync(
file_content = await espocrm.download_attachment(attachment_id)
ctx.logger.info(f" Downloaded {len(file_content)} bytes")
# ── EML → TXT conversion ──────────────────────────────────
# An .eml attachment is a raw MIME message. Replace it with a plain-text
# rendering (decoded Subject/From/Date header lines followed by the body)
# before upload. On success `file_content`, `filename` and `mime_type` are
# rebound to the converted document; on any failure they are left untouched
# and the original bytes are uploaded as-is.
if filename.lower().endswith('.eml'):
    try:
        import email as _email
        from email.errors import HeaderParseError as _HeaderParseError
        from email.header import decode_header as _decode_header
        from email.header import make_header as _make_header

        msg = _email.message_from_bytes(file_content)

        # Header values arrive undecoded: they may contain RFC 2047
        # encoded-words (=?utf-8?B?...?=) and folded continuation lines.
        # Decode them and collapse whitespace so each header stays on one
        # readable line of the generated text.
        decoded_headers = {}
        for header_name in ('Subject', 'From', 'Date'):
            raw_value = msg.get(header_name, '')
            try:
                value = str(_make_header(_decode_header(raw_value)))
            except (LookupError, UnicodeError, _HeaderParseError):
                # Unknown charset or malformed encoded-word: keep raw text.
                value = str(raw_value)
            decoded_headers[header_name] = ' '.join(value.split())
        subject = decoded_headers['Subject']
        from_ = decoded_headers['From']
        date = decoded_headers['Date']

        # Collect text bodies. In a multipart message only text/plain and
        # text/html parts are used; a single-part message that is not
        # text/html is treated as plain text.
        single_part = not msg.is_multipart()
        plain_parts, html_parts = [], []
        for part in msg.walk():
            ct = part.get_content_type()
            if ct == 'text/html':
                target = html_parts
            elif ct == 'text/plain' or single_part:
                target = plain_parts
            else:
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                # No decodable payload for this part; nothing to extract.
                continue
            charset = part.get_content_charset() or 'utf-8'
            try:
                text = payload.decode(charset, errors='replace')
            except LookupError:
                # Declared charset is unknown to Python: fall back to UTF-8
                # for this part instead of abandoning the whole conversion.
                text = payload.decode('utf-8', errors='replace')
            target.append(text)

        if plain_parts:
            body = '\n\n'.join(plain_parts)
        elif html_parts:
            # Imported only here so that plain-text mails do not depend on
            # bs4 being importable.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup('\n'.join(html_parts), 'html.parser')
            # Drop non-content elements before extracting the visible text.
            for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
                tag.decompose()
            # One stripped, non-empty line per text fragment.
            body = '\n'.join(
                line.strip()
                for line in soup.get_text(separator='\n').splitlines()
                if line.strip()
            )
        else:
            body = ''

        header = (
            f"Betreff: {subject}\n"
            f"Von: {from_}\n"
            f"Datum: {date}\n"
            f"{'-' * 80}\n\n"
        )
        converted_text = (header + body).strip()

        # Rebind the upload inputs only after the conversion fully succeeded.
        file_content = converted_text.encode('utf-8')
        filename = filename[:-4] + '.txt'
        mime_type = 'text/plain'
        ctx.logger.info(
            f" 📧 EML→TXT konvertiert: {len(file_content)} bytes "
            f"(blake3 des Original-EML bleibt erhalten)"
        )
    except Exception as eml_err:
        # Deliberate best-effort: any parsing/conversion problem falls back
        # to uploading the unmodified .eml.
        ctx.logger.warn(f" ⚠️ EML-Konvertierung fehlgeschlagen, lade roh hoch: {eml_err}")
ctx.logger.info(f" 📤 Uploading '{filename}' ({mime_type})…")
result = await ragflow.upload_document(
dataset_id=dataset_id,