feat(sync): Add EML to TXT conversion for improved document handling in RAGFlow sync
This commit is contained in:
@@ -674,6 +674,63 @@ async def _run_ragflow_sync(
|
||||
file_content = await espocrm.download_attachment(attachment_id)
|
||||
ctx.logger.info(f" Downloaded {len(file_content)} bytes")
|
||||
|
||||
# ── EML → TXT Konvertierung ───────────────────────────────
|
||||
if filename.lower().endswith('.eml'):
|
||||
try:
|
||||
import email as _email
|
||||
from bs4 import BeautifulSoup
|
||||
msg = _email.message_from_bytes(file_content)
|
||||
subject = msg.get('Subject', '')
|
||||
from_ = msg.get('From', '')
|
||||
date = msg.get('Date', '')
|
||||
plain_parts, html_parts = [], []
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
ct = part.get_content_type()
|
||||
if ct == 'text/plain':
|
||||
plain_parts.append(part.get_payload(decode=True).decode(
|
||||
part.get_content_charset() or 'utf-8', errors='replace'))
|
||||
elif ct == 'text/html':
|
||||
html_parts.append(part.get_payload(decode=True).decode(
|
||||
part.get_content_charset() or 'utf-8', errors='replace'))
|
||||
else:
|
||||
ct = msg.get_content_type()
|
||||
payload = msg.get_payload(decode=True).decode(
|
||||
msg.get_content_charset() or 'utf-8', errors='replace')
|
||||
if ct == 'text/html':
|
||||
html_parts.append(payload)
|
||||
else:
|
||||
plain_parts.append(payload)
|
||||
if plain_parts:
|
||||
body = '\n\n'.join(plain_parts)
|
||||
elif html_parts:
|
||||
soup = BeautifulSoup('\n'.join(html_parts), 'html.parser')
|
||||
for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
|
||||
tag.decompose()
|
||||
body = '\n'.join(
|
||||
line.strip()
|
||||
for line in soup.get_text(separator='\n').splitlines()
|
||||
if line.strip()
|
||||
)
|
||||
else:
|
||||
body = ''
|
||||
header = (
|
||||
f"Betreff: {subject}\n"
|
||||
f"Von: {from_}\n"
|
||||
f"Datum: {date}\n"
|
||||
f"{'-' * 80}\n\n"
|
||||
)
|
||||
converted_text = (header + body).strip()
|
||||
file_content = converted_text.encode('utf-8')
|
||||
filename = filename[:-4] + '.txt'
|
||||
mime_type = 'text/plain'
|
||||
ctx.logger.info(
|
||||
f" 📧 EML→TXT konvertiert: {len(file_content)} bytes "
|
||||
f"(blake3 des Original-EML bleibt erhalten)"
|
||||
)
|
||||
except Exception as eml_err:
|
||||
ctx.logger.warn(f" ⚠️ EML-Konvertierung fehlgeschlagen, lade roh hoch: {eml_err}")
|
||||
|
||||
ctx.logger.info(f" 📤 Uploading '{filename}' ({mime_type})…")
|
||||
result = await ragflow.upload_document(
|
||||
dataset_id=dataset_id,
|
||||
|
||||
Reference in New Issue
Block a user