feat(sync): Add EML to TXT conversion for improved document handling in RAGflow sync
This commit is contained in:
@@ -674,6 +674,63 @@ async def _run_ragflow_sync(
|
|||||||
file_content = await espocrm.download_attachment(attachment_id)
|
file_content = await espocrm.download_attachment(attachment_id)
|
||||||
ctx.logger.info(f" Downloaded {len(file_content)} bytes")
|
ctx.logger.info(f" Downloaded {len(file_content)} bytes")
|
||||||
|
|
||||||
|
# ── EML → TXT Konvertierung ───────────────────────────────
|
||||||
|
if filename.lower().endswith('.eml'):
|
||||||
|
try:
|
||||||
|
import email as _email
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
msg = _email.message_from_bytes(file_content)
|
||||||
|
subject = msg.get('Subject', '')
|
||||||
|
from_ = msg.get('From', '')
|
||||||
|
date = msg.get('Date', '')
|
||||||
|
plain_parts, html_parts = [], []
|
||||||
|
if msg.is_multipart():
|
||||||
|
for part in msg.walk():
|
||||||
|
ct = part.get_content_type()
|
||||||
|
if ct == 'text/plain':
|
||||||
|
plain_parts.append(part.get_payload(decode=True).decode(
|
||||||
|
part.get_content_charset() or 'utf-8', errors='replace'))
|
||||||
|
elif ct == 'text/html':
|
||||||
|
html_parts.append(part.get_payload(decode=True).decode(
|
||||||
|
part.get_content_charset() or 'utf-8', errors='replace'))
|
||||||
|
else:
|
||||||
|
ct = msg.get_content_type()
|
||||||
|
payload = msg.get_payload(decode=True).decode(
|
||||||
|
msg.get_content_charset() or 'utf-8', errors='replace')
|
||||||
|
if ct == 'text/html':
|
||||||
|
html_parts.append(payload)
|
||||||
|
else:
|
||||||
|
plain_parts.append(payload)
|
||||||
|
if plain_parts:
|
||||||
|
body = '\n\n'.join(plain_parts)
|
||||||
|
elif html_parts:
|
||||||
|
soup = BeautifulSoup('\n'.join(html_parts), 'html.parser')
|
||||||
|
for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
|
||||||
|
tag.decompose()
|
||||||
|
body = '\n'.join(
|
||||||
|
line.strip()
|
||||||
|
for line in soup.get_text(separator='\n').splitlines()
|
||||||
|
if line.strip()
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
body = ''
|
||||||
|
header = (
|
||||||
|
f"Betreff: {subject}\n"
|
||||||
|
f"Von: {from_}\n"
|
||||||
|
f"Datum: {date}\n"
|
||||||
|
f"{'-' * 80}\n\n"
|
||||||
|
)
|
||||||
|
converted_text = (header + body).strip()
|
||||||
|
file_content = converted_text.encode('utf-8')
|
||||||
|
filename = filename[:-4] + '.txt'
|
||||||
|
mime_type = 'text/plain'
|
||||||
|
ctx.logger.info(
|
||||||
|
f" 📧 EML→TXT konvertiert: {len(file_content)} bytes "
|
||||||
|
f"(blake3 des Original-EML bleibt erhalten)"
|
||||||
|
)
|
||||||
|
except Exception as eml_err:
|
||||||
|
ctx.logger.warn(f" ⚠️ EML-Konvertierung fehlgeschlagen, lade roh hoch: {eml_err}")
|
||||||
|
|
||||||
ctx.logger.info(f" 📤 Uploading '{filename}' ({mime_type})…")
|
ctx.logger.info(f" 📤 Uploading '{filename}' ({mime_type})…")
|
||||||
result = await ragflow.upload_document(
|
result = await ragflow.upload_document(
|
||||||
dataset_id=dataset_id,
|
dataset_id=dataset_id,
|
||||||
|
|||||||
Reference in New Issue
Block a user