# motia-iii/services/ragflow_service.py

"""RAGFlow Dataset & Document Service"""
import os
import asyncio
from functools import partial
from typing import Optional, List, Dict, Any
from services.logging_utils import get_service_logger

# Fallback RAGFlow endpoint, used when RAGFLOW_BASE_URL is unset or empty.
RAGFLOW_DEFAULT_BASE_URL = "http://192.168.1.64:9380"

# Defaults for document analysis; passed to the dataset's parser_config as
# 'auto_keywords' and 'auto_questions' when a dataset is created.
RAGFLOW_AUTO_KEYWORDS = 14
RAGFLOW_AUTO_QUESTIONS = 7


def _base_to_dict(obj: Any) -> Any:
    """
    Konvertiert ragflow_sdk.modules.base.Base rekursiv zu einem plain dict.
    Filtert den internen 'rag'-Client-Key heraus.
    """
    try:
        from ragflow_sdk.modules.base import Base
        if isinstance(obj, Base):
            return {k: _base_to_dict(v) for k, v in vars(obj).items() if k != 'rag'}
    except ImportError:
        pass
    if isinstance(obj, dict):
        return {k: _base_to_dict(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_base_to_dict(i) for i in obj]
    return obj


class RAGFlowService:
    """
    Client for the RAGFlow API, built on ragflow-sdk (the Python SDK).

    The SDK is synchronous; every call is dispatched through
    loop.run_in_executor so the service can be awaited from async
    Motia steps.

    Data flow on upload:
      upload_document() ->
        1. upload_documents([{blob}])      # upload the file
        2. doc.update({meta_fields})       # set blake3 + advoware fields
        3. async_parse_documents([id])     # start parsing (chunk_method=laws)

    Required environment variables:
    - RAGFLOW_API_KEY   - API key
    - RAGFLOW_BASE_URL  - optional URL override (default: http://192.168.1.64:9380)
    """

    # MIME types that is_mime_type_supported() reports as supported.
    SUPPORTED_MIME_TYPES = {
        'application/pdf',
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.ms-excel',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.oasis.opendocument.text',
        'application/epub+zip',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'text/plain',
        'text/html',
        'text/markdown',
        'text/csv',
        'text/xml',
        'application/json',
        'application/xml',
    }

    def __init__(self, ctx=None):
        """
        Read the RAGFlow configuration from the environment.

        Args:
            ctx: Optional context object; stored and passed to
                get_service_logger.

        Raises:
            ValueError: If RAGFLOW_API_KEY is unset or empty.
        """
        self.ctx = ctx
        self._rag = None  # SDK client, created on the first _get_client() call
        self.api_key = os.getenv('RAGFLOW_API_KEY', '')
        # An empty RAGFLOW_BASE_URL counts as "not set".
        self.base_url = os.getenv('RAGFLOW_BASE_URL', '') or RAGFLOW_DEFAULT_BASE_URL
        self.logger = get_service_logger('ragflow', ctx)

        if self.api_key:
            return
        raise ValueError("RAGFLOW_API_KEY not configured in environment")

    def _log(self, msg: str, level: str = 'info') -> None:
        log_func = getattr(self.logger, level, self.logger.info)
        log_func(msg)

    def _get_client(self):
        """Gibt RAGFlow SDK Client zurueck (lazy init, sync)."""
        if self._rag is None:
            from ragflow_sdk import RAGFlow
            self._rag = RAGFlow(api_key=self.api_key, base_url=self.base_url)
        return self._rag

    async def _run(self, func, *args, **kwargs):
        """Fuehrt synchrone SDK-Funktion in ThreadPoolExecutor aus."""
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(None, partial(func, *args, **kwargs))

    # ========== Dataset Management ==========

    async def create_dataset(
        self,
        name: str,
        chunk_method: str = 'laws',
        embedding_model: Optional[str] = None,
        description: Optional[str] = None,
        auto_keywords: int = RAGFLOW_AUTO_KEYWORDS,
        auto_questions: int = RAGFLOW_AUTO_QUESTIONS,
    ) -> Dict:
        """
        Create a new RAGFlow dataset and apply the keyword/question settings.

        The dataset is created with ``chunk_method`` (default 'laws'); the
        optional ``embedding_model`` and ``description`` are only sent when
        they are truthy. ``auto_keywords`` and ``auto_questions`` are written
        to the dataset's parser_config in a follow-up update() call.

        Returns:
            dict produced by _dataset_to_dict ('id', 'name', 'chunk_method',
            'parser_config', ...).
        """
        self._log(
            f"📚 Creating dataset: {name} "
            f"(chunk_method={chunk_method}, keywords={auto_keywords}, questions={auto_questions})"
        )

        parser_overrides = {
            'auto_keywords': auto_keywords,
            'auto_questions': auto_questions,
        }

        def _create():
            client = self._get_client()
            create_args = {'name': name, 'chunk_method': chunk_method}
            for key, value in (
                ('embedding_model', embedding_model),
                ('description', description),
            ):
                if value:
                    create_args[key] = value
            created = client.create_dataset(**create_args)

            # parser_config is applied via update() after the dataset exists.
            created.update({'parser_config': parser_overrides})
            return self._dataset_to_dict(created)

        dataset_info = await self._run(_create)
        self._log(f"✅ Dataset created: {dataset_info.get('id')} ({name})")
        return dataset_info

    async def get_dataset_by_name(self, name: str) -> Optional[Dict]:
        """
        Sucht Dataset nach Name. Gibt None zurueck wenn nicht gefunden.
        """
        def _find():
            rag = self._get_client()
            # list_datasets(name=...) hat Permission-Bugs – lokal filtern
            all_datasets = rag.list_datasets(page_size=100)
            for ds in all_datasets:
                if getattr(ds, 'name', None) == name:
                    return self._dataset_to_dict(ds)
            return None

        result = await self._run(_find)
        if result:
            self._log(f"🔍 Dataset found: {result.get('id')} ({name})")
        return result

    async def ensure_dataset(
        self,
        name: str,
        chunk_method: str = 'laws',
        embedding_model: Optional[str] = None,
        description: Optional[str] = None,
        auto_keywords: int = RAGFLOW_AUTO_KEYWORDS,
        auto_questions: int = RAGFLOW_AUTO_QUESTIONS,
    ) -> Dict:
        """
        Get-or-create: return the dataset called ``name``, creating it if absent.

        An existing dataset is returned as-is; the remaining arguments are only
        used when a new dataset has to be created.

        Returns:
            dict with 'id', 'name', etc.
        """
        found = await self.get_dataset_by_name(name)
        if not found:
            return await self.create_dataset(
                name=name,
                chunk_method=chunk_method,
                embedding_model=embedding_model,
                description=description,
                auto_keywords=auto_keywords,
                auto_questions=auto_questions,
            )

        self._log(f"✅ Dataset exists: {found.get('id')} ({name})")
        return found

    async def delete_dataset(self, dataset_id: str) -> None:
        """
        Loescht ein Dataset inklusive aller Dokumente.
        Entspricht xAI delete_collection.
        """
        self._log(f"🗑️  Deleting dataset: {dataset_id}")

        def _delete():
            rag = self._get_client()
            rag.delete_datasets(ids=[dataset_id])

        await self._run(_delete)
        self._log(f"✅ Dataset deleted: {dataset_id}")

    async def list_datasets(self) -> List[Dict]:
        """Listet alle Datasets auf."""
        def _list():
            rag = self._get_client()
            return [self._dataset_to_dict(d) for d in rag.list_datasets()]

        result = await self._run(_list)
        self._log(f"📋 Listed {len(result)} datasets")
        return result

    # ========== Document Management ==========

    async def upload_document(
        self,
        dataset_id: str,
        file_content: bytes,
        filename: str,
        mime_type: str = 'application/octet-stream',
        blake3_hash: Optional[str] = None,
        espocrm_id: Optional[str] = None,
        description: Optional[str] = None,
        advoware_art: Optional[str] = None,
        advoware_bemerkung: Optional[str] = None,
    ) -> Dict:
        """
        Laedt ein Dokument in ein Dataset hoch.

        Ablauf (3 Schritte):
          1. upload_documents()         – Datei hochladen
          2. doc.update(meta_fields)    – Metadaten setzen inkl. blake3_hash
          3. async_parse_documents()    – Parsing mit chunk_method=laws starten

        Meta-Felder die gesetzt werden:
          - blake3_hash        (fuer Change Detection, entspricht xAI BLAKE3)
          - espocrm_id         (Rueckreferenz zu EspoCRM CDokument)
          - description        (Dokumentbeschreibung)
          - advoware_art       (Advoware Dokumenten-Art)
          - advoware_bemerkung (Advoware Bemerkung/Notiz)

        Returns:
            dict mit 'id', 'name', 'run', 'meta_fields', etc.
        """
        if mime_type == 'application/octet-stream' and filename.lower().endswith('.pdf'):
            mime_type = 'application/pdf'

        self._log(
            f"📤 Uploading {len(file_content)} bytes to dataset {dataset_id}: "
            f"{filename} ({mime_type})"
        )

        def _upload_and_tag():
            rag = self._get_client()
            datasets = rag.list_datasets(id=dataset_id)
            if not datasets:
                raise RuntimeError(f"Dataset not found: {dataset_id}")
            dataset = datasets[0]

            # Schritt 1: Upload
            dataset.upload_documents([{
                'display_name': filename,
                'blob': file_content,
            }])

            # Dokument-ID ermitteln (neuestes mit passendem Namen)
            base_name = filename.split('/')[-1]
            docs = dataset.list_documents(keywords=base_name, page_size=10)
            doc = None
            for d in docs:
                if d.name == filename or d.name == base_name:
                    doc = d
                    break
            if doc is None and docs:
                doc = docs[0]  # Fallback
            if doc is None:
                raise RuntimeError(f"Document not found after upload: {filename}")

            # Schritt 2: Meta-Fields setzen
            meta: Dict[str, str] = {}
            if blake3_hash:
                meta['blake3_hash'] = blake3_hash
            if espocrm_id:
                meta['espocrm_id'] = espocrm_id
            if description:
                meta['description'] = description
            if advoware_art:
                meta['advoware_art'] = advoware_art
            if advoware_bemerkung:
                meta['advoware_bemerkung'] = advoware_bemerkung

            if meta:
                doc.update({'meta_fields': meta})

            # Schritt 3: Parsing starten
            dataset.async_parse_documents([doc.id])

            return self._document_to_dict(doc)

        result = await self._run(_upload_and_tag)
        self._log(
            f"✅ Document uploaded & parsing started: {result.get('id')} ({filename})"
        )
        return result

    async def update_document_meta(
        self,
        dataset_id: str,
        doc_id: str,
        blake3_hash: Optional[str] = None,
        description: Optional[str] = None,
        advoware_art: Optional[str] = None,
        advoware_bemerkung: Optional[str] = None,
        espocrm_id: Optional[str] = None,
    ) -> None:
        """
        Update only the metadata of a document (no re-upload) and re-parse it.

        The document's current meta_fields are read, the given values are
        merged on top, and the merged dict is written back. Parsing is then
        restarted so that the chunks pick up the updated metadata.

        Args:
            dataset_id: Dataset that contains the document.
            doc_id: Document to update.
            blake3_hash, description, advoware_art, advoware_bemerkung,
            espocrm_id: New values. ``None`` leaves the stored value
                untouched; any other value (including an empty string)
                overwrites it. ``espocrm_id`` mirrors the field that
                upload_document() sets.

        Raises:
            RuntimeError: If the dataset or the document does not exist.
        """
        self._log(f"✏️  Updating metadata for document {doc_id}")

        def _update():
            rag = self._get_client()
            datasets = rag.list_datasets(id=dataset_id)
            if not datasets:
                raise RuntimeError(f"Dataset not found: {dataset_id}")
            dataset = datasets[0]
            docs = dataset.list_documents(id=doc_id)
            if not docs:
                raise RuntimeError(f"Document not found: {doc_id}")
            doc = docs[0]

            # Read the existing meta_fields and merge. getattr() matches the
            # tolerant access in _document_to_dict for SDK objects that carry
            # no meta_fields attribute.
            merged_meta = _base_to_dict(getattr(doc, 'meta_fields', None)) or {}
            changes = (
                ('blake3_hash', blake3_hash),
                ('espocrm_id', espocrm_id),
                ('description', description),
                ('advoware_art', advoware_art),
                ('advoware_bemerkung', advoware_bemerkung),
            )
            for key, value in changes:
                if value is not None:
                    merged_meta[key] = value

            doc.update({'meta_fields': merged_meta})
            # Re-parsing is needed so the chunks contain the updated metadata.
            dataset.async_parse_documents([doc.id])

        await self._run(_update)
        self._log(f"✅ Metadata updated and re-parsing started: {doc_id}")

    async def remove_document(self, dataset_id: str, doc_id: str) -> None:
        """
        Loescht ein Dokument aus einem Dataset.
        Entspricht xAI remove_from_collection.
        """
        self._log(f"🗑️  Removing document {doc_id} from dataset {dataset_id}")

        def _delete():
            rag = self._get_client()
            datasets = rag.list_datasets(id=dataset_id)
            if not datasets:
                raise RuntimeError(f"Dataset not found: {dataset_id}")
            datasets[0].delete_documents(ids=[doc_id])

        await self._run(_delete)
        self._log(f"✅ Document removed: {doc_id}")

    async def list_documents(self, dataset_id: str) -> List[Dict]:
        """
        Listet alle Dokumente in einem Dataset auf (paginiert).
        Entspricht xAI list_collection_documents.
        """
        self._log(f"📋 Listing documents in dataset {dataset_id}")

        def _list():
            rag = self._get_client()
            datasets = rag.list_datasets(id=dataset_id)
            if not datasets:
                raise RuntimeError(f"Dataset not found: {dataset_id}")
            dataset = datasets[0]
            docs = []
            page = 1
            while True:
                batch = dataset.list_documents(page=page, page_size=100)
                if not batch:
                    break
                docs.extend(batch)
                if len(batch) < 100:
                    break
                page += 1
            return [self._document_to_dict(d) for d in docs]

        result = await self._run(_list)
        self._log(f"✅ Listed {len(result)} documents")
        return result

    async def get_document(self, dataset_id: str, doc_id: str) -> Optional[Dict]:
        """Holt ein einzelnes Dokument by ID. None wenn nicht gefunden."""
        def _get():
            rag = self._get_client()
            datasets = rag.list_datasets(id=dataset_id)
            if not datasets:
                return None
            docs = datasets[0].list_documents(id=doc_id)
            if not docs:
                return None
            return self._document_to_dict(docs[0])

        result = await self._run(_get)
        if result:
            self._log(f"📄 Document found: {result.get('name')} (run={result.get('run')})")
        return result

    async def wait_for_parsing(
        self,
        dataset_id: str,
        doc_id: str,
        timeout_seconds: int = 120,
        poll_interval: float = 3.0,
    ) -> Dict:
        """
        Wartet bis das Parsing eines Dokuments abgeschlossen ist.

        Returns:
            Aktueller Dokument-State als dict.

        Raises:
            TimeoutError: Wenn Parsing nicht innerhalb timeout_seconds fertig wird.
            RuntimeError: Wenn Parsing fehlschlaegt.
        """
        self._log(f"⏳ Waiting for parsing: {doc_id} (timeout={timeout_seconds}s)")
        elapsed = 0.0

        while elapsed < timeout_seconds:
            doc = await self.get_document(dataset_id, doc_id)
            if doc is None:
                raise RuntimeError(f"Document disappeared during parsing: {doc_id}")

            run_status = doc.get('run', 'UNSTART')
            if run_status == 'DONE':
                self._log(
                    f"✅ Parsing done: {doc_id} "
                    f"(chunks={doc.get('chunk_count')}, tokens={doc.get('token_count')})"
                )
                return doc
            elif run_status in ('FAIL', 'CANCEL'):
                raise RuntimeError(
                    f"Parsing failed for {doc_id}: status={run_status}, "
                    f"msg={doc.get('progress_msg', '')}"
                )

            await asyncio.sleep(poll_interval)
            elapsed += poll_interval

        raise TimeoutError(
            f"Parsing timeout after {timeout_seconds}s for document {doc_id}"
        )

    # ========== MIME Type Support ==========

    def is_mime_type_supported(self, mime_type: str) -> bool:
        """Prueft ob RAGFlow diesen MIME-Type verarbeiten kann."""
        return mime_type.lower().strip() in self.SUPPORTED_MIME_TYPES

    # ========== Internal Helpers ==========

    def _dataset_to_dict(self, dataset) -> Dict:
        """
        Convert a RAGFlow dataset object into a plain dict.

        Missing attributes fall back to None (0 for the two counters);
        parser_config is unwrapped via _base_to_dict.
        """
        info = {
            attr: getattr(dataset, attr, None)
            for attr in ('id', 'name', 'chunk_method', 'embedding_model', 'description')
        }
        for counter in ('chunk_count', 'document_count'):
            info[counter] = getattr(dataset, counter, 0)
        info['parser_config'] = _base_to_dict(getattr(dataset, 'parser_config', {}))
        return info

    def _document_to_dict(self, doc) -> Dict:
        """
        Convert a RAGFlow document object into a plain dict.

        Attributes missing on ``doc`` are replaced by the defaults listed
        below. meta_fields is unwrapped to a plain dict via _base_to_dict
        (empty dict when the attribute is absent or None); its 'blake3_hash'
        and 'espocrm_id' entries are additionally exposed as top-level keys.
        """
        # (attribute, default) pairs, in the order they appear in the result.
        field_defaults = (
            ('id', None),
            ('name', None),
            ('dataset_id', None),
            ('chunk_method', None),
            ('size', 0),
            ('token_count', 0),
            ('chunk_count', 0),
            ('run', 'UNSTART'),
            ('progress', 0.0),
            ('progress_msg', ''),
            ('source_type', 'local'),
            ('created_by', ''),
            ('process_duration', 0.0),
        )

        meta_source = getattr(doc, 'meta_fields', None)
        meta = {} if meta_source is None else _base_to_dict(meta_source)

        info = {field: getattr(doc, field, default) for field, default in field_defaults}
        # Metadata (blake3_hash lives in here when it has been set)
        info['meta_fields'] = meta
        info['blake3_hash'] = meta.get('blake3_hash')
        info['espocrm_id'] = meta.get('espocrm_id')
        info['parser_config'] = _base_to_dict(getattr(doc, 'parser_config', None))
        return info