feat: Enhance document synchronization by integrating CAIKnowledge handling and improving error logging

2026-03-12 22:30:11 +00:00
parent 8ed7cca432
commit 6bf2343a12
6 changed files with 492 additions and 362 deletions
--- a/services/xai_service.py
+++ b/services/xai_service.py
@@ -236,7 +236,8 @@ class XAIService:

            data = await response.json()

-        collection_id = data.get('id')
+        # API returns 'collection_id' (not 'id'); 'id' is kept as a fallback
+        collection_id = data.get('collection_id') or data.get('id')
        self._log(f"✅ Collection created: {collection_id}")
        return data

@@ -308,7 +309,18 @@ class XAIService:
        GET https://management-api.x.ai/v1/collections/{collection_id}/documents

        Returns:
-            List von document objects mit file_id, filename, hash, fields
+            List of normalized document objects:
+            [
+                {
+                    'file_id': 'file_...',
+                    'filename': 'doc.pdf',
+                    'blake3_hash': 'hex_string',  # Plain hex, no prefix
+                    'size_bytes': 12345,
+                    'content_type': 'application/pdf',
+                    'fields': {},  # Custom metadata
+                    'status': 'DOCUMENT_STATUS_...'
+                }
+            ]

        Raises:
            RuntimeError: bei HTTP-Fehler
@@ -328,16 +340,31 @@ class XAIService:

            data = await response.json()

-        # API sollte eine Liste zurückgeben oder ein dict mit 'documents' key
+        # API returns either a list or a dict with a 'documents' key
        if isinstance(data, list):
-            documents = data
+            raw_documents = data
        elif isinstance(data, dict) and 'documents' in data:
-            documents = data['documents']
+            raw_documents = data['documents']
        else:
-            documents = []
+            raw_documents = []

-        self._log(f"✅ Listed {len(documents)} documents")
-        return documents
+        # Normalize nested structure: file_metadata -> top-level
+        normalized = []
+        for doc in raw_documents:
+            file_meta = doc.get('file_metadata', {})
+            normalized.append({
+                'file_id': file_meta.get('file_id'),
+                'filename': file_meta.get('name'),
+                'blake3_hash': file_meta.get('hash'),  # Plain hex string
+                'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
+                'content_type': file_meta.get('content_type'),
+                'created_at': file_meta.get('created_at'),
+                'fields': doc.get('fields', {}),
+                'status': doc.get('status')
+            })
+
+        self._log(f"✅ Listed {len(normalized)} documents")
+        return normalized

    async def get_collection_document(self, collection_id: str, file_id: str) -> Optional[Dict]:
        """
@@ -346,12 +373,14 @@ class XAIService:
        GET https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id}

        Returns:
-            Dict mit document info including BLAKE3 hash:
+            Normalized dict with document info:
            {
                'file_id': 'file_xyz',
                'filename': 'document.pdf',
-                'hash': 'blake3:abcd1234...',  # BLAKE3 Hash!
-                'fields': {...}  # Metadata
+                'blake3_hash': 'hex_string',  # Plain hex, no prefix
+                'size_bytes': 12345,
+                'content_type': 'application/pdf',
+                'fields': {...}  # Custom metadata
            }

        Returns None if not found.
@@ -374,8 +403,21 @@ class XAIService:

            data = await response.json()

-        self._log(f"✅ Document info retrieved: {data.get('filename', 'N/A')}")
-        return data
+        # Normalize nested structure
+        file_meta = data.get('file_metadata', {})
+        normalized = {
+            'file_id': file_meta.get('file_id'),
+            'filename': file_meta.get('name'),
+            'blake3_hash': file_meta.get('hash'),  # Plain hex
+            'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
+            'content_type': file_meta.get('content_type'),
+            'created_at': file_meta.get('created_at'),
+            'fields': data.get('fields', {}),
+            'status': data.get('status')
+        }
+
+        self._log(f"✅ Document info retrieved: {normalized.get('filename', 'N/A')}")
+        return normalized

    async def update_document_metadata(
        self,