"""xAI Files & Collections Service""" import os import asyncio import aiohttp from typing import Optional, List, Dict, Tuple from services.logging_utils import get_service_logger XAI_FILES_URL = "https://api.x.ai" XAI_MANAGEMENT_URL = "https://management-api.x.ai" class XAIService: """ Client für xAI Files API und Collections Management API. Benötigte Umgebungsvariablen: - XAI_API_KEY – regulärer API-Key für File-Uploads (api.x.ai) - XAI_MANAGEMENT_KEY – Management-API-Key für Collection-Operationen (management-api.x.ai) """ def __init__(self, ctx=None): self.api_key = os.getenv('XAI_API_KEY', '') self.management_key = os.getenv('XAI_MANAGEMENT_KEY', '') self.ctx = ctx self.logger = get_service_logger('xai', ctx) self._session: Optional[aiohttp.ClientSession] = None if not self.api_key: raise ValueError("XAI_API_KEY not configured in environment") if not self.management_key: raise ValueError("XAI_MANAGEMENT_KEY not configured in environment") def _log(self, msg: str, level: str = 'info') -> None: """Delegate logging to service logger""" log_func = getattr(self.logger, level, self.logger.info) log_func(msg) async def _get_session(self) -> aiohttp.ClientSession: if self._session is None or self._session.closed: self._session = aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=120) ) return self._session async def close(self) -> None: if self._session and not self._session.closed: await self._session.close() async def upload_file( self, file_content: bytes, filename: str, mime_type: str = 'application/octet-stream' ) -> str: """ Lädt eine Datei zur xAI Files API hoch (multipart/form-data). POST https://api.x.ai/v1/files Returns: xAI file_id (str) Raises: RuntimeError: bei HTTP-Fehler oder fehlendem file_id in der Antwort """ # Normalize MIME type: xAI needs correct Content-Type for proper processing # If generic octet-stream but file is clearly a PDF, fix it if mime_type == 'application/octet-stream' and filename.lower().endswith('.pdf'): mime_type = 'application/pdf' self._log(f"⚠️ Corrected MIME type to application/pdf for {filename}") self._log(f"📤 Uploading {len(file_content)} bytes to xAI: {filename} ({mime_type})") session = await self._get_session() url = f"{XAI_FILES_URL}/v1/files" headers = {"Authorization": f"Bearer {self.api_key}"} # Create multipart form with explicit UTF-8 filename encoding # aiohttp automatically URL-encodes filenames with special chars, # but xAI expects raw UTF-8 in the filename parameter form = aiohttp.FormData(quote_fields=False) form.add_field( 'file', file_content, filename=filename, content_type=mime_type ) # CRITICAL: purpose="file_search" enables proper PDF processing # Without this, xAI throws "internal error" on complex PDFs form.add_field('purpose', 'file_search') async with session.post(url, data=form, headers=headers) as response: try: data = await response.json() except Exception: raw = await response.text() data = {"_raw": raw} if response.status not in (200, 201): raise RuntimeError( f"xAI file upload failed ({response.status}): {data}" ) file_id = data.get('id') or data.get('file_id') if not file_id: raise RuntimeError( f"No file_id in xAI upload response: {data}" ) self._log(f"✅ xAI file uploaded: {file_id}") return file_id async def add_to_collection(self, collection_id: str, file_id: str) -> None: """ Fügt eine Datei einer xAI-Collection (Vector Store) hinzu. POST https://api.x.ai/v1/vector_stores/{vector_store_id}/files Uses the OpenAI-compatible API pattern for adding files to vector stores. This triggers proper indexing and processing. Raises: RuntimeError: bei HTTP-Fehler """ self._log(f"📚 Adding file {file_id} to collection {collection_id}") session = await self._get_session() # Use the OpenAI-compatible endpoint (not management API) url = f"{XAI_FILES_URL}/v1/vector_stores/{collection_id}/files" headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", } payload = {"file_id": file_id} async with session.post(url, json=payload, headers=headers) as response: if response.status not in (200, 201): raw = await response.text() raise RuntimeError( f"Failed to add file to collection {collection_id} ({response.status}): {raw}" ) self._log(f"✅ File {file_id} added to collection {collection_id}") async def remove_from_collection(self, collection_id: str, file_id: str) -> None: """ Entfernt eine Datei aus einer xAI-Collection. Die Datei selbst wird NICHT gelöscht – sie kann in anderen Collections sein. DELETE https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id} Raises: RuntimeError: bei HTTP-Fehler """ self._log(f"🗑️ Removing file {file_id} from collection {collection_id}") session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents/{file_id}" headers = {"Authorization": f"Bearer {self.management_key}"} async with session.delete(url, headers=headers) as response: if response.status not in (200, 204): raw = await response.text() raise RuntimeError( f"Failed to remove file from collection {collection_id} ({response.status}): {raw}" ) self._log(f"✅ File {file_id} removed from collection {collection_id}") async def add_to_collections(self, collection_ids: List[str], file_id: str) -> List[str]: """ Fügt eine Datei zu mehreren Collections hinzu. Returns: Liste der erfolgreich hinzugefügten Collection-IDs """ added = [] for collection_id in collection_ids: try: await self.add_to_collection(collection_id, file_id) added.append(collection_id) except Exception as e: self._log( f"⚠️ Fehler beim Hinzufügen zu Collection {collection_id}: {e}", level='warn' ) return added async def remove_from_collections(self, collection_ids: List[str], file_id: str) -> None: """Entfernt eine Datei aus mehreren Collections (ignoriert Fehler pro Collection).""" for collection_id in collection_ids: try: await self.remove_from_collection(collection_id, file_id) except Exception as e: self._log( f"⚠️ Fehler beim Entfernen aus Collection {collection_id}: {e}", level='warn' ) # ========== Collection Management ========== async def create_collection( self, name: str, metadata: Optional[Dict[str, str]] = None, field_definitions: Optional[List[Dict]] = None ) -> Dict: """ Erstellt eine neue xAI Collection. POST https://management-api.x.ai/v1/collections Args: name: Collection name metadata: Optional metadata dict field_definitions: Optional field definitions for metadata fields Returns: Collection object mit 'id' field Raises: RuntimeError: bei HTTP-Fehler """ self._log(f"📚 Creating collection: {name}") # Standard field definitions für document metadata if field_definitions is None: field_definitions = [ {"key": "document_name", "inject_into_chunk": True}, {"key": "description", "inject_into_chunk": True}, {"key": "created_at", "inject_into_chunk": False}, {"key": "modified_at", "inject_into_chunk": False}, {"key": "espocrm_id", "inject_into_chunk": False} ] session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections" headers = { "Authorization": f"Bearer {self.management_key}", "Content-Type": "application/json" } body = { "collection_name": name, "field_definitions": field_definitions } # Add metadata if provided if metadata: body["metadata"] = metadata async with session.post(url, json=body, headers=headers) as response: if response.status not in (200, 201): raw = await response.text() raise RuntimeError( f"Failed to create collection ({response.status}): {raw}" ) data = await response.json() # API returns 'collection_id' not 'id' collection_id = data.get('collection_id') or data.get('id') self._log(f"✅ Collection created: {collection_id}") return data async def get_collection(self, collection_id: str) -> Optional[Dict]: """ Holt Collection-Details. GET https://management-api.x.ai/v1/collections/{collection_id} Returns: Collection object or None if not found Raises: RuntimeError: bei HTTP-Fehler (außer 404) """ self._log(f"📄 Getting collection: {collection_id}") session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}" headers = {"Authorization": f"Bearer {self.management_key}"} async with session.get(url, headers=headers) as response: if response.status == 404: self._log(f"⚠️ Collection not found: {collection_id}", level='warn') return None if response.status not in (200,): raw = await response.text() raise RuntimeError( f"Failed to get collection ({response.status}): {raw}" ) data = await response.json() self._log(f"✅ Collection retrieved: {data.get('collection_name', 'N/A')}") return data async def delete_collection(self, collection_id: str) -> None: """ Löscht eine XAI Collection. DELETE https://management-api.x.ai/v1/collections/{collection_id} NOTE: Documents in der Collection werden NICHT gelöscht! Sie können noch in anderen Collections sein. Raises: RuntimeError: bei HTTP-Fehler """ self._log(f"🗑️ Deleting collection {collection_id}") session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}" headers = {"Authorization": f"Bearer {self.management_key}"} async with session.delete(url, headers=headers) as response: if response.status not in (200, 204): raw = await response.text() raise RuntimeError( f"Failed to delete collection {collection_id} ({response.status}): {raw}" ) self._log(f"✅ Collection deleted: {collection_id}") async def list_collection_documents(self, collection_id: str) -> List[Dict]: """ Listet alle Dokumente in einer Collection. GET https://management-api.x.ai/v1/collections/{collection_id}/documents Returns: List von normalized document objects: [ { 'file_id': 'file_...', 'filename': 'doc.pdf', 'blake3_hash': 'hex_string', # Plain hex, kein prefix 'size_bytes': 12345, 'content_type': 'application/pdf', 'fields': {}, # Custom metadata 'status': 'DOCUMENT_STATUS_...' } ] Raises: RuntimeError: bei HTTP-Fehler """ self._log(f"📋 Listing documents in collection {collection_id}") session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents" headers = {"Authorization": f"Bearer {self.management_key}"} async with session.get(url, headers=headers) as response: if response.status not in (200,): raw = await response.text() raise RuntimeError( f"Failed to list documents ({response.status}): {raw}" ) data = await response.json() # API gibt Liste zurück oder dict mit 'documents' key if isinstance(data, list): raw_documents = data elif isinstance(data, dict) and 'documents' in data: raw_documents = data['documents'] else: raw_documents = [] # Normalize nested structure: file_metadata -> top-level normalized = [] for doc in raw_documents: file_meta = doc.get('file_metadata', {}) normalized.append({ 'file_id': file_meta.get('file_id'), 'filename': file_meta.get('name'), 'blake3_hash': file_meta.get('hash'), # Plain hex string 'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0, 'content_type': file_meta.get('content_type'), 'created_at': file_meta.get('created_at'), 'fields': doc.get('fields', {}), 'status': doc.get('status') }) self._log(f"✅ Listed {len(normalized)} documents") return normalized async def get_collection_document(self, collection_id: str, file_id: str) -> Optional[Dict]: """ Holt Dokument-Details aus einer XAI Collection. GET https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id} Returns: Normalized dict mit document info: { 'file_id': 'file_xyz', 'filename': 'document.pdf', 'blake3_hash': 'hex_string', # Plain hex, kein prefix 'size_bytes': 12345, 'content_type': 'application/pdf', 'fields': {...} # Custom metadata } Returns None if not found. """ self._log(f"📄 Getting document {file_id} from collection {collection_id}") session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents/{file_id}" headers = {"Authorization": f"Bearer {self.management_key}"} async with session.get(url, headers=headers) as response: if response.status == 404: return None if response.status not in (200,): raw = await response.text() raise RuntimeError( f"Failed to get document from collection ({response.status}): {raw}" ) data = await response.json() # Normalize nested structure file_meta = data.get('file_metadata', {}) normalized = { 'file_id': file_meta.get('file_id'), 'filename': file_meta.get('name'), 'blake3_hash': file_meta.get('hash'), # Plain hex 'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0, 'content_type': file_meta.get('content_type'), 'created_at': file_meta.get('created_at'), 'fields': data.get('fields', {}), 'status': data.get('status') } self._log(f"✅ Document info retrieved: {normalized.get('filename', 'N/A')}") return normalized async def update_document_metadata( self, collection_id: str, file_id: str, metadata: Dict[str, str] ) -> None: """ Aktualisiert nur Metadaten eines Documents (kein File-Upload). PATCH https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id} Args: collection_id: XAI Collection ID file_id: XAI file_id metadata: Updated metadata fields Raises: RuntimeError: bei HTTP-Fehler """ self._log(f"📝 Updating metadata for document {file_id}") session = await self._get_session() url = f"{XAI_MANAGEMENT_URL}/v1/collections/{collection_id}/documents/{file_id}" headers = { "Authorization": f"Bearer {self.management_key}", "Content-Type": "application/json" } body = {"fields": metadata} async with session.patch(url, json=body, headers=headers) as response: if response.status not in (200, 204): raw = await response.text() raise RuntimeError( f"Failed to update document metadata ({response.status}): {raw}" ) self._log(f"✅ Metadata updated for {file_id}") def is_mime_type_supported(self, mime_type: str) -> bool: """ Prüft, ob XAI diesen MIME-Type unterstützt. Args: mime_type: MIME type string Returns: True wenn unterstützt, False sonst """ # Liste der unterstützten MIME-Types basierend auf XAI Dokumentation supported_types = { # Documents 'application/pdf', 'application/msword', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.oasis.opendocument.text', 'application/epub+zip', 'application/vnd.openxmlformats-officedocument.presentationml.presentation', # Text 'text/plain', 'text/html', 'text/markdown', 'text/csv', 'text/xml', # Code 'text/javascript', 'application/json', 'application/xml', 'text/x-python', 'text/x-java-source', 'text/x-c', 'text/x-c++src', # Other 'application/zip', } # Normalisiere MIME-Type (lowercase, strip whitespace) normalized = mime_type.lower().strip() return normalized in supported_types