feat: Enhance document synchronization by integrating CAIKnowledge handling and improving error logging

This commit is contained in:
bsiggel
2026-03-12 22:30:11 +00:00
parent 8ed7cca432
commit 6bf2343a12
6 changed files with 492 additions and 362 deletions

View File

@@ -236,7 +236,8 @@ class XAIService:
data = await response.json()
collection_id = data.get('id')
# API returns 'collection_id' not 'id'
collection_id = data.get('collection_id') or data.get('id')
self._log(f"✅ Collection created: {collection_id}")
return data
@@ -308,7 +309,18 @@ class XAIService:
GET https://management-api.x.ai/v1/collections/{collection_id}/documents
Returns:
List von document objects mit file_id, filename, hash, fields
List von normalized document objects:
[
{
'file_id': 'file_...',
'filename': 'doc.pdf',
'blake3_hash': 'hex_string', # Plain hex, kein prefix
'size_bytes': 12345,
'content_type': 'application/pdf',
'created_at': '...',  # Passed through from file_metadata.created_at
'fields': {}, # Custom metadata
'status': 'DOCUMENT_STATUS_...'
}
]
Raises:
RuntimeError: bei HTTP-Fehler
@@ -328,16 +340,31 @@ class XAIService:
data = await response.json()
# API sollte eine Liste zurückgeben oder ein dict mit 'documents' key
# API gibt Liste zurück oder dict mit 'documents' key
if isinstance(data, list):
documents = data
raw_documents = data
elif isinstance(data, dict) and 'documents' in data:
documents = data['documents']
raw_documents = data['documents']
else:
documents = []
raw_documents = []
self._log(f"✅ Listed {len(documents)} documents")
return documents
# Normalize nested structure: file_metadata -> top-level
normalized = []
for doc in raw_documents:
file_meta = doc.get('file_metadata', {})
normalized.append({
'file_id': file_meta.get('file_id'),
'filename': file_meta.get('name'),
'blake3_hash': file_meta.get('hash'), # Plain hex string
'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
'content_type': file_meta.get('content_type'),
'created_at': file_meta.get('created_at'),
'fields': doc.get('fields', {}),
'status': doc.get('status')
})
self._log(f"✅ Listed {len(normalized)} documents")
return normalized
async def get_collection_document(self, collection_id: str, file_id: str) -> Optional[Dict]:
"""
@@ -346,12 +373,14 @@ class XAIService:
GET https://management-api.x.ai/v1/collections/{collection_id}/documents/{file_id}
Returns:
Dict mit document info including BLAKE3 hash:
Normalized dict mit document info:
{
'file_id': 'file_xyz',
'filename': 'document.pdf',
'hash': 'blake3:abcd1234...', # BLAKE3 Hash!
'fields': {...} # Metadata
'blake3_hash': 'hex_string', # Plain hex, kein prefix
'size_bytes': 12345,
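'content_type': 'application/pdf',
'created_at': '...',  # Passed through from file_metadata.created_at
'fields': {...},  # Custom metadata
'status': 'DOCUMENT_STATUS_...'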
}
Returns None if not found.
@@ -374,8 +403,21 @@ class XAIService:
data = await response.json()
self._log(f"✅ Document info retrieved: {data.get('filename', 'N/A')}")
return data
# Normalize nested structure
file_meta = data.get('file_metadata', {})
normalized = {
'file_id': file_meta.get('file_id'),
'filename': file_meta.get('name'),
'blake3_hash': file_meta.get('hash'), # Plain hex
'size_bytes': int(file_meta.get('size_bytes', 0)) if file_meta.get('size_bytes') else 0,
'content_type': file_meta.get('content_type'),
'created_at': file_meta.get('created_at'),
'fields': data.get('fields', {}),
'status': data.get('status')
}
self._log(f"✅ Document info retrieved: {normalized.get('filename', 'N/A')}")
return normalized
async def update_document_metadata(
self,