- Enhanced the AI Chat Completions API to support true streaming using async generators and proper SSE headers. - Updated endpoint paths to align with OpenAI's API versioning. - Improved logging for request details and error handling. - Added a new AI Models List API to return available models compatible with chat completions. - Refactored code for better readability and maintainability, including the extraction of common functionalities. - Introduced a VMH-specific Chat Completions API with similar features and structure.
281 lines
11 KiB
Python
"""AI Chat Completions API
|
||
Universal OpenAI-compatible Chat Completions API with xAI/LangChain Backend.
|
||
Features:
|
||
- File Search (RAG) via xAI Collections
|
||
- Web Search via xAI web_search tool
|
||
- Aktenzeichen-based automatic collection lookup
|
||
- **Echtes Streaming** (async generator + proper SSE headers)
|
||
- Multiple tools simultaneously
|
||
"""
|
||
import json
|
||
import time
|
||
from typing import Any, Dict, List, Optional
|
||
from motia import FlowContext, http, ApiRequest, ApiResponse
|
||
|
||
# Motia step configuration: registers this handler as an OpenAI-compatible
# Chat Completions endpoint.
config = {
    "name": "AI Chat Completions API",
    "description": "Universal OpenAI-compatible Chat Completions API with xAI backend, RAG, and web search",
    # Flow grouping; presumably used by Motia for tracing/observability — confirm.
    "flows": ["ai-general"],
    # Two POST routes so clients can target either the /ai/v1 or the bare /v1
    # prefix with the same handler.
    "triggers": [
        http("POST", "/ai/v1/chat/completions"),
        http("POST", "/v1/chat/completions")
    ],
}
async def handler(request: ApiRequest, ctx: FlowContext[Any]) -> ApiResponse:
    """OpenAI-compatible Chat Completions endpoint with real SSE streaming.

    Resolves the xAI collection either from explicit request fields
    (``collection_id`` / ``custom_collection_id`` / ``extra_body``) or by
    extracting an Aktenzeichen (case reference) from a user message, then
    answers via the LangChain/xAI backend — streamed or non-streamed.

    Args:
        request: Incoming HTTP request; body must be an OpenAI-style JSON object.
        ctx: Motia flow context (logger, traceId, ...).

    Returns:
        ApiResponse carrying either an async-generator SSE body (stream=True)
        or a ``chat.completion`` JSON object; 400 on invalid input, 500 on
        unexpected errors.
    """
    ctx.logger.info("=" * 80)
    ctx.logger.info("🤖 AI CHAT COMPLETIONS API – OPTIMIZED")
    ctx.logger.info("=" * 80)

    # Log request details; the Authorization header is masked.
    ctx.logger.info("📥 REQUEST DETAILS:")
    if request.headers:
        ctx.logger.info(" Headers:")
        for header_name, header_value in request.headers.items():
            if header_name.lower() == 'authorization':
                ctx.logger.info(f" {header_name}: Bearer ***MASKED***")
            else:
                ctx.logger.info(f" {header_name}: {header_value}")

    try:
        # Parse body
        body = request.body or {}
        if not isinstance(body, dict):
            return ApiResponse(status=400, body={'error': 'Request body must be JSON object'})

        # Extract parameters
        model_name = body.get('model', 'grok-4.20-beta-0309-reasoning')
        messages = body.get('messages', [])
        temperature = body.get('temperature', 0.7)
        max_tokens = body.get('max_tokens')
        stream = body.get('stream', False)
        extra_body = body.get('extra_body', {})

        # Fail fast on a malformed 'messages' field instead of raising a 500
        # deep inside the loops below.
        if not isinstance(messages, list):
            return ApiResponse(status=400, body={'error': "'messages' must be a list"})

        # Flags may arrive top-level or nested in extra_body (OpenAI SDK style).
        enable_web_search = body.get('enable_web_search', extra_body.get('enable_web_search', False))
        web_search_config = body.get('web_search_config', extra_body.get('web_search_config', {}))

        ctx.logger.info(f"📋 Model: {model_name} | Stream: {stream} | Web Search: {enable_web_search}")

        # Log messages (short preview). Content may be a plain string or a
        # multimodal list of parts, so coerce to str before slicing.
        ctx.logger.info("📨 MESSAGES:")
        for i, msg in enumerate(messages, 1):
            raw_content = msg.get('content', '')
            text = raw_content if isinstance(raw_content, str) else str(raw_content)
            preview = (text[:120] + "...") if len(text) > 120 else text
            ctx.logger.info(f" [{i}] {msg.get('role')}: {preview}")

        # === Collection + Aktenzeichen logic ===
        collection_id: Optional[str] = None
        aktenzeichen: Optional[str] = None

        if 'collection_id' in body:
            collection_id = body['collection_id']
        elif 'custom_collection_id' in body:
            collection_id = body['custom_collection_id']
        elif 'collection_id' in extra_body:
            collection_id = extra_body['collection_id']
        else:
            # Fallback: extract an Aktenzeichen from the user messages and
            # look up the matching collection. Import hoisted out of the loop
            # so it runs once, not per message.
            from services.aktenzeichen_utils import extract_aktenzeichen, normalize_aktenzeichen, remove_aktenzeichen
            for msg in messages:
                if msg.get('role') == 'user':
                    content = msg.get('content', '')
                    aktenzeichen_raw = extract_aktenzeichen(content)
                    if aktenzeichen_raw:
                        aktenzeichen = normalize_aktenzeichen(aktenzeichen_raw)
                        collection_id = await lookup_collection_by_aktenzeichen(aktenzeichen, ctx)
                        if collection_id:
                            # Strip the Aktenzeichen so the model does not see it.
                            msg['content'] = remove_aktenzeichen(content)
                            # NOTE(review): loop continues to later user messages
                            # when no collection was found — confirm intended.
                            break

        if not collection_id and not enable_web_search:
            return ApiResponse(
                status=400,
                body={'error': 'collection_id or web_search required'}
            )

        # === Initialize service ===
        from services.langchain_xai_service import LangChainXAIService
        langchain_service = LangChainXAIService(ctx)

        model = langchain_service.get_chat_model(
            model=model_name,
            temperature=temperature,
            max_tokens=max_tokens
        )

        model_with_tools = langchain_service.bind_tools(
            model=model,
            collection_id=collection_id,
            enable_web_search=enable_web_search,
            web_search_config=web_search_config,
            max_num_results=10
        )

        # Stable id per trace when available, otherwise timestamp-based.
        completion_id = f"chatcmpl-{ctx.traceId[:12]}" if hasattr(ctx, 'traceId') else f"chatcmpl-{int(time.time())}"
        created_ts = int(time.time())

        # ====================== REAL STREAMING ======================
        if stream:
            ctx.logger.info("🌊 Starting REAL SSE streaming (async generator)...")

            headers = {
                "Content-Type": "text/event-stream",
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",  # nginx / proxies: disable buffering
                "Transfer-Encoding": "chunked",
            }

            async def sse_generator():
                # Initial empty-delta chunk (some clients require it).
                yield f'data: {json.dumps({"id": completion_id, "object": "chat.completion.chunk", "created": created_ts, "model": model_name, "choices": [{"index": 0, "delta": {}, "finish_reason": None}]}, ensure_ascii=False)}\n\n'

                chunk_count = 0
                async for chunk in langchain_service.astream_chat(model_with_tools, messages):
                    delta = ""
                    if hasattr(chunk, "content"):
                        content = chunk.content
                        if isinstance(content, str):
                            delta = content
                        elif isinstance(content, list):
                            # xAI structured output: list of typed parts.
                            text_parts = [item.get('text', '') for item in content if isinstance(item, dict) and item.get('type') == 'text']
                            delta = ''.join(text_parts)

                    if delta:
                        chunk_count += 1
                        data = {
                            "id": completion_id,
                            "object": "chat.completion.chunk",
                            "created": created_ts,
                            "model": model_name,
                            "choices": [{
                                "index": 0,
                                "delta": {"content": delta},
                                "finish_reason": None
                            }]
                        }
                        yield f'data: {json.dumps(data, ensure_ascii=False)}\n\n'

                # Final chunk with finish_reason, then the SSE DONE sentinel.
                finish = {
                    "id": completion_id,
                    "object": "chat.completion.chunk",
                    "created": created_ts,
                    "model": model_name,
                    "choices": [{
                        "index": 0,
                        "delta": {},
                        "finish_reason": "stop"
                    }]
                }
                yield f'data: {json.dumps(finish, ensure_ascii=False)}\n\n'
                yield "data: [DONE]\n\n"

                ctx.logger.info(f"✅ Streaming abgeschlossen – {chunk_count} Chunks gesendet")

            return ApiResponse(
                status=200,
                headers=headers,
                body=sse_generator()  # async generator = true streaming
            )

        # ====================== NON-STREAMING ======================
        else:
            return await handle_non_streaming_response(
                model_with_tools=model_with_tools,
                messages=messages,
                completion_id=completion_id,
                created_ts=created_ts,
                model_name=model_name,
                langchain_service=langchain_service,
                ctx=ctx
            )

    except Exception as e:
        ctx.logger.error(f"❌ ERROR: {e}", exc_info=True)
        return ApiResponse(
            status=500,
            body={'error': 'Internal server error', 'message': str(e)}
        )
async def handle_non_streaming_response(
    model_with_tools,
    messages: List[Dict[str, Any]],
    completion_id: str,
    created_ts: int,
    model_name: str,
    langchain_service,
    ctx: FlowContext
) -> ApiResponse:
    """Non-streaming handler: single invoke, one chat.completion payload.

    Args:
        model_with_tools: Chat model with tools already bound.
        messages: OpenAI-style message dicts.
        completion_id: Precomputed ``chatcmpl-...`` id.
        created_ts: Unix timestamp for the response.
        model_name: Model name echoed back in the payload.
        langchain_service: Service exposing ``invoke_chat``.
        ctx: Motia flow context (logging).

    Returns:
        ApiResponse with an OpenAI ``chat.completion`` body.

    Raises:
        Re-raises any backend exception after logging (caller maps it to 500).
    """
    try:
        result = await langchain_service.invoke_chat(model_with_tools, messages)

        # Extract content (compatible with xAI structured output: content may
        # be a list of typed parts instead of a plain string).
        if hasattr(result, 'content'):
            raw = result.content
            if isinstance(raw, list):
                text_parts = [item.get('text', '') for item in raw if isinstance(item, dict) and item.get('type') == 'text']
                content = ''.join(text_parts) or str(raw)
            else:
                content = raw
        else:
            content = str(result)
        # Guarantee a string for both the payload and the length log below.
        if not isinstance(content, str):
            content = str(content)

        # Usage, when available. LangChain's AIMessage.usage_metadata is a
        # dict (UsageMetadata TypedDict), so getattr() would always return
        # the default — support dict access first, attributes as fallback.
        prompt_tokens = 0
        completion_tokens = 0
        u = getattr(result, 'usage_metadata', None)
        if isinstance(u, dict):
            prompt_tokens = u.get('input_tokens', 0)
            completion_tokens = u.get('output_tokens', 0)
        elif u is not None:
            prompt_tokens = getattr(u, 'input_tokens', 0)
            completion_tokens = getattr(u, 'output_tokens', 0)
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }

        response_body = {
            'id': completion_id,
            'object': 'chat.completion',
            'created': created_ts,
            'model': model_name,
            'choices': [{
                'index': 0,
                'message': {'role': 'assistant', 'content': content},
                'finish_reason': 'stop'
            }],
            'usage': usage
        }

        ctx.logger.info(f"✅ Non-streaming fertig – {len(content)} Zeichen")
        return ApiResponse(status=200, body=response_body)

    except Exception as e:
        ctx.logger.error(f"❌ Non-streaming failed: {e}")
        raise
async def lookup_collection_by_aktenzeichen(aktenzeichen: str, ctx: FlowContext) -> Optional[str]:
    """Resolve an Aktenzeichen (case reference) to its xAI collection id.

    Searches EspoCRM for the matching 'Raeumungsklage' entity and returns its
    'xaiCollectionId'. Best-effort: returns None when nothing matches or the
    lookup fails for any reason.
    """
    try:
        from services.espocrm import EspoCRMAPI

        crm = EspoCRMAPI(ctx)
        ctx.logger.info(f"🔍 Suche Räumungsklage für Aktenzeichen: {aktenzeichen}")

        matches = await crm.search_entities(
            entity_type='Raeumungsklage',
            where=[{'type': 'equals', 'attribute': 'advowareAkteBezeichner', 'value': aktenzeichen}],
            select=['id', 'xaiCollectionId'],
            maxSize=1
        )

        # Guard clauses: no hit, or hit without a collection id → None.
        if not matches:
            return None
        found_id = matches[0].get('xaiCollectionId')
        if not found_id:
            return None

        ctx.logger.info(f"✅ Collection gefunden: {found_id}")
        return found_id
    except Exception as e:
        # Deliberate best-effort: swallow and signal "not found".
        ctx.logger.error(f"❌ Lookup failed: {e}")
        return None