Add AGENTS.md documentation for AI agent guidance

2026-02-23 09:59:52 -05:00
commit 2e2b817435
21 changed files with 2513 additions and 0 deletions

app/routers/__init__.py (new file, +5)

@@ -0,0 +1,5 @@
"""API routers for watsonx-openai-proxy."""
from app.routers import chat, completions, embeddings, models
__all__ = ["chat", "completions", "embeddings", "models"]
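
These routers are re-exported so the application entry point can mount them in one place. A minimal wiring sketch follows; app/main.py is not among the files shown here, so its name and contents are assumptions:

from fastapi import FastAPI

from app.routers import chat, completions, embeddings, models

# Hypothetical app/main.py: each router already declares its own /v1/...
# paths, so no prefix argument is needed when mounting.
app = FastAPI(title="watsonx-openai-proxy")
app.include_router(chat.router)
app.include_router(completions.router)
app.include_router(embeddings.router)
app.include_router(models.router)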

app/routers/chat.py (new file, +156)

@@ -0,0 +1,156 @@
"""Chat completions endpoint router."""
import json
import uuid
from typing import Union
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse
from app.models.openai_models import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
ErrorDetail,
)
from app.services.watsonx_service import watsonx_service
from app.utils.transformers import (
transform_messages_to_watsonx,
transform_tools_to_watsonx,
transform_watsonx_to_openai_chat,
transform_watsonx_to_openai_chat_chunk,
format_sse_event,
)
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post(
"/v1/chat/completions",
response_model=Union[ChatCompletionResponse, ErrorResponse],
responses={
200: {"model": ChatCompletionResponse},
400: {"model": ErrorResponse},
401: {"model": ErrorResponse},
500: {"model": ErrorResponse},
},
)
async def create_chat_completion(
request: ChatCompletionRequest,
http_request: Request,
):
"""Create a chat completion using OpenAI-compatible API.
This endpoint accepts OpenAI-formatted requests and translates them
to watsonx.ai API calls.
"""
try:
# Map model name if needed
watsonx_model = settings.map_model(request.model)
logger.info(f"Chat completion request: {request.model} -> {watsonx_model}")
# Transform messages
watsonx_messages = transform_messages_to_watsonx(request.messages)
# Transform tools if present
watsonx_tools = transform_tools_to_watsonx(request.tools)
# Handle streaming
if request.stream:
return StreamingResponse(
stream_chat_completion(
watsonx_model,
watsonx_messages,
request,
watsonx_tools,
),
media_type="text/event-stream",
)
# Non-streaming response
watsonx_response = await watsonx_service.chat_completion(
model_id=watsonx_model,
messages=watsonx_messages,
temperature=request.temperature or 1.0,
max_tokens=request.max_tokens,
top_p=request.top_p or 1.0,
stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
tools=watsonx_tools,
)
# Transform response
openai_response = transform_watsonx_to_openai_chat(
watsonx_response,
request.model,
)
return openai_response
except Exception as e:
logger.error(f"Error in chat completion: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={
"error": {
"message": str(e),
"type": "internal_error",
"code": "internal_error",
}
},
)
async def stream_chat_completion(
watsonx_model: str,
watsonx_messages: list,
request: ChatCompletionRequest,
watsonx_tools: list = None,
):
"""Stream chat completion responses.
Args:
watsonx_model: The watsonx model ID
watsonx_messages: Transformed messages
request: Original OpenAI request
watsonx_tools: Transformed tools
Yields:
Server-Sent Events with chat completion chunks
"""
request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
try:
async for chunk in watsonx_service.chat_completion_stream(
model_id=watsonx_model,
messages=watsonx_messages,
temperature=request.temperature or 1.0,
max_tokens=request.max_tokens,
top_p=request.top_p or 1.0,
stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
tools=watsonx_tools,
):
# Transform chunk to OpenAI format
openai_chunk = transform_watsonx_to_openai_chat_chunk(
chunk,
request.model,
request_id,
)
# Send as SSE
yield format_sse_event(openai_chunk.model_dump_json())
# Send [DONE] message
yield format_sse_event("[DONE]")
except Exception as e:
logger.error(f"Error in streaming chat completion: {str(e)}", exc_info=True)
error_response = ErrorResponse(
error=ErrorDetail(
message=str(e),
type="internal_error",
code="stream_error",
)
)
yield format_sse_event(error_response.model_dump_json())
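
A quick way to exercise this endpoint is the official openai Python client pointed at the proxy. A hedged sketch: the base URL and the placeholder API key are assumptions, since the proxy holds the real watsonx.ai credentials in its own settings:

from openai import OpenAI

# Base URL and key are assumptions; adjust to wherever the proxy runs.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

# Non-streaming request
resp = client.chat.completions.create(
    model="ibm/granite-3-3-8b-instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(resp.choices[0].message.content)

# Streaming request: consumes the SSE chunks yielded by stream_chat_completion()
stream = client.chat.completions.create(
    model="ibm/granite-3-3-8b-instruct",
    messages=[{"role": "user", "content": "Count to three."}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)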

app/routers/completions.py (new file, +109)

@@ -0,0 +1,109 @@
"""Text completions endpoint router (legacy)."""
import uuid
from typing import Union
from fastapi import APIRouter, HTTPException, Request
from app.models.openai_models import (
CompletionRequest,
CompletionResponse,
ErrorResponse,
ErrorDetail,
)
from app.services.watsonx_service import watsonx_service
from app.utils.transformers import transform_watsonx_to_openai_completion
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post(
"/v1/completions",
response_model=Union[CompletionResponse, ErrorResponse],
responses={
200: {"model": CompletionResponse},
400: {"model": ErrorResponse},
401: {"model": ErrorResponse},
500: {"model": ErrorResponse},
},
)
async def create_completion(
request: CompletionRequest,
http_request: Request,
):
"""Create a text completion using OpenAI-compatible API (legacy).
This endpoint accepts OpenAI-formatted completion requests and translates
them to watsonx.ai text generation API calls.
"""
try:
# Map model name if needed
watsonx_model = settings.map_model(request.model)
logger.info(f"Completion request: {request.model} -> {watsonx_model}")
# Handle prompt (can be string or list)
if isinstance(request.prompt, list):
if len(request.prompt) == 0:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Prompt cannot be empty",
"type": "invalid_request_error",
"code": "invalid_prompt",
}
},
)
# For now, just use the first prompt
# TODO: Handle multiple prompts with n parameter
prompt = request.prompt[0] if isinstance(request.prompt[0], str) else ""
else:
prompt = request.prompt
# Note: Streaming not implemented for completions yet
if request.stream:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Streaming not supported for completions endpoint",
"type": "invalid_request_error",
"code": "streaming_not_supported",
}
},
)
# Call watsonx text generation
watsonx_response = await watsonx_service.text_generation(
model_id=watsonx_model,
prompt=prompt,
temperature=request.temperature or 1.0,
max_tokens=request.max_tokens,
top_p=request.top_p or 1.0,
stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
)
# Transform response
openai_response = transform_watsonx_to_openai_completion(
watsonx_response,
request.model,
)
return openai_response
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in completion: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={
"error": {
"message": str(e),
"type": "internal_error",
"code": "internal_error",
}
},
)
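
For the legacy endpoint, a raw HTTP call shows the request shape most directly. A sketch assuming the proxy listens on localhost:8000:

import httpx

payload = {
    "model": "ibm/granite-3-3-8b-instruct",
    "prompt": "Once upon a time",
    "max_tokens": 64,
    # "stream": True would be rejected with 400, per the handler above
}
r = httpx.post("http://localhost:8000/v1/completions", json=payload, timeout=60.0)
r.raise_for_status()
print(r.json()["choices"][0]["text"])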

app/routers/embeddings.py (new file, +114)

@@ -0,0 +1,114 @@
"""Embeddings endpoint router."""
from typing import Union
from fastapi import APIRouter, HTTPException, Request
from app.models.openai_models import (
EmbeddingRequest,
EmbeddingResponse,
ErrorResponse,
ErrorDetail,
)
from app.services.watsonx_service import watsonx_service
from app.utils.transformers import transform_watsonx_to_openai_embeddings
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post(
"/v1/embeddings",
response_model=Union[EmbeddingResponse, ErrorResponse],
responses={
200: {"model": EmbeddingResponse},
400: {"model": ErrorResponse},
401: {"model": ErrorResponse},
500: {"model": ErrorResponse},
},
)
async def create_embeddings(
request: EmbeddingRequest,
http_request: Request,
):
"""Create embeddings using OpenAI-compatible API.
This endpoint accepts OpenAI-formatted embedding requests and translates
them to watsonx.ai embeddings API calls.
"""
try:
# Map model name if needed
watsonx_model = settings.map_model(request.model)
logger.info(f"Embeddings request: {request.model} -> {watsonx_model}")
# Handle input (can be string or list)
if isinstance(request.input, str):
inputs = [request.input]
elif isinstance(request.input, list):
if len(request.input) == 0:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Input cannot be empty",
"type": "invalid_request_error",
"code": "invalid_input",
}
},
)
# Handle list of strings or list of token IDs
if isinstance(request.input[0], str):
inputs = request.input
else:
# Token IDs not supported yet
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Token ID input not supported",
"type": "invalid_request_error",
"code": "unsupported_input_type",
}
},
)
else:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Invalid input type",
"type": "invalid_request_error",
"code": "invalid_input_type",
}
},
)
# Call watsonx embeddings
watsonx_response = await watsonx_service.embeddings(
model_id=watsonx_model,
inputs=inputs,
)
# Transform response
openai_response = transform_watsonx_to_openai_embeddings(
watsonx_response,
request.model,
)
return openai_response
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in embeddings: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={
"error": {
"message": str(e),
"type": "internal_error",
"code": "internal_error",
}
},
)
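
The validation above accepts either a single string or a list of strings and rejects token-ID arrays. A sketch of both accepted shapes, again assuming the proxy runs on localhost:8000:

import httpx

# A single string is wrapped into a one-element list by the handler.
single = {"model": "ibm/slate-125m-english-rtrvr", "input": "hello world"}
# A list of strings is passed through as-is.
batch = {"model": "ibm/slate-125m-english-rtrvr", "input": ["first text", "second text"]}

for payload in (single, batch):
    r = httpx.post("http://localhost:8000/v1/embeddings", json=payload, timeout=60.0)
    r.raise_for_status()
    data = r.json()["data"]
    print(len(data), "embedding(s), dimension", len(data[0]["embedding"]))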

app/routers/models.py (new file, +120)

@@ -0,0 +1,120 @@
"""Models endpoint router."""
import time
from fastapi import APIRouter
from app.models.openai_models import ModelsResponse, ModelInfo
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
# Predefined list of available models
# This can be extended or made dynamic based on watsonx.ai API
AVAILABLE_MODELS = [
# Granite Models
"ibm/granite-3-1-8b-base",
"ibm/granite-3-2-8b-instruct",
"ibm/granite-3-3-8b-instruct",
"ibm/granite-3-8b-instruct",
"ibm/granite-4-h-small",
"ibm/granite-8b-code-instruct",
# Llama Models
"meta-llama/llama-3-1-70b-gptq",
"meta-llama/llama-3-1-8b",
"meta-llama/llama-3-2-11b-vision-instruct",
"meta-llama/llama-3-2-90b-vision-instruct",
"meta-llama/llama-3-3-70b-instruct",
"meta-llama/llama-3-405b-instruct",
"meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
# Mistral Models
"mistral-large-2512",
"mistralai/mistral-medium-2505",
"mistralai/mistral-small-3-1-24b-instruct-2503",
# Other Models
"openai/gpt-oss-120b",
# Embedding Models
"ibm/slate-125m-english-rtrvr",
"ibm/slate-30m-english-rtrvr",
]
@router.get(
"/v1/models",
response_model=ModelsResponse,
)
async def list_models():
"""List available models in OpenAI-compatible format.
Returns a list of models that can be used with the API.
Includes both the actual watsonx model IDs and any mapped names.
"""
created_time = int(time.time())
models = []
# Add all available watsonx models
for model_id in AVAILABLE_MODELS:
models.append(
ModelInfo(
id=model_id,
created=created_time,
owned_by="ibm-watsonx",
)
)
# Add mapped model names (e.g., gpt-4 -> ibm/granite-4-h-small)
model_mapping = settings.get_model_mapping()
for openai_name, watsonx_id in model_mapping.items():
if watsonx_id in AVAILABLE_MODELS:
models.append(
ModelInfo(
id=openai_name,
created=created_time,
owned_by="ibm-watsonx",
)
)
return ModelsResponse(data=models)
@router.get(
"/v1/models/{model_id}",
response_model=ModelInfo,
)
async def retrieve_model(model_id: str):
"""Retrieve information about a specific model.
Args:
model_id: The model ID to retrieve
Returns:
Model information
"""
# Map the model if needed
watsonx_model = settings.map_model(model_id)
# Check if model exists
if watsonx_model not in AVAILABLE_MODELS:
from fastapi import HTTPException
raise HTTPException(
status_code=404,
detail={
"error": {
"message": f"Model '{model_id}' not found",
"type": "invalid_request_error",
"code": "model_not_found",
}
},
)
return ModelInfo(
id=model_id,
created=int(time.time()),
owned_by="ibm-watsonx",
)
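
Note that retrieve_model() echoes the requested ID, so querying a mapped alias returns the alias rather than the underlying watsonx ID. A usage sketch assuming the proxy runs on localhost:8000:

import httpx

base = "http://localhost:8000"  # assumed host/port

# List everything the proxy advertises.
ids = [m["id"] for m in httpx.get(f"{base}/v1/models").json()["data"]]
print(ids[:5])

# Slash-containing IDs resolve because the route uses the :path converter.
print(httpx.get(f"{base}/v1/models/ibm/granite-4-h-small").json())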