Add AGENTS.md documentation for AI agent guidance

2026-02-23 09:59:52 -05:00
commit 2e2b817435
21 changed files with 2513 additions and 0 deletions

app/routers/__init__.py (new file, +5)

@@ -0,0 +1,5 @@
"""API routers for watsonx-openai-proxy."""
from app.routers import chat, completions, embeddings, models
__all__ = ["chat", "completions", "embeddings", "models"]
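
These routers are re-exported so the application entry point can mount them in one place. A minimal wiring sketch follows; app/main.py is not among the files shown here, so its name and contents are assumptions:

from fastapi import FastAPI

from app.routers import chat, completions, embeddings, models

# Hypothetical app/main.py: each router already declares its own /v1/...
# paths, so no prefix argument is needed when mounting.
app = FastAPI(title="watsonx-openai-proxy")
app.include_router(chat.router)
app.include_router(completions.router)
app.include_router(embeddings.router)
app.include_router(models.router)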

app/routers/chat.py (new file, +156)

@@ -0,0 +1,156 @@
"""Chat completions endpoint router."""
import json
import uuid
from typing import Union
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse
from app.models.openai_models import (
ChatCompletionRequest,
ChatCompletionResponse,
ErrorResponse,
ErrorDetail,
)
from app.services.watsonx_service import watsonx_service
from app.utils.transformers import (
transform_messages_to_watsonx,
transform_tools_to_watsonx,
transform_watsonx_to_openai_chat,
transform_watsonx_to_openai_chat_chunk,
format_sse_event,
)
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post(
"/v1/chat/completions",
response_model=Union[ChatCompletionResponse, ErrorResponse],
responses={
200: {"model": ChatCompletionResponse},
400: {"model": ErrorResponse},
401: {"model": ErrorResponse},
500: {"model": ErrorResponse},
},
)
async def create_chat_completion(
request: ChatCompletionRequest,
http_request: Request,
):
"""Create a chat completion using OpenAI-compatible API.
This endpoint accepts OpenAI-formatted requests and translates them
to watsonx.ai API calls.
"""
try:
# Map model name if needed
watsonx_model = settings.map_model(request.model)
logger.info(f"Chat completion request: {request.model} -> {watsonx_model}")
# Transform messages
watsonx_messages = transform_messages_to_watsonx(request.messages)
# Transform tools if present
watsonx_tools = transform_tools_to_watsonx(request.tools)
# Handle streaming
if request.stream:
return StreamingResponse(
stream_chat_completion(
watsonx_model,
watsonx_messages,
request,
watsonx_tools,
),
media_type="text/event-stream",
)
# Non-streaming response
watsonx_response = await watsonx_service.chat_completion(
model_id=watsonx_model,
messages=watsonx_messages,
temperature=request.temperature or 1.0,
max_tokens=request.max_tokens,
top_p=request.top_p or 1.0,
stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
tools=watsonx_tools,
)
# Transform response
openai_response = transform_watsonx_to_openai_chat(
watsonx_response,
request.model,
)
return openai_response
except Exception as e:
logger.error(f"Error in chat completion: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={
"error": {
"message": str(e),
"type": "internal_error",
"code": "internal_error",
}
},
)
async def stream_chat_completion(
watsonx_model: str,
watsonx_messages: list,
request: ChatCompletionRequest,
watsonx_tools: list = None,
):
"""Stream chat completion responses.
Args:
watsonx_model: The watsonx model ID
watsonx_messages: Transformed messages
request: Original OpenAI request
watsonx_tools: Transformed tools
Yields:
Server-Sent Events with chat completion chunks
"""
request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
try:
async for chunk in watsonx_service.chat_completion_stream(
model_id=watsonx_model,
messages=watsonx_messages,
temperature=request.temperature or 1.0,
max_tokens=request.max_tokens,
top_p=request.top_p or 1.0,
stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
tools=watsonx_tools,
):
# Transform chunk to OpenAI format
openai_chunk = transform_watsonx_to_openai_chat_chunk(
chunk,
request.model,
request_id,
)
# Send as SSE
yield format_sse_event(openai_chunk.model_dump_json())
# Send [DONE] message
yield format_sse_event("[DONE]")
except Exception as e:
logger.error(f"Error in streaming chat completion: {str(e)}", exc_info=True)
error_response = ErrorResponse(
error=ErrorDetail(
message=str(e),
type="internal_error",
code="stream_error",
)
)
yield format_sse_event(error_response.model_dump_json())
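
A quick way to exercise this endpoint is the official openai Python client pointed at the proxy. A hedged sketch: the base URL and the placeholder API key are assumptions, since the proxy holds the real watsonx.ai credentials in its own settings:

from openai import OpenAI

# Base URL and key are assumptions; adjust to wherever the proxy runs.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

# Non-streaming request
resp = client.chat.completions.create(
    model="ibm/granite-3-3-8b-instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(resp.choices[0].message.content)

# Streaming request: consumes the SSE chunks yielded by stream_chat_completion()
stream = client.chat.completions.create(
    model="ibm/granite-3-3-8b-instruct",
    messages=[{"role": "user", "content": "Count to three."}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)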

app/routers/completions.py (new file, +109)

@@ -0,0 +1,109 @@
"""Text completions endpoint router (legacy)."""
import uuid
from typing import Union
from fastapi import APIRouter, HTTPException, Request
from app.models.openai_models import (
CompletionRequest,
CompletionResponse,
ErrorResponse,
ErrorDetail,
)
from app.services.watsonx_service import watsonx_service
from app.utils.transformers import transform_watsonx_to_openai_completion
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post(
"/v1/completions",
response_model=Union[CompletionResponse, ErrorResponse],
responses={
200: {"model": CompletionResponse},
400: {"model": ErrorResponse},
401: {"model": ErrorResponse},
500: {"model": ErrorResponse},
},
)
async def create_completion(
request: CompletionRequest,
http_request: Request,
):
"""Create a text completion using OpenAI-compatible API (legacy).
This endpoint accepts OpenAI-formatted completion requests and translates
them to watsonx.ai text generation API calls.
"""
try:
# Map model name if needed
watsonx_model = settings.map_model(request.model)
logger.info(f"Completion request: {request.model} -> {watsonx_model}")
# Handle prompt (can be string or list)
if isinstance(request.prompt, list):
if len(request.prompt) == 0:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Prompt cannot be empty",
"type": "invalid_request_error",
"code": "invalid_prompt",
}
},
)
# For now, just use the first prompt
# TODO: Handle multiple prompts with n parameter
prompt = request.prompt[0] if isinstance(request.prompt[0], str) else ""
else:
prompt = request.prompt
# Note: Streaming not implemented for completions yet
if request.stream:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Streaming not supported for completions endpoint",
"type": "invalid_request_error",
"code": "streaming_not_supported",
}
},
)
# Call watsonx text generation
watsonx_response = await watsonx_service.text_generation(
model_id=watsonx_model,
prompt=prompt,
temperature=request.temperature or 1.0,
max_tokens=request.max_tokens,
top_p=request.top_p or 1.0,
stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
)
# Transform response
openai_response = transform_watsonx_to_openai_completion(
watsonx_response,
request.model,
)
return openai_response
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in completion: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={
"error": {
"message": str(e),
"type": "internal_error",
"code": "internal_error",
}
},
)
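
For the legacy endpoint, a raw HTTP call shows the request shape most directly. A sketch assuming the proxy listens on localhost:8000:

import httpx

payload = {
    "model": "ibm/granite-3-3-8b-instruct",
    "prompt": "Once upon a time",
    "max_tokens": 64,
    # "stream": True would be rejected with 400, per the handler above
}
r = httpx.post("http://localhost:8000/v1/completions", json=payload, timeout=60.0)
r.raise_for_status()
print(r.json()["choices"][0]["text"])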

app/routers/embeddings.py (new file, +114)

@@ -0,0 +1,114 @@
"""Embeddings endpoint router."""
from typing import Union
from fastapi import APIRouter, HTTPException, Request
from app.models.openai_models import (
EmbeddingRequest,
EmbeddingResponse,
ErrorResponse,
ErrorDetail,
)
from app.services.watsonx_service import watsonx_service
from app.utils.transformers import transform_watsonx_to_openai_embeddings
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post(
"/v1/embeddings",
response_model=Union[EmbeddingResponse, ErrorResponse],
responses={
200: {"model": EmbeddingResponse},
400: {"model": ErrorResponse},
401: {"model": ErrorResponse},
500: {"model": ErrorResponse},
},
)
async def create_embeddings(
request: EmbeddingRequest,
http_request: Request,
):
"""Create embeddings using OpenAI-compatible API.
This endpoint accepts OpenAI-formatted embedding requests and translates
them to watsonx.ai embeddings API calls.
"""
try:
# Map model name if needed
watsonx_model = settings.map_model(request.model)
logger.info(f"Embeddings request: {request.model} -> {watsonx_model}")
# Handle input (can be string or list)
if isinstance(request.input, str):
inputs = [request.input]
elif isinstance(request.input, list):
if len(request.input) == 0:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Input cannot be empty",
"type": "invalid_request_error",
"code": "invalid_input",
}
},
)
# Handle list of strings or list of token IDs
if isinstance(request.input[0], str):
inputs = request.input
else:
# Token IDs not supported yet
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Token ID input not supported",
"type": "invalid_request_error",
"code": "unsupported_input_type",
}
},
)
else:
raise HTTPException(
status_code=400,
detail={
"error": {
"message": "Invalid input type",
"type": "invalid_request_error",
"code": "invalid_input_type",
}
},
)
# Call watsonx embeddings
watsonx_response = await watsonx_service.embeddings(
model_id=watsonx_model,
inputs=inputs,
)
# Transform response
openai_response = transform_watsonx_to_openai_embeddings(
watsonx_response,
request.model,
)
return openai_response
except HTTPException:
raise
except Exception as e:
logger.error(f"Error in embeddings: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={
"error": {
"message": str(e),
"type": "internal_error",
"code": "internal_error",
}
},
)
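
The validation above accepts either a single string or a list of strings and rejects token-ID arrays. A sketch of both accepted shapes, again assuming the proxy runs on localhost:8000:

import httpx

# A single string is wrapped into a one-element list by the handler.
single = {"model": "ibm/slate-125m-english-rtrvr", "input": "hello world"}
# A list of strings is passed through as-is.
batch = {"model": "ibm/slate-125m-english-rtrvr", "input": ["first text", "second text"]}

for payload in (single, batch):
    r = httpx.post("http://localhost:8000/v1/embeddings", json=payload, timeout=60.0)
    r.raise_for_status()
    data = r.json()["data"]
    print(len(data), "embedding(s), dimension", len(data[0]["embedding"]))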

app/routers/models.py (new file, +120)

@@ -0,0 +1,120 @@
"""Models endpoint router."""
import time
from fastapi import APIRouter
from app.models.openai_models import ModelsResponse, ModelInfo
from app.config import settings
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
# Predefined list of available models
# This can be extended or made dynamic based on watsonx.ai API
AVAILABLE_MODELS = [
# Granite Models
"ibm/granite-3-1-8b-base",
"ibm/granite-3-2-8b-instruct",
"ibm/granite-3-3-8b-instruct",
"ibm/granite-3-8b-instruct",
"ibm/granite-4-h-small",
"ibm/granite-8b-code-instruct",
# Llama Models
"meta-llama/llama-3-1-70b-gptq",
"meta-llama/llama-3-1-8b",
"meta-llama/llama-3-2-11b-vision-instruct",
"meta-llama/llama-3-2-90b-vision-instruct",
"meta-llama/llama-3-3-70b-instruct",
"meta-llama/llama-3-405b-instruct",
"meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
# Mistral Models
"mistral-large-2512",
"mistralai/mistral-medium-2505",
"mistralai/mistral-small-3-1-24b-instruct-2503",
# Other Models
"openai/gpt-oss-120b",
# Embedding Models
"ibm/slate-125m-english-rtrvr",
"ibm/slate-30m-english-rtrvr",
]
@router.get(
"/v1/models",
response_model=ModelsResponse,
)
async def list_models():
"""List available models in OpenAI-compatible format.
Returns a list of models that can be used with the API.
Includes both the actual watsonx model IDs and any mapped names.
"""
created_time = int(time.time())
models = []
# Add all available watsonx models
for model_id in AVAILABLE_MODELS:
models.append(
ModelInfo(
id=model_id,
created=created_time,
owned_by="ibm-watsonx",
)
)
# Add mapped model names (e.g., gpt-4 -> ibm/granite-4-h-small)
model_mapping = settings.get_model_mapping()
for openai_name, watsonx_id in model_mapping.items():
if watsonx_id in AVAILABLE_MODELS:
models.append(
ModelInfo(
id=openai_name,
created=created_time,
owned_by="ibm-watsonx",
)
)
return ModelsResponse(data=models)
@router.get(
"/v1/models/{model_id}",
response_model=ModelInfo,
)
async def retrieve_model(model_id: str):
"""Retrieve information about a specific model.
Args:
model_id: The model ID to retrieve
Returns:
Model information
"""
# Map the model if needed
watsonx_model = settings.map_model(model_id)
# Check if model exists
if watsonx_model not in AVAILABLE_MODELS:
from fastapi import HTTPException
raise HTTPException(
status_code=404,
detail={
"error": {
"message": f"Model '{model_id}' not found",
"type": "invalid_request_error",
"code": "model_not_found",
}
},
)
return ModelInfo(
id=model_id,
created=int(time.time()),
owned_by="ibm-watsonx",
)
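
Note that retrieve_model() echoes the requested ID, so querying a mapped alias returns the alias rather than the underlying watsonx ID. A usage sketch assuming the proxy runs on localhost:8000:

import httpx

base = "http://localhost:8000"  # assumed host/port

# List everything the proxy advertises.
ids = [m["id"] for m in httpx.get(f"{base}/v1/models").json()["data"]]
print(ids[:5])

# Slash-containing IDs resolve because the route uses the :path converter.
print(httpx.get(f"{base}/v1/models/ibm/granite-4-h-small").json())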