Add AGENTS.md documentation for AI agent guidance
This commit is contained in:
5
app/routers/__init__.py
Normal file
5
app/routers/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""API routers for watsonx-openai-proxy."""
|
||||
|
||||
from app.routers import chat, completions, embeddings, models
|
||||
|
||||
__all__ = ["chat", "completions", "embeddings", "models"]
|
||||
156
app/routers/chat.py
Normal file
156
app/routers/chat.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Chat completions endpoint router."""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Union
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
from app.models.openai_models import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ErrorResponse,
|
||||
ErrorDetail,
|
||||
)
|
||||
from app.services.watsonx_service import watsonx_service
|
||||
from app.utils.transformers import (
|
||||
transform_messages_to_watsonx,
|
||||
transform_tools_to_watsonx,
|
||||
transform_watsonx_to_openai_chat,
|
||||
transform_watsonx_to_openai_chat_chunk,
|
||||
format_sse_event,
|
||||
)
|
||||
from app.config import settings
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/v1/chat/completions",
    response_model=Union[ChatCompletionResponse, ErrorResponse],
    responses={
        200: {"model": ChatCompletionResponse},
        400: {"model": ErrorResponse},
        401: {"model": ErrorResponse},
        500: {"model": ErrorResponse},
    },
)
async def create_chat_completion(
    request: ChatCompletionRequest,
    http_request: Request,
):
    """Create a chat completion using OpenAI-compatible API.

    This endpoint accepts OpenAI-formatted requests and translates them
    to watsonx.ai API calls.

    Args:
        request: Parsed OpenAI-style chat completion request body.
        http_request: Raw incoming request (unused here; kept for parity
            with the other routers and future middleware needs).

    Returns:
        A ChatCompletionResponse for non-streaming requests, or a
        StreamingResponse emitting SSE chunks when ``request.stream`` is set.

    Raises:
        HTTPException: 500 with an OpenAI-style error envelope on failure.
    """
    try:
        # Map OpenAI-style model name to a watsonx model ID if a mapping exists.
        watsonx_model = settings.map_model(request.model)
        logger.info(f"Chat completion request: {request.model} -> {watsonx_model}")

        # Translate OpenAI messages/tools into watsonx payload shapes.
        watsonx_messages = transform_messages_to_watsonx(request.messages)
        watsonx_tools = transform_tools_to_watsonx(request.tools)

        # Streaming requests are answered with Server-Sent Events.
        if request.stream:
            return StreamingResponse(
                stream_chat_completion(
                    watsonx_model,
                    watsonx_messages,
                    request,
                    watsonx_tools,
                ),
                media_type="text/event-stream",
            )

        # Non-streaming: single round trip to watsonx.
        watsonx_response = await watsonx_service.chat_completion(
            model_id=watsonx_model,
            messages=watsonx_messages,
            temperature=request.temperature or 1.0,
            max_tokens=request.max_tokens,
            top_p=request.top_p or 1.0,
            # OpenAI allows str | list | None for `stop`; normalize to list | None.
            stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
            tools=watsonx_tools,
        )

        # Echo the caller's original model name back in the response.
        openai_response = transform_watsonx_to_openai_chat(
            watsonx_response,
            request.model,
        )

        return openai_response

    except HTTPException:
        # Re-raise deliberate HTTP errors unchanged so their status codes
        # survive (consistent with the completions and embeddings routers).
        raise
    except Exception as e:
        logger.error(f"Error in chat completion: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={
                "error": {
                    "message": str(e),
                    "type": "internal_error",
                    "code": "internal_error",
                }
            },
        )
|
||||
|
||||
|
||||
async def stream_chat_completion(
    watsonx_model: str,
    watsonx_messages: list,
    request: ChatCompletionRequest,
    watsonx_tools: Union[list, None] = None,
):
    """Stream chat completion responses as Server-Sent Events.

    Args:
        watsonx_model: The watsonx model ID
        watsonx_messages: Transformed messages
        request: Original OpenAI request
        watsonx_tools: Transformed tools, or None when the request has none.
            (Annotation fixed: the default is None, so plain ``list`` was wrong.)

    Yields:
        Server-Sent Events with chat completion chunks, terminated by a
        ``[DONE]`` event on success. On error a single OpenAI-style error
        event is emitted instead.
    """
    # One request id shared by every chunk of this stream, OpenAI-style.
    request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"

    try:
        async for chunk in watsonx_service.chat_completion_stream(
            model_id=watsonx_model,
            messages=watsonx_messages,
            temperature=request.temperature or 1.0,
            max_tokens=request.max_tokens,
            top_p=request.top_p or 1.0,
            # OpenAI allows str | list | None for `stop`; normalize to list | None.
            stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
            tools=watsonx_tools,
        ):
            # Transform each watsonx chunk to an OpenAI-format chunk.
            openai_chunk = transform_watsonx_to_openai_chat_chunk(
                chunk,
                request.model,
                request_id,
            )

            # Send as SSE.
            yield format_sse_event(openai_chunk.model_dump_json())

        # Signal end-of-stream to OpenAI-compatible clients.
        yield format_sse_event("[DONE]")

    except Exception as e:
        logger.error(f"Error in streaming chat completion: {str(e)}", exc_info=True)
        error_response = ErrorResponse(
            error=ErrorDetail(
                message=str(e),
                type="internal_error",
                code="stream_error",
            )
        )
        # NOTE(review): no [DONE] is sent after an error — some clients may
        # keep the connection open waiting for it; confirm desired behavior.
        yield format_sse_event(error_response.model_dump_json())
|
||||
109
app/routers/completions.py
Normal file
109
app/routers/completions.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Text completions endpoint router (legacy)."""
|
||||
|
||||
import uuid
|
||||
from typing import Union
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from app.models.openai_models import (
|
||||
CompletionRequest,
|
||||
CompletionResponse,
|
||||
ErrorResponse,
|
||||
ErrorDetail,
|
||||
)
|
||||
from app.services.watsonx_service import watsonx_service
|
||||
from app.utils.transformers import transform_watsonx_to_openai_completion
|
||||
from app.config import settings
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/v1/completions",
    response_model=Union[CompletionResponse, ErrorResponse],
    responses={
        200: {"model": CompletionResponse},
        400: {"model": ErrorResponse},
        401: {"model": ErrorResponse},
        500: {"model": ErrorResponse},
    },
)
async def create_completion(
    request: CompletionRequest,
    http_request: Request,
):
    """Create a text completion using OpenAI-compatible API (legacy).

    This endpoint accepts OpenAI-formatted completion requests and translates
    them to watsonx.ai text generation API calls.

    Args:
        request: Parsed OpenAI-style completion request body.
        http_request: Raw incoming request (unused; kept for router parity).

    Returns:
        A CompletionResponse built from the watsonx text-generation result.

    Raises:
        HTTPException: 400 for empty/unsupported prompts or streaming,
            500 with an OpenAI-style error envelope on any other failure.
    """
    try:
        # Map OpenAI-style model name to a watsonx model ID if a mapping exists.
        watsonx_model = settings.map_model(request.model)
        logger.info(f"Completion request: {request.model} -> {watsonx_model}")

        # Handle prompt (can be string or list).
        if isinstance(request.prompt, list):
            if len(request.prompt) == 0:
                raise HTTPException(
                    status_code=400,
                    detail={
                        "error": {
                            "message": "Prompt cannot be empty",
                            "type": "invalid_request_error",
                            "code": "invalid_prompt",
                        }
                    },
                )
            if not isinstance(request.prompt[0], str):
                # Previously a token-ID prompt fell through to "" and silently
                # generated from an empty prompt; reject it explicitly instead,
                # mirroring the embeddings router's token-ID handling.
                raise HTTPException(
                    status_code=400,
                    detail={
                        "error": {
                            "message": "Token ID input not supported",
                            "type": "invalid_request_error",
                            "code": "unsupported_input_type",
                        }
                    },
                )
            # For now, just use the first prompt.
            # TODO: Handle multiple prompts with n parameter
            prompt = request.prompt[0]
        else:
            prompt = request.prompt

        # Note: Streaming not implemented for completions yet.
        if request.stream:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": {
                        "message": "Streaming not supported for completions endpoint",
                        "type": "invalid_request_error",
                        "code": "streaming_not_supported",
                    }
                },
            )

        # Call watsonx text generation.
        watsonx_response = await watsonx_service.text_generation(
            model_id=watsonx_model,
            prompt=prompt,
            temperature=request.temperature or 1.0,
            max_tokens=request.max_tokens,
            top_p=request.top_p or 1.0,
            # OpenAI allows str | list | None for `stop`; normalize to list | None.
            stop=request.stop if isinstance(request.stop, list) else [request.stop] if request.stop else None,
        )

        # Echo the caller's original model name back in the response.
        openai_response = transform_watsonx_to_openai_completion(
            watsonx_response,
            request.model,
        )

        return openai_response

    except HTTPException:
        # Preserve deliberate status codes (400s above) untouched.
        raise
    except Exception as e:
        logger.error(f"Error in completion: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={
                "error": {
                    "message": str(e),
                    "type": "internal_error",
                    "code": "internal_error",
                }
            },
        )
|
||||
114
app/routers/embeddings.py
Normal file
114
app/routers/embeddings.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""Embeddings endpoint router."""
|
||||
|
||||
from typing import Union
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from app.models.openai_models import (
|
||||
EmbeddingRequest,
|
||||
EmbeddingResponse,
|
||||
ErrorResponse,
|
||||
ErrorDetail,
|
||||
)
|
||||
from app.services.watsonx_service import watsonx_service
|
||||
from app.utils.transformers import transform_watsonx_to_openai_embeddings
|
||||
from app.config import settings
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/v1/embeddings",
    response_model=Union[EmbeddingResponse, ErrorResponse],
    responses={
        200: {"model": EmbeddingResponse},
        400: {"model": ErrorResponse},
        401: {"model": ErrorResponse},
        500: {"model": ErrorResponse},
    },
)
async def create_embeddings(
    request: EmbeddingRequest,
    http_request: Request,
):
    """Create embeddings using OpenAI-compatible API.

    This endpoint accepts OpenAI-formatted embedding requests and translates
    them to watsonx.ai embeddings API calls.

    Args:
        request: Parsed OpenAI-style embedding request body.
        http_request: Raw incoming request (unused; kept for router parity).

    Returns:
        An EmbeddingResponse built from the watsonx embeddings result.

    Raises:
        HTTPException: 400 for empty/unsupported input, 500 with an
            OpenAI-style error envelope on any other failure.
    """

    def _invalid(message: str, code: str) -> HTTPException:
        # All validation failures share the same OpenAI-style 400 envelope.
        return HTTPException(
            status_code=400,
            detail={
                "error": {
                    "message": message,
                    "type": "invalid_request_error",
                    "code": code,
                }
            },
        )

    try:
        # Map OpenAI-style model name to a watsonx model ID if a mapping exists.
        watsonx_model = settings.map_model(request.model)
        logger.info(f"Embeddings request: {request.model} -> {watsonx_model}")

        # Normalize the input to a list of strings; the OpenAI schema allows
        # a single string, a list of strings, or token-ID lists (unsupported).
        if isinstance(request.input, str):
            inputs = [request.input]
        elif isinstance(request.input, list):
            if not request.input:
                raise _invalid("Input cannot be empty", "invalid_input")
            if not isinstance(request.input[0], str):
                # Token IDs not supported yet.
                raise _invalid("Token ID input not supported", "unsupported_input_type")
            inputs = request.input
        else:
            raise _invalid("Invalid input type", "invalid_input_type")

        # Call watsonx embeddings.
        watsonx_response = await watsonx_service.embeddings(
            model_id=watsonx_model,
            inputs=inputs,
        )

        # Echo the caller's original model name back in the response.
        return transform_watsonx_to_openai_embeddings(
            watsonx_response,
            request.model,
        )

    except HTTPException:
        # Preserve deliberate status codes (400s above) untouched.
        raise
    except Exception as e:
        logger.error(f"Error in embeddings: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={
                "error": {
                    "message": str(e),
                    "type": "internal_error",
                    "code": "internal_error",
                }
            },
        )
|
||||
120
app/routers/models.py
Normal file
120
app/routers/models.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""Models endpoint router."""
|
||||
|
||||
import time
|
||||
from fastapi import APIRouter
|
||||
from app.models.openai_models import ModelsResponse, ModelInfo
|
||||
from app.config import settings
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# Predefined list of available models served through this proxy.
# This can be extended or made dynamic based on the watsonx.ai API.
# NOTE(review): maintained by hand — presumably mirrors the deployed
# watsonx.ai catalog; confirm entries against the live service.
AVAILABLE_MODELS = [
    # Granite Models
    "ibm/granite-3-1-8b-base",
    "ibm/granite-3-2-8b-instruct",
    "ibm/granite-3-3-8b-instruct",
    "ibm/granite-3-8b-instruct",
    "ibm/granite-4-h-small",
    "ibm/granite-8b-code-instruct",

    # Llama Models
    "meta-llama/llama-3-1-70b-gptq",
    "meta-llama/llama-3-1-8b",
    "meta-llama/llama-3-2-11b-vision-instruct",
    "meta-llama/llama-3-2-90b-vision-instruct",
    "meta-llama/llama-3-3-70b-instruct",
    "meta-llama/llama-3-405b-instruct",
    "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",

    # Mistral Models
    "mistral-large-2512",
    "mistralai/mistral-medium-2505",
    "mistralai/mistral-small-3-1-24b-instruct-2503",

    # Other Models
    "openai/gpt-oss-120b",

    # Embedding Models (used by the /v1/embeddings endpoint)
    "ibm/slate-125m-english-rtrvr",
    "ibm/slate-30m-english-rtrvr",
]
|
||||
|
||||
|
||||
@router.get(
    "/v1/models",
    response_model=ModelsResponse,
)
async def list_models():
    """List available models in OpenAI-compatible format.

    Returns a list of models that can be used with the API.
    Includes both the actual watsonx model IDs and any mapped names.
    """
    # One shared timestamp so every entry reports the same creation time.
    now = int(time.time())

    # Every known watsonx model, verbatim.
    catalog = [
        ModelInfo(
            id=model_id,
            created=now,
            owned_by="ibm-watsonx",
        )
        for model_id in AVAILABLE_MODELS
    ]

    # Mapped aliases (e.g. gpt-4 -> ibm/granite-4-h-small) whose target
    # actually exists in the catalog.
    catalog.extend(
        ModelInfo(
            id=alias,
            created=now,
            owned_by="ibm-watsonx",
        )
        for alias, target in settings.get_model_mapping().items()
        if target in AVAILABLE_MODELS
    )

    return ModelsResponse(data=catalog)
|
||||
|
||||
|
||||
@router.get(
    "/v1/models/{model_id}",
    response_model=ModelInfo,
)
async def retrieve_model(model_id: str):
    """Retrieve information about a specific model.

    Args:
        model_id: The model ID to retrieve

    Returns:
        Model information
    """
    from fastapi import HTTPException

    # Resolve any alias (e.g. gpt-4) to its watsonx model ID, then make
    # sure the resolved model is one this proxy actually serves.
    if settings.map_model(model_id) not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=404,
            detail={
                "error": {
                    "message": f"Model '{model_id}' not found",
                    "type": "invalid_request_error",
                    "code": "model_not_found",
                }
            },
        )

    # Echo back the caller's name, not the resolved watsonx ID.
    return ModelInfo(
        id=model_id,
        created=int(time.time()),
        owned_by="ibm-watsonx",
    )
|
||||
Reference in New Issue
Block a user