-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathapi.py
124 lines (106 loc) · 3.35 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import asyncio
import json
import os
import time
from typing import Any, Dict, List, Optional, Tuple

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from gradio_client import Client
from pydantic import BaseModel
app = FastAPI()
# CORS configuration
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Initialize the Gradio client
gradio_url = os.getenv("GRADIO_URL", "http://127.0.0.1:7860/")
gradio_client = Client(gradio_url)
class ChatMessage(BaseModel):
role: str
content: str
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = 0.7
max_tokens: Optional[int] = None
stream: Optional[bool] = False
class Usage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class Choice(BaseModel):
index: int
message: ChatMessage
finish_reason: str
class ChatCompletionResponse(BaseModel):
id: str
object: str
created: int
model: str
choices: List[Choice]
usage: Usage
@app.post("/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
try:
# Get the last user message
last_user_message = next((msg.content for msg in reversed(request.messages) if msg.role == "user"), "")
# Prepare the chat history
history = []
for msg in request.messages:
if msg.role == "user":
history.append([msg.content, None])
elif msg.role == "assistant" and history:
history[-1][1] = msg.content
result = await asyncio.to_thread(
gradio_client.predict,
last_user_message,
history,
api_name="/chat"
)
# Extracting the response from the Gradio result
chat_history, processing_log = result
response = chat_history[-1][1] if chat_history else ""
# Construct the response
choice = Choice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop"
)
# Dummy usage data (you might want to implement actual token counting)
usage = Usage(prompt_tokens=len(last_user_message), completion_tokens=len(response), total_tokens=len(last_user_message)+len(response))
return ChatCompletionResponse(
id="chatcmpl-" + os.urandom(4).hex(),
object="chat.completion",
created=int(asyncio.get_event_loop().time()),
model=request.model,
choices=[choice],
usage=usage
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/models")
async def list_models():
return {
"data": [
{
"id": "moa",
"object": "model",
"created": 1686935002,
"owned_by": "organization-owner"
}
],
"object": "list"
}
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"api:app",
host="0.0.0.0",
port=int(os.getenv("PORT", 8000)),
reload=True,
ssl_keyfile=os.getenv("SSL_KEYFILE", None),
ssl_certfile=os.getenv("SSL_CERTFILE", None),
)