# main.py

import os

from llama import Llama

from fastapi import FastAPI
from pydantic import BaseModel
from pydantic import Field
import uvicorn

# Set up our models

# Request body for the API: carries the single query string to send to Llama
class Request(BaseModel):
    # Required field (no default); the description is attached as field metadata
    query: str = Field(..., description="The query to send to Llama")

# Represents an item generated from the Llama API: the role the message is
# attributed to and the message's text content
class Generation(BaseModel):
    # NOTE: there must be no trailing comma after Field(...). A trailing comma
    # turns the default into a one-element tuple, so the field is no longer a
    # required, described string field.
    role: str = Field(description="The role of the generation")
    content: str = Field(description="The content of the generation")

# Response item returned by the API: wraps one Generation produced by Llama
class Response(BaseModel):
    # Required field (no default); validated against the Generation model
    generation: Generation = Field(
        ...,
        description="An item of work generated by Llama",
    )

# Path the Swagger UI is served from: the DOCS_URL environment variable when
# it is set, otherwise the root path
docsUrl = os.environ.get('DOCS_URL', "/")

# Metadata for the endpoint tags: one entry per tag, giving its name and a
# human-readable description
tags_metadata = [
    dict(
        name="submit",
        description="Endpoint used to submit a query to Llama",
    ),
]

# Markdown-formatted description of the API (passed to FastAPI as the
# application description)
apiDescription = """
A simple Docker container for submitting queries to a local Llama 2 instance.

#### License

Llama 2 resources are governed by the [LLAMA 2 COMMUNITY LICENSE AGREEMENT](https://github.com/facebookresearch/llama/blob/main/LICENSE).

This container is provided AS IS with an MIT License.
"""

# Create the FastAPI application, wiring in the title, the Markdown
# description, the tag metadata and the path the docs UI is served from
app = FastAPI(
    title="Simple Llama 2 Docker Container",
    description=apiDescription,
    openapi_tags=tags_metadata,
    docs_url=docsUrl,
)

# Where the model weight files should be loaded from: the value of the
# MODEL_WEIGHT environment variable appended to "/code/"
if os.environ.get('MODEL_WEIGHT') is None:
    # Fail fast with an actionable message; concatenating None onto a str
    # would otherwise raise an opaque TypeError.
    raise RuntimeError(
        "The MODEL_WEIGHT environment variable must be set to the model "
        "weight directory (relative to /code/)"
    )
modelWeightPath = "/code/" + os.environ['MODEL_WEIGHT']
# Port to serve the UI from: the PORT environment variable parsed as an
# integer when it is set, otherwise 8080
port = int(os.environ.get('PORT', 8080))

print(f'Building model with model weight at {modelWeightPath}')

# Build our Llama model: the checkpoint directory is the model weight path,
# and the tokenizer is read from "tokenizer.model" inside that same directory
generator = Llama.build(
    ckpt_dir=modelWeightPath,
    tokenizer_path=f"{modelWeightPath}/tokenizer.model",
    max_seq_len=512,
    max_batch_size=4,
)

print('Model build completed; starting server...')

# POST endpoint that forwards a query to the Llama generator
@app.post(
    "/submit",
    summary="Submits a query",
    description="Submits a query to Llama using default settings",
    tags=["submit"],
)
def submit(request: Request) -> list[Response]:
    """Send the request's query to Llama as a single user message.

    The query is wrapped in one dialog containing one user turn, and
    whatever the generator's chat completion returns is handed back
    unchanged.
    """
    user_message = {"role": "user", "content": request.query}
    # A list of dialogs, holding a single dialog with a single message
    return generator.chat_completion([[user_message]])

@app.on_event("startup")
async def startup_event():
    """Print a startup banner and the local URL where the docs UI is served."""
    print('Server Started!')
    ui_url = f'http://localhost:{port}{docsUrl}'
    print(f'Visit {ui_url} to access the UI!')

# Start serving the app on all interfaces at the configured port.
# NOTE(review): this statement runs whenever the module is executed or
# imported; consider an `if __name__ == "__main__":` guard if this module is
# ever imported rather than run directly.
uvicorn.run(
    app,
    port=port,
    host="0.0.0.0",
)