Commit

Added:
    - In `openai`, support for `prompt` and `extra_body`. Reference: https://github.com/openai/openai-python/blob/195c05a64d39c87b2dfdf1eca2d339597f1fce03/src/openai/resources/completions.py#L41
    - Pass `llama-cli` options to `openai`.
    - `util` module with `is_cuda_available` function.
    - `openai` supports both `prompt` and `messages`. Reference: https://github.com/openai/openai-python/blob/195c05a64d39c87b2dfdf1eca2d339597f1fce03/src/openai/resources/completions.py#L45
mtasic85 committed Jul 30, 2024
1 parent ec2aef5 commit bce8bfb
Showing 8 changed files with 158 additions and 90 deletions.
18 changes: 16 additions & 2 deletions CHANGELOG.md
@@ -1,14 +1,28 @@
# CHANGELOG

## v0.1.10

Added:
- In `openai`, support for `prompt` and `extra_body`. Reference: https://github.com/openai/openai-python/blob/195c05a64d39c87b2dfdf1eca2d339597f1fce03/src/openai/resources/completions.py#L41
- Pass `llama-cli` options to `openai`.
- `util` module with `is_cuda_available` function.
- `openai` supports both `prompt` and `messages`. Reference: https://github.com/openai/openai-python/blob/195c05a64d39c87b2dfdf1eca2d339597f1fce03/src/openai/resources/completions.py#L45

## v0.1.9

Added:
- Support for default CPU tinyBLAS (llamafile, sgemm) builds
- Support for CPU OpenBLAS (GGML_OPENBLAS) builds
- Support for default CPU tinyBLAS (llamafile, sgemm) builds.
- Support for CPU OpenBLAS (GGML_OPENBLAS) builds.

Changed:
- Build scripts now have a separate step/function `cuda_12_5_1_setup`, which sets up the CUDA 12.5.1 environment at build time.

Fixed:
- Stop thread in `llama_generate` on `GeneratorExit`.

Removed:
- `callback` parameter in `llama_generate` and dependent functions.

## v0.1.8

Added:
18 changes: 18 additions & 0 deletions examples/demo_1.py
@@ -29,6 +29,15 @@ def demo_chat_completions():
messages=messages,
temperature=0.0,
stop=['```\n'],

# llama-cpp-cffi
extra_body=dict(
batch_size=512,
n_gpu_layers=22,
main_gpu=0,
cont_batching=True,
flash_attn=True,
),
)

print(response.choices[0].message.content)
@@ -43,6 +52,15 @@ def demo_chat_completions_stream():
temperature=0.0,
stop=['```\n'],
stream=True,

# llama-cpp-cffi
extra_body=dict(
batch_size=512,
n_gpu_layers=22,
main_gpu=0,
cont_batching=True,
flash_attn=True,
),
)

for chunk in response:
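The demo above exercises `messages`; the new `prompt` path from the changelog is not shown in these hunks. A minimal companion sketch: it assumes the same OpenAI-compatible server that `demo_1.py` talks to and passes `prompt` through `extra_body` so it reaches `data.get('prompt')` in the `chat_completions` handler (see the `llama/openai.py` diff below). The base URL, API key, and model id are illustrative assumptions, not taken from this commit.

```python
from openai import OpenAI

# Assumptions: server address, API key placeholder, and model id are illustrative only.
client = OpenAI(base_url='http://127.0.0.1:11434/v1', api_key='llama-cpp-cffi')

response = client.chat.completions.create(
    model='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    messages=[],  # the handler uses `prompt or messages`, so the prompt below is used
    temperature=0.0,

    # llama-cpp-cffi: extra_body fields are merged into the JSON request body
    extra_body=dict(
        prompt='Write a haiku about CUDA.',
        batch_size=512,
        n_gpu_layers=22,
        main_gpu=0,
        cont_batching=True,
        flash_attn=True,
    ),
)

print(response.choices[0].message.content)
```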
55 changes: 21 additions & 34 deletions llama/llama_cli.py
@@ -8,19 +8,16 @@
from threading import Thread
from functools import partial

from numba import cuda
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download

from .formatter import get_tokenizer, get_special_tokens, format_messages
from .model import Model
from .options import Options, convert_options_to_bytes
from .util import is_cuda_available

if cuda.is_available():
try:
from ._llama_cli_cuda_12_5 import lib, ffi
except ImportError:
from ._llama_cli_cpu import lib, ffi
if is_cuda_available():
from ._llama_cli_cuda_12_5 import lib, ffi
else:
from ._llama_cli_cpu import lib, ffi

@@ -29,7 +26,7 @@
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)


def _llama_yield_token_func(chunk_bytes: bytes, queue=None, callback=None, metadata=None):
def _llama_yield_token_func(chunk_bytes: bytes, queue: Queue, metadata: dict):
stop_on_special_token = metadata['stop_on_special_token']
special_tokens = metadata['special_tokens']

@@ -83,13 +80,13 @@ def _llama_yield_token_func(chunk_bytes: bytes, queue=None, callback=None, metadata=None):
metadata['buffer'] = ''


def _llama_should_stop_func(queue=None, callback=None, metadata=None) -> int:
def _llama_should_stop_func(queue: Queue, metadata: dict) -> int:
return 1 if metadata['should_stop'] else 0


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
_llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, callback=callback, metadata=metadata))
_llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, callback=callback, metadata=metadata))
def _llama_cli_main(argc, argv, queue: Queue, metadata: dict):
_llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, metadata=metadata))
_llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, metadata=metadata))

_llama_yield_token_address = ctypes.cast(_llama_yield_token, ctypes.c_void_p).value
_llama_should_stop_address = ctypes.cast(_llama_should_stop, ctypes.c_void_p).value
@@ -98,22 +95,15 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
cffi__llama_should_stop_callback = ffi.cast('int (*_llama_should_stop_t)(void)', _llama_should_stop_address)

r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback, 1)

if r != 0:
queue.put(None)
return

if queue is not None:
queue.put(None)
elif callback is not None:
callback(None)
# assert r == 0
queue.put(None)


def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
def llama_generate(options: Options) -> Iterator[str]:
tokenizer: AutoTokenizer
creator_hf_repo: str
prompt: str
queue: Queue | None
queue: Queue

assert options.model and isinstance(options.model, Model)

@@ -130,10 +120,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
if isinstance(options.prompt, list):
options.prompt = format_messages(tokenizer, options.prompt)

if callback:
queue = None
else:
queue = Queue()
queue = Queue()

metadata: dict = {
'prev_chunk_bytes': b'',
@@ -152,13 +139,10 @@
argv = [ffi.new('char[]', n) for n in argv]
argc = len(argv)

if callback:
_llama_cli_main(argc, argv, queue, callback, metadata)
yield ''
else:
t = Thread(target=_llama_cli_main, args=(argc, argv, queue, callback, metadata))
t.start()
t = Thread(target=_llama_cli_main, args=(argc, argv, queue, metadata))
t.start()

try:
while True:
chunk = queue.get()
queue.task_done()
@@ -167,6 +151,9 @@
break

yield chunk
except GeneratorExit:
        # signal the generation thread to stop
metadata['should_stop'] = True

queue.join()
t.join()
queue.join()
t.join()
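The `except GeneratorExit` branch in `llama_generate` is what lets a caller abandon generation early: closing the generator flips `metadata['should_stop']`, the C side observes it via `_llama_should_stop_func`, and the worker thread is joined. A usage sketch of stopping generation early; the `Model(...)` keyword names and values are placeholders inferred from the `creator_hf_repo` annotation and `hf_hub_download` import in this file, not confirmed by the diff.

```python
from llama import llama_generate, Model, Options  # exports per the old top-level import in openai.py

# Placeholder model coordinates; the Model keyword names below are assumptions.
model = Model(
    creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
    hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

options = Options(
    model=model,
    prompt=[{'role': 'user', 'content': 'Count to one hundred.'}],  # a messages list is formatted via format_messages
    predict=512,
)

for i, chunk in enumerate(llama_generate(options)):
    print(chunk, end='', flush=True)

    if i >= 32:
        # Breaking out closes the generator: GeneratorExit is raised inside
        # llama_generate, metadata['should_stop'] is set, and the thread is joined.
        break
```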
34 changes: 31 additions & 3 deletions llama/openai.py
@@ -1,10 +1,15 @@
import json
import asyncio
from pprint import pprint
from typing import AsyncIterator

from aiohttp import web

from llama import llama_generate, get_config, Model, Options, AutoConfig
from .formatter import get_config, AutoConfig
from .llama_cli import llama_generate
from .model import Model
from .options import Options
from .util import is_cuda_available


async def generate_response(options: Options) -> AsyncIterator[str]:
@@ -15,7 +20,10 @@ async def generate_response(options: Options) -> AsyncIterator[str]:

async def chat_completions(request):
data = await request.json()
messages = data['messages']
print('data:')
pprint(data)
prompt = data.get('prompt')
messages = data.get('messages')
model = data['model']
frequency_penalty = data.get('frequency_penalty')
logit_bias = data.get('logit_bias')
@@ -37,6 +45,16 @@ async def chat_completions(request):
parallel_tool_calls = data.get('parallel_tool_calls', True)
user = data.get('user')

# llama-cpp-cffi
batch_size = data.get('batch_size')
flash_attn = data.get('flash_attn')
cont_batching = data.get('cont_batching')
gpu_layers = data.get('gpu_layers')
gpu_layers_draft = data.get('gpu_layers_draft')
split_mode = data.get('split_mode')
tensor_split = data.get('tensor_split')
main_gpu = data.get('main_gpu')

assert frequency_penalty is None
assert logit_bias is None
assert logprobs == False
@@ -57,13 +75,23 @@
options = Options(
seed=seed,
ctx_size=ctx_size,
batch_size=batch_size,
predict=max_tokens,
prompt=messages,
prompt=prompt or messages,
top_p=top_p,
model=model,
stop=stop,
)

if is_cuda_available():
options.flash_attn = flash_attn
options.cont_batching = cont_batching
options.gpu_layers = gpu_layers
options.gpu_layers_draft = gpu_layers_draft
options.split_mode = split_mode
options.tensor_split = tensor_split
options.main_gpu = main_gpu

if stream:
response = web.StreamResponse()
response.headers['Content-Type'] = 'text/event-stream'
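Both `llama_cli.py` and `openai.py` now import `is_cuda_available` from the new `util` module, but the `util.py` hunk itself is not among the files shown above. A plausible sketch of what it might contain, assuming it simply wraps the numba `cuda.is_available()` check that this commit removes from `llama_cli.py`; the real module may detect CUDA differently.

```python
# llama/util.py -- hypothetical reconstruction, not taken from this diff.

def is_cuda_available() -> bool:
    # Assumed to delegate to numba, mirroring the inline check removed from llama_cli.py.
    try:
        from numba import cuda
    except ImportError:
        return False

    return bool(cuda.is_available())
```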