diff --git a/tests/test_vlm.py b/tests/test_vlm.py
index 25d81d56..17400bd4 100644
--- a/tests/test_vlm.py
+++ b/tests/test_vlm.py
@@ -5,6 +5,7 @@
 from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler
 from tests.utils import download_model
 from nexa.gguf.lib_utils import is_gpu_available
+import tempfile
 
 def image_to_base64_data_uri(file_path):
     """
@@ -15,53 +16,51 @@ def image_to_base64_data_uri(file_path):
         base64_data = base64.b64encode(img_file.read()).decode("utf-8")
     return f"data:image/png;base64,{base64_data}"
 
-model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf"
-mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf"
-# Download paths
-output_dir = os.getcwd()
-model_path = download_model(model_url, output_dir)
-mmproj_path = download_model(mmproj_url, output_dir)
-print("Model downloaded to:", model_path)
-print("MMProj downloaded to:", mmproj_path)
+def test_image_generation():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir = os.path.dirname(os.path.abspath(__file__))
+        model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf"
+        mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf"
 
-chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path)
+        model_path = download_model(model_url, temp_dir)
+        mmproj_path = download_model(mmproj_url, temp_dir)
+        chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path)
 
 
-def test_image_generation():
-    llm = llama.Llama(
-        model_path=model_path,
-        chat_handler=chat_handler,
-        n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
-        n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
-        verbose=False,
-    )
-    output = llm.create_chat_completion(
-        messages=[
-            {
-                "role": "system",
-                "content": "You are an assistant who perfectly describes images.",
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What's in this image?"},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+        llm = llama.Llama(
+            model_path=model_path,
+            chat_handler=chat_handler,
+            n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
+            n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
+            verbose=False,
+        )
+        output = llm.create_chat_completion(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an assistant who perfectly describes images.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                            },
                         },
-                    },
-                ],
-            },
-        ],
-        stream=True,
-    )
-    for chunk in output:
-        delta = chunk["choices"][0]["delta"]
-        if "role" in delta:
-            print(delta["role"], end=": ")
-        elif "content" in delta:
-            print(delta["content"], end="")
+                    ],
+                },
+            ],
+            stream=True,
+        )
+        for chunk in output:
+            delta = chunk["choices"][0]["delta"]
+            if "role" in delta:
+                print(delta["role"], end=": ")
+            elif "content" in delta:
+                print(delta["content"], end="")
 
 
 # if __name__ == "__main__":