diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0dc0268d4..3db8f68cc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -43,12 +43,6 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
 
-      - name: Run Gemini Tests
-        if: matrix.python-version != '3.9'
-        run: poetry run pytest tests/llm/test_gemini
-        env:
-          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
-
       - name: Generate coverage report
         if: matrix.python-version == '3.11'
         run: |
diff --git a/docs/concepts/multimodal.md b/docs/concepts/multimodal.md
new file mode 100644
index 000000000..19f409999
--- /dev/null
+++ b/docs/concepts/multimodal.md
@@ -0,0 +1,39 @@
+# Multimodal
+
+Instructor supports multimodal interactions by providing helper classes that are automatically converted to the correct format for different providers, allowing you to work with both text and images in your prompts and responses. This functionality is implemented in the `multimodal.py` module and provides a seamless way to handle images alongside text for various AI models.
+
+## `Image`
+
+The core of multimodal support in Instructor is the `Image` class. This class represents an image that can be loaded from a URL or file path. It provides methods to create `Image` instances and convert them to formats compatible with different AI providers.
+
+It's important to note that Anthropic and OpenAI have different formats for handling images in their API requests. The `Image` class in Instructor abstracts away these differences, allowing you to work with a unified interface.
+
+### Usage
+
+You can create an `Image` instance from a URL or file path using the `from_url` or `from_path` methods. Images loaded with `from_path` are base64-encoded and embedded in the request, while URLs are either passed through directly or downloaded and encoded, depending on what the target provider expects.
+
+```python
+import instructor
+import openai
+
+image1 = instructor.Image.from_url("https://example.com/image.jpg")
+image2 = instructor.Image.from_path("path/to/image.jpg")
+
+client = instructor.from_openai(openai.OpenAI())
+
+response = client.chat.completions.create(
+    model="gpt-4o-mini",
+    response_model=ImageAnalyzer,
+    messages=[
+        {"role": "user", "content": [
+            "What is in these two images?",
+            image1,
+            image2
+        ]}
+    ]
+)
+```
+
+The `Image` class takes care of the necessary conversions and formatting, ensuring that your code remains clean and provider-agnostic. This flexibility is particularly valuable when you're experimenting with different models or when you need to switch providers based on specific project requirements.
+
+By leveraging Instructor's multimodal capabilities, you can focus on building your application logic without worrying about the intricacies of each provider's image handling format. This not only saves development time but also makes your code more maintainable and adaptable to future changes in AI provider APIs.
\ No newline at end of file
diff --git a/instructor/__init__.py b/instructor/__init__.py
index f090976fd..43d3a3d58 100644
--- a/instructor/__init__.py
+++ b/instructor/__init__.py
@@ -3,6 +3,7 @@
 from .mode import Mode
 from .process_response import handle_response_model
 from .distil import FinetuneFormat, Instructions
+from .multimodal import Image
 from .dsl import (
     CitationMixin,
     Maybe,
@@ -24,6 +25,7 @@
 
 __all__ = [
     "Instructor",
+    "Image",
     "from_openai",
     "from_litellm",
     "AsyncInstructor",
diff --git a/instructor/multimodal.py b/instructor/multimodal.py
new file mode 100644
index 000000000..6382e1d9d
--- /dev/null
+++ b/instructor/multimodal.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+import base64
+from typing import Any, Union
+from pathlib import Path
+from pydantic import BaseModel, Field
+from .mode import Mode
+
+
+class Image(BaseModel):
+    """Represents an image that can be loaded from a URL or file path."""
+
+    source: Union[str, Path] = Field(..., description="URL or file path of the image")  # noqa: UP007
+    media_type: str = Field(..., description="MIME type of the image")
+    data: Union[str, None] = Field(  # noqa: UP007
+        None, description="Base64 encoded image data", repr=False
+    )
+
+    @classmethod
+    def from_url(cls, url: str) -> Image:
+        """Create an Image instance from a URL."""
+        return cls(source=url, media_type="image/jpeg", data=None)
+
+    @classmethod
+    def from_path(cls, path: str | Path) -> Image:
+        """Create an Image instance from a file path."""
+        path = Path(path)
+        if not path.is_file():
+            raise FileNotFoundError(f"Image file not found: {path}")
+
+        suffix = path.suffix.lower().lstrip(".")
+        if suffix not in ["jpeg", "jpg", "png"]:
+            raise ValueError(f"Unsupported image format: {suffix}")
+
+        if path.stat().st_size == 0:
+            raise ValueError("Image file is empty")
+
+        media_type = "image/jpeg" if suffix in ["jpeg", "jpg"] else "image/png"
+        data = base64.b64encode(path.read_bytes()).decode("utf-8")
+        return cls(source=str(path), media_type=media_type, data=data)
+
+    def to_anthropic(self) -> dict[str, Any]:
+        """Convert the Image instance to Anthropic's API format."""
+        if isinstance(self.source, str) and self.source.startswith(
+            ("http://", "https://")
+        ):
+            import requests
+
+            response = requests.get(self.source)
+            response.raise_for_status()
+            self.data = base64.b64encode(response.content).decode("utf-8")
+            self.media_type = response.headers.get("Content-Type", "image/jpeg")
+
+        return {
+            "type": "image",
+            "source": {
+                "type": "base64",
+                "media_type": self.media_type,
+                "data": self.data,
+            },
+        }
+
+    def to_openai(self) -> dict[str, Any]:
+        """Convert the Image instance to OpenAI's Vision API format."""
+        if isinstance(self.source, str) and self.source.startswith(
+            ("http://", "https://")
+        ):
+            return {"type": "image_url", "image_url": {"url": self.source}}
+        elif self.data:
+            return {
+                "type": "image_url",
+                "image_url": {"url": f"data:{self.media_type};base64,{self.data}"},
+            }
+        else:
+            raise ValueError("Image data is missing for base64 encoding.")
+
+
+def convert_contents(
+    contents: Union[list[Union[str, Image]], str, Image],  # noqa: UP007
+    mode: Mode,
+) -> Union[str, list[dict[str, Any]]]:  # noqa: UP007
+    """Convert content items to the appropriate format based on the specified mode."""
+    if isinstance(contents, str):
+        return contents
+    if isinstance(contents, Image):
+        contents = [contents]
+
+    converted_contents: list[dict[str, Union[str, Image]]] = []  # noqa: UP007
+    for content in contents:
+        if isinstance(content, str):
+            converted_contents.append({"type": "text", "text": content})
+        elif isinstance(content, Image):
+            if mode in {Mode.ANTHROPIC_JSON, Mode.ANTHROPIC_TOOLS}:
+                converted_contents.append(content.to_anthropic())
+            elif mode in {Mode.GEMINI_JSON, Mode.GEMINI_TOOLS}:
+                raise NotImplementedError("Gemini is not supported yet")
+            else:
+                converted_contents.append(content.to_openai())
+        else:
+            raise ValueError(f"Unsupported content type: {type(content)}")
+    return converted_contents
+
+
+def convert_messages(
+    messages: list[dict[str, Union[str, list[Union[str, Image]]]]],  # noqa: UP007
+    mode: Mode,
+) -> list[dict[str, Any]]:
+    """Convert messages to the appropriate format based on the specified mode."""
+    converted_messages = []
+    for message in messages:
+        role = message["role"]
+        content = message["content"]
+        if isinstance(content, str):
+            converted_messages.append({"role": role, "content": content})  # type: ignore
+        else:
+            converted_content = convert_contents(content, mode)
+            converted_messages.append({"role": role, "content": converted_content})  # type: ignore
+    return converted_messages  # type: ignore
diff --git a/instructor/process_response.py b/instructor/process_response.py
index 6d2169b4f..89186fd4f 100644
--- a/instructor/process_response.py
+++ b/instructor/process_response.py
@@ -21,7 +21,7 @@
 from instructor.dsl.simple_type import AdapterBase, ModelAdapter, is_simple_type
 from instructor.function_calls import OpenAISchema, openai_schema
 from instructor.utils import merge_consecutive_messages
-
+from instructor.multimodal import convert_messages
 logger = logging.getLogger("instructor")
 
 
@@ -664,6 +664,9 @@ def handle_response_model(
     else:
         raise ValueError(f"Invalid patch mode: {mode}")
 
+    if "messages" in new_kwargs:
+        new_kwargs["messages"] = convert_messages(new_kwargs["messages"], mode)
+
     logger.debug(
         f"Instructor Request: {mode.value=}, {response_model=}, {new_kwargs=}",
         extra={
diff --git a/mkdocs.yml b/mkdocs.yml
index 59cbe2fef..1eea63e7b 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -207,7 +207,7 @@
       - Advanced Usage of Mistral Large: 'hub/mistral.md'
       - Generating Knowledge Graphs with AI: 'hub/knowledge_graph.md'
       - Extracting Relevant Clips from YouTube Videos: "hub/youtube_clips.md"
-      - Tutorial: Building Knowledge Graphs: 'tutorials/5-knowledge-graphs.ipynb'
+      - Building Knowledge Graphs: 'tutorials/5-knowledge-graphs.ipynb'
   - CLI Reference:
       - "CLI Reference": "cli/index.md"
       - "Finetuning GPT-3.5": "cli/finetune.md"
diff --git a/tests/llm/test_openai/test_multimodal.py b/tests/llm/test_openai/test_multimodal.py
new file mode 100644
index 000000000..78f753778
--- /dev/null
+++ b/tests/llm/test_openai/test_multimodal.py
@@ -0,0 +1,44 @@
+import pytest
+from instructor.multimodal import Image
+import instructor
+from pydantic import Field, BaseModel
+from itertools import product
+from .util import models, modes
+
+
+@pytest.mark.parametrize("model, mode", product(models, modes))
+def test_multimodal_image_description(model, mode, client):
+    client = instructor.patch(client, mode=mode)
+
+    class ImageDescription(BaseModel):
+        objects: list[str] = Field(..., description="The objects in the image")
+        scene: str = Field(..., description="The scene of the image")
+        colors: list[str] = Field(..., description="The colors in the image")
+
+    response = client.chat.completions.create(
+        model=model,  # Ensure this is a vision-capable model
+        response_model=ImageDescription,
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a helpful assistant that can describe images",
+            },
+            {
+                "role": "user",
+                "content": [
+                    "What is this?",
+                    Image.from_url(
+                        "https://pbs.twimg.com/profile_images/1816950591857233920/ZBxrWCbX_400x400.jpg"
+                    ),
+                ],
+            },
+        ],
+    )
+
+    # Assertions to validate the response
+    assert isinstance(response, ImageDescription)
+    assert len(response.objects) > 0
+    assert response.scene != ""
+    assert len(response.colors) > 0
+
+    # Additional assertions can be added based on expected content of the sample image
diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py
new file mode 100644
index 000000000..c0bdd9712
--- /dev/null
+++ b/tests/test_multimodal.py
@@ -0,0 +1,182 @@
+import pytest
+from pathlib import Path
+from instructor.multimodal import Image, convert_contents, convert_messages
+from instructor.mode import Mode
+
+
+def test_image_from_url():
+    url = "https://example.com/image.jpg"
+    image = Image.from_url(url)
+    assert image.source == url
+    assert image.media_type == "image/jpeg"
+    assert image.data is None
+
+
+def test_image_from_path(tmp_path: Path):
+    image_path = tmp_path / "test_image.jpg"
+    image_path.write_bytes(b"fake image data")
+
+    image = Image.from_path(image_path)
+    assert image.source == str(image_path)
+    assert image.media_type == "image/jpeg"
+    assert image.data is not None
+
+
+@pytest.mark.skip(reason="Needs to download image")
+def test_image_to_anthropic():
+    image = Image(
+        source="http://example.com/image.jpg", media_type="image/jpeg", data=None
+    )
+    anthropic_format = image.to_anthropic()
+    assert anthropic_format["type"] == "image"
+    assert anthropic_format["source"]["type"] == "base64"
+    assert anthropic_format["source"]["media_type"] == "image/jpeg"
+
+
+def test_image_to_openai():
+    image = Image(
+        source="http://example.com/image.jpg", media_type="image/jpeg", data=None
+    )
+    openai_format = image.to_openai()
+    assert openai_format["type"] == "image_url"
+    assert openai_format["image_url"]["url"] == "http://example.com/image.jpg"
+
+
+def test_convert_contents():
+    contents = ["Hello", Image.from_url("http://example.com/image.jpg")]
+    converted = list(convert_contents(contents, Mode.TOOLS))
+    assert len(converted) == 2
+    assert converted[0] == {"type": "text", "text": "Hello"}
+    assert converted[1]["type"] == "image_url"
+    assert converted[1]["image_url"]["url"] == "http://example.com/image.jpg"
+
+
+def test_convert_messages():
+    messages = [
+        {
+            "role": "user",
+            "content": ["Hello", Image.from_url("http://example.com/image.jpg")],
+        },
+        {"role": "assistant", "content": "Hi there!"},
+    ]
+    converted = list(convert_messages(messages, Mode.TOOLS))
+    assert len(converted) == 2
+    assert converted[0]["role"] == "user"
+    assert len(converted[0]["content"]) == 2
+    assert converted[0]["content"][0] == {"type": "text", "text": "Hello"}
+    assert converted[0]["content"][1]["type"] == "image_url"
+    assert converted[1]["role"] == "assistant"
+    assert converted[1]["content"] == "Hi there!"
+
+
+def test_convert_messages_anthropic():
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                "Hello",
+                Image(source="base64data", media_type="image/jpeg", data="fakedata"),
+            ],
+        }
+    ]
+    converted = list(convert_messages(messages, Mode.ANTHROPIC_JSON))
+    assert len(converted) == 1
+    assert converted == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello"},
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": "fakedata",
+                    },
+                },
+            ],
+        }
+    ]
+
+
+def test_convert_messages_gemini():
+    messages = [
+        {
+            "role": "user",
+            "content": ["Hello", Image.from_url("http://example.com/image.jpg")],
+        }
+    ]
+    with pytest.raises(NotImplementedError):
+        list(convert_messages(messages, Mode.GEMINI_JSON))
+
+
+# Additional tests
+
+
+def test_image_from_path_unsupported_format(tmp_path: Path):
+    image_path = tmp_path / "test_image.gif"
+    image_path.write_bytes(b"fake gif data")
+
+    with pytest.raises(ValueError, match="Unsupported image format: gif"):
+        Image.from_path(image_path)
+
+
+def test_image_from_path_empty_file(tmp_path: Path):
+    image_path = tmp_path / "empty_image.jpg"
+    image_path.touch()
+
+    with pytest.raises(ValueError, match="Image file is empty"):
+        Image.from_path(image_path)
+
+
+def test_image_to_openai_base64():
+    image = Image(
+        source="local_file.jpg", media_type="image/jpeg", data="base64encodeddata"
+    )
+    openai_format = image.to_openai()
+    assert openai_format["type"] == "image_url"
+    assert openai_format["image_url"]["url"].startswith("data:image/jpeg;base64,")
+
+
+def test_convert_contents_single_string():
+    content = "Hello, world!"
+    converted = convert_contents(content, Mode.TOOLS)
+    assert converted == "Hello, world!"
+
+
+def test_convert_contents_single_image():
+    image = Image.from_url("http://example.com/image.jpg")
+    converted = list(convert_contents(image, Mode.TOOLS))
+    assert len(converted) == 1
+    assert converted == [
+        {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}}
+    ]
+
+
+def test_convert_messages_mixed_content():
+    messages = [
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hi there!"},
+        {"role": "user", "content": Image.from_url("http://example.com/image.jpg")},
+    ]
+    converted = list(convert_messages(messages, Mode.TOOLS))
+    assert len(converted) == 3
+    assert converted[0]["content"] == "Hello"
+    assert converted[1]["content"] == "Hi there!"
+    assert converted[2]["content"][0]["type"] == "image_url"
+
+
+def test_convert_contents_invalid_type():
+    with pytest.raises(ValueError, match="Unsupported content type"):
+        list(convert_contents([1, 2, 3], Mode.TOOLS))
+
+
+def test_convert_contents_anthropic_mode():
+    contents = [
+        "Hello",
+        Image(source="base64data", media_type="image/png", data="fakedata"),
+    ]
+    converted = list(convert_contents(contents, Mode.ANTHROPIC_JSON))
+    assert converted[1]["type"] == "image"
+    assert converted[1]["source"]["type"] == "base64"
+    assert converted[1]["source"]["media_type"] == "image/png"
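For reviewers, here is a short usage sketch of the conversion path this patch adds. It is not part of the diff itself; it uses only names defined in `instructor/multimodal.py` above, and the image URL is a placeholder.

```python
# Illustrative sketch (not part of the patch): how convert_messages reshapes a
# mixed text + image message for different provider modes. The URL is a placeholder.
from instructor.mode import Mode
from instructor.multimodal import Image, convert_messages

messages = [
    {
        "role": "user",
        "content": [
            "Describe this image",
            Image.from_url("https://example.com/image.jpg"),
        ],
    }
]

# OpenAI-style modes (e.g. Mode.TOOLS) pass the URL through as an image_url part:
# [{"role": "user", "content": [
#     {"type": "text", "text": "Describe this image"},
#     {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}]}]
openai_messages = convert_messages(messages, Mode.TOOLS)

# Anthropic modes fetch http(s) sources and inline them as base64 source blocks
# (Image.to_anthropic issues a requests.get), so this line performs a network call.
anthropic_messages = convert_messages(messages, Mode.ANTHROPIC_JSON)
```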