Showing 8 changed files with 389 additions and 8 deletions.
@@ -0,0 +1,39 @@
# Multimodal

Instructor supports multimodal interactions by providing helper classes that are automatically converted to the correct format for different providers, allowing you to work with both text and images in your prompts and responses. This functionality is implemented in the `multimodal.py` module and provides a seamless way to handle images alongside text for various AI models.

## `Image`

The core of multimodal support in Instructor is the `Image` class. This class represents an image that can be loaded from a URL or file path. It provides methods to create `Image` instances and convert them to formats compatible with different AI providers.

It's important to note that Anthropic and OpenAI have different formats for handling images in their API requests. The `Image` class in Instructor abstracts away these differences, allowing you to work with a unified interface.
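Concretely, the two providers expect differently shaped content parts. The dictionaries below mirror the shapes produced by `to_openai` and `to_anthropic` in `multimodal.py` (the base64 payload is elided):

```python
# OpenAI-style content part (URL form):
{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}

# Anthropic-style content part (base64 form):
{
    "type": "image",
    "source": {"type": "base64", "media_type": "image/jpeg", "data": "<base64>"},
}
```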

### Usage

You can create an `Image` instance from a URL or file path using the `from_url` or `from_path` methods. The `Image` class will automatically convert the image to a base64-encoded string when the provider requires it and include it in the API request.

```python
import instructor
import openai
from pydantic import BaseModel


# Illustrative response model; define whatever fields you want extracted.
class ImageAnalyzer(BaseModel):
    description: str


image1 = instructor.Image.from_url("https://example.com/image.jpg")
image2 = instructor.Image.from_path("path/to/image.jpg")

client = instructor.from_openai(openai.OpenAI())

response = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=ImageAnalyzer,
    messages=[
        {
            "role": "user",
            "content": ["What is in these two images?", image1, image2],
        }
    ],
)
```

The `Image` class takes care of the necessary conversions and formatting, ensuring that your code remains clean and provider-agnostic. This flexibility is particularly valuable when you're experimenting with different models or when you need to switch providers based on specific project requirements.

By leveraging Instructor's multimodal capabilities, you can focus on building your application logic without worrying about the intricacies of each provider's image-handling format. This not only saves development time but also makes your code more maintainable and adaptable to future changes in AI provider APIs.
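For example, the same `Image` instances can be reused against Anthropic. A minimal sketch, assuming the `anthropic` package is installed, `instructor.from_anthropic` as the Anthropic entry point, and the `ImageAnalyzer` model from above:

```python
import instructor
import anthropic

client = instructor.from_anthropic(anthropic.Anthropic())

response = client.chat.completions.create(
    model="claude-3-5-sonnet-20240620",  # assumed vision-capable Claude model
    max_tokens=1024,  # Anthropic requires an explicit max_tokens
    response_model=ImageAnalyzer,
    messages=[
        {
            "role": "user",
            "content": ["What is in these two images?", image1, image2],
        }
    ],
)
```

Because the message content is converted per provider at request time, nothing about `image1` or `image2` has to change.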
@@ -0,0 +1,117 @@
from __future__ import annotations
import base64
from typing import Any, Union
from pathlib import Path
from pydantic import BaseModel, Field
from .mode import Mode


class Image(BaseModel):
    """Represents an image that can be loaded from a URL or file path."""

    source: Union[str, Path] = Field(..., description="URL or file path of the image")  # noqa: UP007
    media_type: str = Field(..., description="MIME type of the image")
    data: Union[str, None] = Field(  # noqa: UP007
        None, description="Base64 encoded image data", repr=False
    )

    @classmethod
    def from_url(cls, url: str) -> Image:
        """Create an Image instance from a URL."""
        return cls(source=url, media_type="image/jpeg", data=None)

    @classmethod
    def from_path(cls, path: str | Path) -> Image:
        """Create an Image instance from a file path."""
        path = Path(path)
        if not path.is_file():
            raise FileNotFoundError(f"Image file not found: {path}")

        suffix = path.suffix.lower().lstrip(".")
        if suffix not in ["jpeg", "jpg", "png"]:
            raise ValueError(f"Unsupported image format: {suffix}")

        if path.stat().st_size == 0:
            raise ValueError("Image file is empty")

        media_type = "image/jpeg" if suffix in ["jpeg", "jpg"] else "image/png"
        data = base64.b64encode(path.read_bytes()).decode("utf-8")
        return cls(source=str(path), media_type=media_type, data=data)

    def to_anthropic(self) -> dict[str, Any]:
        """Convert the Image instance to Anthropic's API format."""
        if isinstance(self.source, str) and self.source.startswith(
            ("http://", "https://")
        ):
            import requests

            response = requests.get(self.source)
            response.raise_for_status()
            self.data = base64.b64encode(response.content).decode("utf-8")
            self.media_type = response.headers.get("Content-Type", "image/jpeg")

        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": self.media_type,
                "data": self.data,
            },
        }

    def to_openai(self) -> dict[str, Any]:
        """Convert the Image instance to OpenAI's Vision API format."""
        if isinstance(self.source, str) and self.source.startswith(
            ("http://", "https://")
        ):
            return {"type": "image_url", "image_url": {"url": self.source}}
        elif self.data:
            return {
                "type": "image_url",
                "image_url": {"url": f"data:{self.media_type};base64,{self.data}"},
            }
        else:
            raise ValueError("Image data is missing for base64 encoding.")


def convert_contents(
    contents: Union[list[Union[str, Image]], str, Image],  # noqa: UP007
    mode: Mode,
) -> Union[str, list[dict[str, Any]]]:  # noqa: UP007
    """Convert content items to the appropriate format based on the specified mode."""
    if isinstance(contents, str):
        return contents
    if isinstance(contents, Image):
        contents = [contents]

    converted_contents: list[dict[str, Any]] = []
    for content in contents:
        if isinstance(content, str):
            converted_contents.append({"type": "text", "text": content})
        elif isinstance(content, Image):
            if mode in {Mode.ANTHROPIC_JSON, Mode.ANTHROPIC_TOOLS}:
                converted_contents.append(content.to_anthropic())
            elif mode in {Mode.GEMINI_JSON, Mode.GEMINI_TOOLS}:
                raise NotImplementedError("Gemini is not supported yet")
            else:
                converted_contents.append(content.to_openai())
        else:
            raise ValueError(f"Unsupported content type: {type(content)}")
    return converted_contents


def convert_messages(
    messages: list[dict[str, Union[str, list[Union[str, Image]]]]],  # noqa: UP007
    mode: Mode,
) -> list[dict[str, Any]]:
    """Convert messages to the appropriate format based on the specified mode."""
    converted_messages = []
    for message in messages:
        role = message["role"]
        content = message["content"]
        if isinstance(content, str):
            converted_messages.append({"role": role, "content": content})
        else:
            converted_content = convert_contents(content, mode)
            converted_messages.append({"role": role, "content": converted_content})
    return converted_messages
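To see the conversion end to end, here is a minimal sketch of driving `convert_messages` directly (hypothetical `photo.png` path; `Mode.TOOLS` is assumed as a representative non-Anthropic mode from the `Mode` enum imported above):

```python
from instructor.mode import Mode
from instructor.multimodal import Image, convert_messages

messages = [
    {
        "role": "user",
        "content": ["Describe this image", Image.from_path("photo.png")],
    }
]

# Non-Anthropic modes route through Image.to_openai, producing an
# {"type": "image_url", ...} part; Anthropic modes produce a base64
# {"type": "image", "source": {...}} part via Image.to_anthropic.
openai_style = convert_messages(messages, Mode.TOOLS)
anthropic_style = convert_messages(messages, Mode.ANTHROPIC_TOOLS)
```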
@@ -0,0 +1,44 @@
import pytest
from instructor.multimodal import Image
import instructor
from pydantic import Field, BaseModel
from itertools import product
from .util import models, modes


@pytest.mark.parametrize("model, mode", product(models, modes))
def test_multimodal_image_description(model, mode, client):
    client = instructor.patch(client, mode=mode)

    class ImageDescription(BaseModel):
        objects: list[str] = Field(..., description="The objects in the image")
        scene: str = Field(..., description="The scene of the image")
        colors: list[str] = Field(..., description="The colors in the image")

    response = client.chat.completions.create(
        model=model,  # Ensure this is a vision-capable model
        response_model=ImageDescription,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant that can describe images",
            },
            {
                "role": "user",
                "content": [
                    "What is this?",
                    Image.from_url(
                        "https://pbs.twimg.com/profile_images/1816950591857233920/ZBxrWCbX_400x400.jpg"
                    ),
                ],
            },
        ],
    )

    # Assertions to validate the response
    assert isinstance(response, ImageDescription)
    assert len(response.objects) > 0
    assert response.scene != ""
    assert len(response.colors) > 0

    # Additional assertions can be added based on expected content of the sample image
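The validation branches in `Image.from_path` (missing file, unsupported suffix, empty file) also lend themselves to small unit tests. A sketch, assuming pytest's built-in `tmp_path` fixture and hypothetical file names:

```python
import pytest
from instructor.multimodal import Image


def test_from_path_missing_file(tmp_path):
    # is_file() is False for a nonexistent path, so from_path raises.
    with pytest.raises(FileNotFoundError):
        Image.from_path(tmp_path / "missing.png")


def test_from_path_unsupported_format(tmp_path):
    # Only jpeg/jpg/png suffixes are accepted.
    gif = tmp_path / "animation.gif"
    gif.write_bytes(b"GIF89a")
    with pytest.raises(ValueError, match="Unsupported image format"):
        Image.from_path(gif)


def test_from_path_empty_file(tmp_path):
    # A zero-byte file is rejected before base64 encoding.
    empty = tmp_path / "empty.png"
    empty.touch()
    with pytest.raises(ValueError, match="Image file is empty"):
        Image.from_path(empty)
```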