diff --git a/docs/examples/index.md b/docs/examples/index.md
index dd31c2b65..1e2cb3c9d 100644
--- a/docs/examples/index.md
+++ b/docs/examples/index.md
@@ -23,9 +23,10 @@ Each cookbook provides step-by-step guidance and code snippets, making it easy t
13. [Generating advertising copy from images](image_to_ad_copy.md)
14. [Using local models from Ollama](ollama.md)
15. [Storing responses in a database](sqlmodel.md)
-17. [Segmenting documents using LLMs](document_segmentation.md)
-18. [Saving API costs with OpenAI's Batch API](batch_job_oai.md)
-19. [Using groqcloud api](groq.md)
-20. [Using Mistral/Mixtral](mistral.md)
+16. [Segmenting documents using LLMs](document_segmentation.md)
+17. [Saving API costs with OpenAI's Batch API](batch_job_oai.md)
+18. [Using the GroqCloud API](groq.md)
+19. [Using Mistral/Mixtral](mistral.md)
+20. [Working with Multi-Modal Data in Gemini](multi_modal_gemini.md)
Explore more!
diff --git a/docs/examples/multi_modal_gemini.md b/docs/examples/multi_modal_gemini.md
new file mode 100644
index 000000000..2bfbf5822
--- /dev/null
+++ b/docs/examples/multi_modal_gemini.md
@@ -0,0 +1,231 @@
+# Using Gemini with Multi-Modal Data
+
+This tutorial shows how to use `instructor` with `google-generativeai` to handle multi-modal data. In this example, we'll demonstrate three ways of working with audio files.
+
+We'll be using this [recording](https://storage.googleapis.com/generativeai-downloads/data/State_of_the_Union_Address_30_January_1961.mp3) that's taken from the [Google Generative AI cookbook](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Audio.ipynb).
+
+## Normal Message
+
+The first way to work with audio files is to upload the entire audio file and pass it into the LLM as a normal message. This is the easiest way to get started and doesn't require any special setup.
+
+```python
+import instructor
+import google.generativeai as genai
+from pydantic import BaseModel
+
+
+client = instructor.from_gemini(
+ client=genai.GenerativeModel(
+ model_name="models/gemini-1.5-flash-latest",
+ ),
+ mode=instructor.Mode.GEMINI_JSON, # (1)!
+)
+
+mp3_file = genai.upload_file("./sample.mp3") #(2)!
+
+
+class Description(BaseModel):
+ description: str
+
+
+resp = client.create(
+ response_model=Description,
+ messages=[
+ {
+ "role": "user",
+ "content": "Summarize what's happening in this audio file and who the main speaker is",
+ },
+ {
+ "role": "user",
+ "content": mp3_file, # (3)!
+ },
+ ],
+)
+
+print(resp)
+#> description="The main speaker is President John F. Kennedy, and he's giving a
+#> State of the Union address to a joint session of Congress. He begins by
+#> acknowledging his fondness for the House of Representatives and his long
+#> history with it. He then goes on to discuss the state of the economy,
+#> highlighting the difficulties faced by Americans, such as unemployment and
+#> low farm incomes. He also touches on the Cold War and the international
+#> balance of payments. He speaks of the need to strengthen the US military,
+#> and he also discusses the importance of international cooperation and the
+#> need to address global issues like hunger and illiteracy. He ends by urging
+#> his audience to work together to face the challenges that lie ahead."
+```
+
+1. Make sure to set the mode to `GEMINI_JSON`; this is important because Tool Calling doesn't work with multi-modal inputs.
+2. Use `genai.upload_file` to upload your file. If you've already uploaded the file, you can get it by using `genai.get_file`
+3. Pass in the file object as any normal user message
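+
+If you've already uploaded the file before, you can reuse it instead of uploading it again. A minimal sketch using `genai.list_files`, assuming the file kept its default `sample.mp3` display name (if you know the file's resource name, `genai.get_file` works as well):
+
+```python
+import google.generativeai as genai
+
+# Reuse a previously uploaded file if one exists, otherwise upload it.
+existing = [f for f in genai.list_files() if f.display_name == "sample.mp3"]
+mp3_file = existing[0] if existing else genai.upload_file("./sample.mp3")
+```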
+
+## Inline Audio Segment
+
+!!! note "Maximum File Size"
+
+    When uploading audio as an inline segment, there is a maximum request payload size that the API accepts. If you exceed it, you'll see the error below.
+
+ ```
+ google.api_core.exceptions.InvalidArgument: 400 Request payload size exceeds the limit: 20971520 bytes. Please upload your files with the File API instead.`f = genai.upload_file(path); m.generate_content(['tell me about this file:', f])`
+ ```
+
+    For larger files (such as video), we recommend uploading them with `genai.upload_file` as shown in the example above.
+
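+Before choosing the inline approach, it can help to check the file size against this limit first. A rough sketch (the 20 MB figure comes from the error message above and applies to the whole request payload, so treat it as an upper bound):
+
+```python
+import os
+
+INLINE_LIMIT_BYTES = 20 * 1024 * 1024  # payload limit from the error message above
+
+use_inline = os.path.getsize("sample.mp3") < INLINE_LIMIT_BYTES  # otherwise, prefer genai.upload_file
+```
+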
+Secondly, we can pass an audio segment inline as part of a normal message, as shown below. This requires the `pydub` library to be installed.
+
+```python
+import instructor
+import google.generativeai as genai
+from pydantic import BaseModel
+from pydub import AudioSegment
+
+client = instructor.from_gemini(
+ client=genai.GenerativeModel(
+ model_name="models/gemini-1.5-flash-latest",
+ ),
+ mode=instructor.Mode.GEMINI_JSON, # (1)!
+)
+
+
+sound = AudioSegment.from_mp3("sample.mp3") # (2)!
+sound = sound[:60000]  # pydub slices in milliseconds, so this keeps the first 60 seconds
+
+
+class Transcription(BaseModel):
+ summary: str
+ exact_transcription: str
+
+
+resp = client.create(
+ response_model=Transcription,
+ messages=[
+ {
+ "role": "user",
+ "content": "Please transcribe this recording",
+ },
+ {
+ "role": "user",
+ "content": {
+ "mime_type": "audio/mp3",
+ "data": sound.export().read(), # (3)!
+ },
+ },
+ ],
+)
+
+print(resp)
+
+# > summary='President delivers a speech to a joint session of Congress,
+# > highlighting his history in the House of Representatives and thanking
+# > the members of Congress for their guidance.',
+# >
+# > exact_transcription="The President's State of the Union address to a
+# > joint session of the Congress from the rostrum of the House of
+# > Representatives, Washington DC, January 30th 1961. Mr. Speaker, Mr.
+# > Vice-President, members of the Congress, it is a pleasure to return
+# > from whence I came. You are among my oldest friends in Washington,
+# > and this house is my oldest home. It was here that I first took the
+# > oath of federal office. It was here for 14 years that I gained both
+# > knowledge and inspiration from members of both"
+
+```
+
+1. Make sure to set the mode to `GEMINI_JSON`; this is important because Tool Calling doesn't work with multi-modal inputs.
+2. Use `AudioSegment.from_mp3` to load your audio file.
+3. Pass the content as a dictionary with the correct `mime_type` and the raw audio bytes in the `data` field
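+
+If your source audio isn't an MP3, `pydub` can convert it on export (note that MP3 export requires `ffmpeg` to be installed). A rough sketch, assuming a hypothetical input file named `sample.wav`:
+
+```python
+from pydub import AudioSegment
+
+# Load a WAV file and export it as MP3 bytes for the inline payload.
+sound = AudioSegment.from_wav("sample.wav")
+mp3_bytes = sound.export(format="mp3").read()
+```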
+
+## Lists of Content
+
+We also support passing all of the content in as a single list, as per the documentation for `google-generativeai`. Here's how to do so with an audio snippet from the same recording.
+
+Note that the list can contain normal user messages as well as file objects. It's incredibly flexible.
+
+```python
+import instructor
+import google.generativeai as genai
+from pydantic import BaseModel
+
+
+client = instructor.from_gemini(
+ client=genai.GenerativeModel(
+ model_name="models/gemini-1.5-flash-latest",
+ ),
+ mode=instructor.Mode.GEMINI_JSON, # (1)!
+)
+
+mp3_file = genai.upload_file("./sample.mp3") # (2)!
+
+
+class Description(BaseModel):
+ description: str
+
+
+content = [
+ "Summarize what's happening in this audio file and who the main speaker is",
+ mp3_file, # (3)!
+]
+
+resp = client.create(
+ response_model=Description,
+ messages=[
+ {
+ "role": "user",
+ "content": content,
+ }
+ ],
+)
+
+print(resp)
+# > description='President John F. Kennedy delivers State of the Union Address to \
+# > Congress. He outlines national challenges: economic struggles, debt concerns, \
+# > communism threat, Cold War. Proposes solutions: increased military spending, \
+# > new economic programs, expanded foreign aid. Calls for active U.S. role in \
+# > international affairs. Emphasizes facing challenges, avoiding panic, and \
+# > working together for a better future.'
+
+```
+
+1. Make sure to set the mode to `GEMINI_JSON`; this is important because Tool Calling doesn't work with multi-modal inputs.
+2. Upload the file using `genai.upload_file` or get the file using `genai.get_file`
+3. Pass in the content as a list containing the normal user message and the file object.
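+
+Since the list can mix strings and file objects freely, you can also interleave several of each. A rough sketch (`second_mp3_file` here is a hypothetical second upload made with `genai.upload_file`):
+
+```python
+content = [
+    "Compare the following two recordings",
+    mp3_file,
+    "with this second recording",
+    second_mp3_file,  # hypothetical second upload
+]
+```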
diff --git a/docs/index.md b/docs/index.md
index e362d920c..f68a6a63b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -19,41 +19,41 @@ It stands out for its simplicity, transparency, and user-centric design, built o
-- :material-code-tags: __Simple API with Full Prompt Control__
+- :material-code-tags: **Simple API with Full Prompt Control**
- Instructor provides a straightforward API that gives you complete ownership and control over your prompts. This allows for fine-tuned customization and optimization of your LLM interactions.
+ Instructor provides a straightforward API that gives you complete ownership and control over your prompts. This allows for fine-tuned customization and optimization of your LLM interactions.
- [:octicons-arrow-right-16: Explore Concepts](./concepts/models.md)
+ [:octicons-arrow-right-16: Explore Concepts](./concepts/models.md)
-- :material-translate: __Multi-Language Support__
+- :material-translate: **Multi-Language Support**
- Simplify structured data extraction from LLMs with type hints and validation.
+ Simplify structured data extraction from LLMs with type hints and validation.
- [:simple-python: Python](https://python.useinstructor.com) · [:simple-typescript: TypeScript](https://js.useinstructor.com) · [:simple-ruby: Ruby](https://ruby.useinstructor.com) · [:simple-go: Go](https://go.useinstructor.com) · [:simple-elixir: Elixir](https://hex.pm/packages/instructor) · [:simple-rust: Rust](https://rust.useinstructor.com)
+ [:simple-python: Python](https://python.useinstructor.com) · [:simple-typescript: TypeScript](https://js.useinstructor.com) · [:simple-ruby: Ruby](https://ruby.useinstructor.com) · [:simple-go: Go](https://go.useinstructor.com) · [:simple-elixir: Elixir](https://hex.pm/packages/instructor) · [:simple-rust: Rust](https://rust.useinstructor.com)
-- :material-refresh: __Reasking and Validation__
+- :material-refresh: **Reasking and Validation**
- Automatically reask the model when validation fails, ensuring high-quality outputs. Leverage Pydantic's validation for robust error handling.
+ Automatically reask the model when validation fails, ensuring high-quality outputs. Leverage Pydantic's validation for robust error handling.
- [:octicons-arrow-right-16: Learn about Reasking](./concepts/reask_validation.md)
+ [:octicons-arrow-right-16: Learn about Reasking](./concepts/reask_validation.md)
-- :material-repeat-variant: __Streaming Support__
+- :material-repeat-variant: **Streaming Support**
- Stream partial results and iterables with ease, allowing for real-time processing and improved responsiveness in your applications.
+ Stream partial results and iterables with ease, allowing for real-time processing and improved responsiveness in your applications.
- [:octicons-arrow-right-16: Learn about Streaming](./concepts/partial.md)
+ [:octicons-arrow-right-16: Learn about Streaming](./concepts/partial.md)
-- :material-code-braces: __Powered by Type Hints__
+- :material-code-braces: **Powered by Type Hints**
- Leverage Pydantic for schema validation, prompting control, less code, and IDE integration.
-
- [:octicons-arrow-right-16: Learn more](https://docs.pydantic.dev/)
+ Leverage Pydantic for schema validation, prompting control, less code, and IDE integration.
-- :material-lightning-bolt: __Simplified LLM Interactions__
+ [:octicons-arrow-right-16: Learn more](https://docs.pydantic.dev/)
- Support for [OpenAI](./hub/openai.md), [Anthropic](./hub/anthropic.md), [Google](./hub/google.md), [Vertex AI](./hub/vertexai.md), [Mistral/Mixtral](./hub/together.md), [Anyscale](./hub/anyscale.md), [Ollama](./hub/ollama.md), [llama-cpp-python](./hub/llama-cpp-python.md), [Cohere](./hub/cohere.md), [LiteLLM](./hub/litellm.md).
-
- [:octicons-arrow-right-16: See Hub](./hub/index.md)
+- :material-lightning-bolt: **Simplified LLM Interactions**
+
+ Support for [OpenAI](./hub/openai.md), [Anthropic](./hub/anthropic.md), [Google](./hub/google.md), [Vertex AI](./hub/vertexai.md), [Mistral/Mixtral](./hub/together.md), [Anyscale](./hub/anyscale.md), [Ollama](./hub/ollama.md), [llama-cpp-python](./hub/llama-cpp-python.md), [Cohere](./hub/cohere.md), [LiteLLM](./hub/litellm.md).
+
+ [:octicons-arrow-right-16: See Hub](./hub/index.md)
@@ -178,6 +178,10 @@ assert resp.age == 25
### Using Gemini
+The Vertex AI and Gemini clients have different APIs. When using instructor with either of them, read the documentation for that specific client to confirm you're calling the correct methods.
+
+**Note**: Gemini Tool Calling is still in preview, and there are some limitations. You can learn more about them in the [Vertex AI examples notebook](../hub/vertexai.md). As of now, you cannot use tool calling with Gemini when you have multi-modal inputs (e.g. images, audio, video); you must use the `JSON` mode equivalent for that client instead.
+
#### Google AI
```python
@@ -214,9 +218,67 @@ assert resp.name == "Jason"
assert resp.age == 25
```
-#### Vertex AI
+??? info "Using Gemini's multi-modal capabilities with `google-generativeai`"
+
+    The `google.generativeai` library has a different API than the `vertexai` library, but with `instructor`, working with multi-modal data is straightforward.
+
+    Here's a quick example of how to use an audio file with `google-generativeai`. We've used this [recording](https://storage.googleapis.com/generativeai-downloads/data/State_of_the_Union_Address_30_January_1961.mp3) that's taken from the [Google Generative AI cookbook](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Audio.ipynb).
+
+ For a more in-depth example, you can check out our guide to working with Gemini using the `google-generativeai` package [here](./examples/multi_modal_gemini.md).
+
+
+ ```python
+ import instructor
+ import google.generativeai as genai
+ from pydantic import BaseModel
+
+
+ client = instructor.from_gemini(
+ client=genai.GenerativeModel(
+ model_name="models/gemini-1.5-flash-latest",
+ ),
+ mode=instructor.Mode.GEMINI_JSON, # (1)!
+ )
+
+ mp3_file = genai.upload_file("./sample.mp3") #(2)!
+
+
+ class Description(BaseModel):
+ description: str
+
+
+ resp = client.create(
+ response_model=Description,
+ messages=[
+ {
+ "role": "user",
+ "content": "Summarize what's happening in this audio file and who the main speaker is",
+ },
+ {
+ "role": "user",
+ "content": mp3_file, # (3)!
+ },
+ ],
+ )
-**Note**: Gemini Tool Calling is still in preview, and there are some limitations. You can learn more about them in the [Vertex AI examples notebook](../hub/vertexai.md).
+ print(resp)
+ #> description="The main speaker is President John F. Kennedy, and he's giving a
+ #> State of the Union address to a joint session of Congress. He begins by
+ #> acknowledging his fondness for the House of Representatives and his long
+ #> history with it. He then goes on to discuss the state of the economy,
+ #> highlighting the difficulties faced by Americans, such as unemployment and
+ #> low farm incomes. He also touches on the Cold War and the international
+ #> balance of payments. He speaks of the need to strengthen the US military,
+ #> and he also discusses the importance of international cooperation and the
+ #> need to address global issues like hunger and illiteracy. He ends by urging
+ #> his audience to work together to face the challenges that lie ahead."
+ ```
+
+    1. Make sure to set the mode to `GEMINI_JSON`; this is important because Tool Calling doesn't work with multi-modal inputs.
+ 2. Use `genai.upload_file` to upload your file. If you've already uploaded the file, you can get it by using `genai.get_file`
+ 3. Pass in the file object as any normal user message
+
+#### Vertex AI
```python
import instructor
@@ -253,9 +315,9 @@ assert resp.name == "Jason"
assert resp.age == 25
```
-??? info "Want to use Gemini's multi-part formats?"
+??? info "Using Gemini's multi-modal capabilities with VertexAI"
- Instructor supports both the gemini and the vertexai libraries. We've most recently added support for multi-part file formats using google's `gm.Part` objects. This allows you to pass in additional information to the LLM about the data you'd like to see.
+    We've recently added support for multi-part file formats using Google's `gm.Part` objects. These allow you to pass additional information to the LLM about the data you'd like to see.
Here are two examples of how to use multi-part formats with Instructor.
diff --git a/instructor/utils.py b/instructor/utils.py
index 32df7016c..bcbdfeb02 100644
--- a/instructor/utils.py
+++ b/instructor/utils.py
@@ -239,6 +239,14 @@ def __get__(self, instance: object, cls: type[Any]) -> R_co:
return self.cproperty(cls)
+def get_message_content(message: ChatCompletionMessageParam) -> list[Any]:
+    """Normalize a message's content into a list of parts."""
+    content = message.get("content", "")
+    if isinstance(content, list):
+        return content
+    return [content]
+
+
def transform_to_gemini_prompt(
messages_chatgpt: list[ChatCompletionMessageParam],
) -> list[dict[str, Any]]:
@@ -249,14 +257,20 @@ def transform_to_gemini_prompt(
system_prompt = message["content"]
elif message["role"] == "user":
messages_gemini.append(
- {"role": "user", "parts": [message.get("content", "")]}
+ {"role": "user", "parts": get_message_content(message)}
)
elif message["role"] == "assistant":
messages_gemini.append(
- {"role": "model", "parts": [message.get("content", "")]}
+ {"role": "model", "parts": get_message_content(message)}
)
+
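+    # Gemini has no dedicated system role, so the system prompt is folded into
+    # the first message, or sent as its own user message if the list is empty.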
if system_prompt:
- messages_gemini[0]["parts"].insert(0, f"*{system_prompt}*")
+ if messages_gemini:
+ messages_gemini[0]["parts"].insert(0, f"*{system_prompt}*")
+ else:
+ messages_gemini.append({"role": "user", "parts": [f"*{system_prompt}*"]})
return messages_gemini
diff --git a/mkdocs.yml b/mkdocs.yml
index 310438fb7..e78586ef3 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -139,6 +139,7 @@ nav:
- Text Classification: 'examples/classification.md'
- Local Classification: 'examples/local_classification.md'
- Ollama: 'examples/ollama.md'
+      - Using Gemini with Multi-Modal Data: 'examples/multi_modal_gemini.md'
- Citing Sources (RAG): 'examples/exact_citations.md'
- Extracting Knowledge Graphs: 'examples/knowledge_graph.md'
- Extracting Tables with GPT-V: 'examples/extracting_tables.md'
diff --git a/tests/llm/test_gemini/test_files/sample.mp3 b/tests/llm/test_gemini/test_files/sample.mp3
new file mode 100644
index 000000000..da84096eb
Binary files /dev/null and b/tests/llm/test_gemini/test_files/sample.mp3 differ
diff --git a/tests/llm/test_gemini/test_list_content.py b/tests/llm/test_gemini/test_list_content.py
new file mode 100644
index 000000000..0217af466
--- /dev/null
+++ b/tests/llm/test_gemini/test_list_content.py
@@ -0,0 +1,42 @@
+import instructor
+import google.generativeai as genai
+from pydantic import BaseModel
+
+
+class User(BaseModel):
+ name: str
+ age: int
+
+
+class UserList(BaseModel):
+ items: list[User]
+
+
+def test_list_of_strings():
+ client = instructor.from_gemini(
+ genai.GenerativeModel("gemini-1.5-flash-latest"),
+ mode=instructor.Mode.GEMINI_JSON,
+ )
+
+ content = [
+ "Extract a list of users from the following text",
+ "Jason is 25 years old",
+ "Elizabeth is 12 years old",
+ "Chris is 27 years old",
+ ]
+
+ result = client.chat.completions.create(
+ response_model=UserList,
+ messages=[
+ {"role": "user", "content": content},
+ ],
+ )
+
+ assert isinstance(result, UserList), "Result should be an instance of UserList"
+ assert isinstance(result.items, list), "items should be a list"
+ assert len(result.items) == 3, "List should contain 3 items"
+
+ names = [item.name.upper() for item in result.items]
+ assert "JASON" in names, "'JASON' should be in the list"
+ assert "ELIZABETH" in names, "'ELIZABETH' should be in the list"
+ assert "CHRIS" in names, "'CHRIS' should be in the list"
diff --git a/tests/llm/test_gemini/test_multimodal_content.py b/tests/llm/test_gemini/test_multimodal_content.py
new file mode 100644
index 000000000..61946c790
--- /dev/null
+++ b/tests/llm/test_gemini/test_multimodal_content.py
@@ -0,0 +1,65 @@
+import instructor
+import google.generativeai as genai
+from pydantic import BaseModel
+
+
+class Description(BaseModel):
+ relevant_speakers: list[str]
+ summary: str
+
+
+def test_audio_compatibility_list():
+ client = instructor.from_gemini(
+ genai.GenerativeModel("gemini-1.5-flash-latest"),
+ mode=instructor.Mode.GEMINI_JSON,
+ )
+
+ files = [file for file in genai.list_files()]
+ file_names = [file.display_name for file in files]
+
+ if "sample.mp3" not in file_names:
+ file = genai.upload_file("./test_files/sample.mp3")
+ else:
+ print("File already uploaded, extracting file obj now")
+ file = [file for file in files if file.display_name == "sample.mp3"][0]
+
+ content = ["Please transcribe this recording:", file]
+
+ result = client.chat.completions.create(
+ response_model=Description,
+ messages=[
+ {"role": "user", "content": content},
+ ],
+ )
+
+ assert isinstance(
+ result, Description
+ ), "Result should be an instance of Description"
+
+
+def test_audio_compatibility_multiple_messages():
+ client = instructor.from_gemini(
+ genai.GenerativeModel("gemini-1.5-flash-latest"),
+ mode=instructor.Mode.GEMINI_JSON,
+ )
+
+ files = [file for file in genai.list_files()]
+ file_names = [file.display_name for file in files]
+
+ if "sample.mp3" not in file_names:
+ file = genai.upload_file("./test_files/sample.mp3")
+ else:
+ print("File already uploaded, extracting file obj now")
+ file = [file for file in files if file.display_name == "sample.mp3"][0]
+
+ result = client.chat.completions.create(
+ response_model=Description,
+ messages=[
+ {"role": "user", "content": "Please transcribe this recording:"},
+ {"role": "user", "content": file},
+ ],
+ )
+
+ assert isinstance(
+ result, Description
+ ), "Result should be an instance of Description"
diff --git a/tests/llm/test_gemini/test_roles.py b/tests/llm/test_gemini/test_roles.py
new file mode 100644
index 000000000..1ddd9d341
--- /dev/null
+++ b/tests/llm/test_gemini/test_roles.py
@@ -0,0 +1,38 @@
+import instructor
+import google.generativeai as genai
+from pydantic import BaseModel
+
+roles = [
+ "system",
+ "user",
+ "assistant",
+]
+
+
+def test_roles():
+ client = instructor.from_gemini(
+ client=genai.GenerativeModel(
+ model_name="models/gemini-1.5-flash-latest",
+ ),
+ mode=instructor.Mode.GEMINI_JSON,
+ )
+
+ class Description(BaseModel):
+ description: str
+
+ for role in roles:
+ resp = client.create(
+ response_model=Description,
+ messages=[
+ {
+ "role": role,
+ "content": "Describe what a sunset in the desert looks like.",
+ },
+ {
+ "role": "user",
+ "content": "Please adhere to the instructions",
+ },
+ ],
+ )
+
+ assert isinstance(resp, Description)