fix: ensure that utf-8 characters are not translated into \uXXXX format (#965)
instructor-ai · Aug 31, 2024 · 02fcfe3 · 02fcfe3
1 parent b96e9a3
commit 02fcfe3
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 5 deletions.
diff --git a/instructor/process_response.py b/instructor/process_response.py
@@ -269,7 +269,7 @@ def handle_response_model(
                 As a genius expert, your task is to understand the content and provide
                 the parsed objects in json that match the following json_schema:\n
 
-                {json.dumps(response_model.model_json_schema(), indent=2)}
+                {json.dumps(response_model.model_json_schema(), indent=2, ensure_ascii=False)}
 
                 Make sure to return an instance of the JSON, not the schema itself
                 """
@@ -357,16 +357,15 @@ def handle_response_model(
                 You must only respond in JSON format that adheres to the following schema:
 
                 <JSON_SCHEMA>
-                {json.dumps(response_model.model_json_schema(), indent=2)}
+                {json.dumps(response_model.model_json_schema(), indent=2, ensure_ascii=False)}
                 </JSON_SCHEMA>
                 """
                 new_kwargs["system"] = dedent(new_kwargs["system"])
             else:
                 new_kwargs["system"] += dedent(f"""
                 You must only respond in JSON format that adheres to the following schema:
-
                 <JSON_SCHEMA>
-                {json.dumps(response_model.model_json_schema(), indent=2)}
+                 {json.dumps(response_model.model_json_schema(), indent=2, ensure_ascii=False)}
                 </JSON_SCHEMA>
                 """)
 
@@ -439,7 +438,7 @@ def handle_response_model(
                 As a genius expert, your task is to understand the content and provide
                 the parsed objects in json that match the following json_schema:\n
 
-                {json.dumps(response_model.model_json_schema(), indent=2)}
+                {json.dumps(response_model.model_json_schema(), indent=2, ensure_ascii=False)}
 
                 Make sure to return an instance of the JSON, not the schema itself
                 """

diff --git a/tests/test_response_model_conversion.py b/tests/test_response_model_conversion.py
@@ -0,0 +1,54 @@
+from instructor.process_response import handle_response_model
+from pydantic import BaseModel, Field
+import instructor
+import pytest
+
+# Modes that the parametrized test below is run against.
+modes = [
+    instructor.Mode.ANTHROPIC_JSON,
+    instructor.Mode.JSON,
+    instructor.Mode.MD_JSON,
+    instructor.Mode.GEMINI_JSON,
+    instructor.Mode.VERTEXAI_JSON,
+]
+
+
+def get_system_prompt(user_tool_definition, mode):
+    """Return the text to inspect for ``mode`` from ``user_tool_definition``."""
+    if mode == instructor.Mode.VERTEXAI_JSON:
+        # Stringify the whole generation config so it can be searched as text.
+        return str(user_tool_definition["generation_config"])
+    if mode == instructor.Mode.GEMINI_JSON:
+        # Join the parts of the first content entry into a single string.
+        first_entry = user_tool_definition["contents"][0]
+        return "\n".join(first_entry["parts"])
+    if mode == instructor.Mode.ANTHROPIC_JSON:
+        return user_tool_definition["system"]
+    # Any other mode: the content of the first message.
+    return user_tool_definition["messages"][0]["content"]
+
+
+@pytest.mark.parametrize("mode", modes)
+def test_json_preserves_description_of_non_english_characters_in_json_mode(
+    mode,
+) -> None:
+    """Non-ASCII field descriptions must appear unescaped in the prompt."""
+
+    class User(BaseModel):
+        name: str = Field(description="用户的名字")
+        age: int = Field(description="用户的年龄")
+
+    messages = [
+        {
+            "role": "user",
+            "content": "Extract the user from the text : 张三 20岁",
+        }
+    ]
+    expected_descriptions = ("用户的名字", "用户的年龄")
+
+    # First without a caller-supplied system prompt, then with one. The same
+    # `messages` list object is handed to both calls.
+    call_variants = (
+        {"messages": messages},
+        {"system": "你是一个AI助手", "messages": messages},
+    )
+    for call_kwargs in call_variants:
+        _, user_tool_definition = handle_response_model(
+            User, mode=mode, **call_kwargs
+        )
+        system_prompt = get_system_prompt(user_tool_definition, mode)
+        for description in expected_descriptions:
+            assert description in system_prompt