Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Experiment and dataset improvements #6163

Merged
merged 15 commits
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ pg = [
"psycopg[binary,pool]",
]
container = [
"anthropic",
"anthropic>=0.45.2",
"google-generativeai",
"prometheus-client",
"openai>=1.0.0",
Expand Down
2 changes: 1 addition & 1 deletion requirements/dev.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
asyncpg
openai
anthropic
anthropic>=0.45.2
google-generativeai
psycopg[binary,pool]
uvloop; platform_system != 'Windows'
Expand Down
2 changes: 1 addition & 1 deletion requirements/type-check.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-r ci.txt
anthropic
anthropic>=0.45.2
asyncpg
grpcio
litellm>=1.0.3
Expand Down
2 changes: 1 addition & 1 deletion requirements/unit-tests.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-r ci.txt
anthropic
anthropic>=0.45.2
Faker>=30.1.0
arize
asgi-lifespan
Expand Down
14 changes: 14 additions & 0 deletions src/phoenix/server/api/helpers/dataset_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
OpenInferenceMimeTypeValues,
OpenInferenceSpanKindValues,
SpanAttributes,
ToolAttributes,
ToolCallAttributes,
)

Expand All @@ -27,12 +28,18 @@ def get_dataset_example_input(span: Span) -> dict[str, Any]:
input_mime_type = get_attribute_value(attributes, INPUT_MIME_TYPE)
prompt_template_variables = get_attribute_value(attributes, LLM_PROMPT_TEMPLATE_VARIABLES)
input_messages = get_attribute_value(attributes, LLM_INPUT_MESSAGES)
tool_definitions = []
if tools := get_attribute_value(attributes, LLM_TOOLS):
for tool in tools:
if definition := get_attribute_value(tool, TOOL_DEFINITION):
tool_definitions.append(definition)
if span_kind == LLM:
return _get_llm_span_input(
input_messages=input_messages,
input_value=input_value,
input_mime_type=input_mime_type,
prompt_template_variables=prompt_template_variables,
tools=tool_definitions,
)
return _get_generic_io_value(io_value=input_value, mime_type=input_mime_type, kind="input")

Expand Down Expand Up @@ -71,6 +78,7 @@ def _get_llm_span_input(
input_value: Any,
input_mime_type: Optional[str],
prompt_template_variables: Any,
tools: Any,
) -> dict[str, Any]:
"""
Extracts the input value from an LLM span and returns it as a dictionary.
Expand All @@ -84,6 +92,8 @@ def _get_llm_span_input(
input = _get_generic_io_value(io_value=input_value, mime_type=input_mime_type, kind="input")
if prompt_template_variables_data := _safely_json_decode(prompt_template_variables):
input["prompt_template_variables"] = prompt_template_variables_data
if tool_definitions_data := [_safely_json_decode(tool_definition) for tool_definition in tools]:
input["tools"] = tool_definitions_data
return input


Expand Down Expand Up @@ -215,3 +225,7 @@ def _safely_json_decode(value: Any) -> Any:
# ToolCallAttributes
TOOL_CALL_FUNCTION_ARGUMENTS_JSON = ToolCallAttributes.TOOL_CALL_FUNCTION_ARGUMENTS_JSON
TOOL_CALL_FUNCTION_NAME = ToolCallAttributes.TOOL_CALL_FUNCTION_NAME

# ToolAttributes
LLM_TOOLS = SpanAttributes.LLM_TOOLS
TOOL_DEFINITION = ToolAttributes.TOOL_JSON_SCHEMA
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can rename this to TOOL_JSON_SCHEMA for consistency with the attribute's name in the semantic convention.

2 changes: 2 additions & 0 deletions src/phoenix/server/api/helpers/playground_clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,8 @@ async def chat_completion_create(
pass
elif isinstance(event, anthropic_streaming.InputJsonEvent):
raise NotImplementedError
elif isinstance(event, anthropic_streaming._types.CitationEvent):
raise NotImplementedError
else:
assert_never(event)

Expand Down
8 changes: 1 addition & 7 deletions src/phoenix/server/api/mutations/chat_mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
from phoenix.server.api.input_types.TemplateOptions import TemplateOptions
from phoenix.server.api.subscriptions import (
_default_playground_experiment_description,
_default_playground_experiment_metadata,
_default_playground_experiment_name,
)
from phoenix.server.api.types.ChatCompletionMessageRole import ChatCompletionMessageRole
Expand Down Expand Up @@ -183,12 +182,7 @@ async def chat_completion_over_dataset(
description=input.experiment_description
or _default_playground_experiment_description(dataset_name=dataset.name),
repetitions=1,
metadata_=input.experiment_metadata
or _default_playground_experiment_metadata(
dataset_name=dataset.name,
dataset_id=input.dataset_id,
version_id=GlobalID(DatasetVersion.__name__, str(resolved_version_id)),
),
metadata_=input.experiment_metadata or dict(),
project_name=PLAYGROUND_PROJECT_NAME,
)
session.add(experiment)
Expand Down
21 changes: 21 additions & 0 deletions src/phoenix/server/api/mutations/dataset_mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

import strawberry
from openinference.semconv.trace import (
MessageAttributes,
MessageContentAttributes,
SpanAttributes,
ToolAttributes,
ToolCallAttributes,
)
from sqlalchemy import and_, delete, distinct, func, insert, select, update
from strawberry import UNSET
Expand Down Expand Up @@ -181,6 +185,17 @@ async def add_spans_to_dataset(
assert all(map(lambda id: isinstance(id, int), dataset_example_rowids))
DatasetExampleRevision = models.DatasetExampleRevision

all_span_attributes = {
**SpanAttributes.__dict__,
**MessageAttributes.__dict__,
**MessageContentAttributes.__dict__,
**ToolCallAttributes.__dict__,
**ToolAttributes.__dict__,
}
nonprivate_span_attributes = {
k: v for k, v in all_span_attributes.items() if not k.startswith("_")
}

await session.execute(
insert(DatasetExampleRevision),
[
Expand All @@ -190,6 +205,12 @@ async def add_spans_to_dataset(
DatasetExampleRevision.input.key: get_dataset_example_input(span),
DatasetExampleRevision.output.key: get_dataset_example_output(span),
DatasetExampleRevision.metadata_.key: {
**(span.attributes.get(SpanAttributes.METADATA) or dict()),
**{
k: v
for k, v in span.attributes.items()
if k in nonprivate_span_attributes
},
"span_kind": span.span_kind,
**(
{"annotations": annotations}
Expand Down
31 changes: 17 additions & 14 deletions src/phoenix/server/api/routers/v1/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,20 +919,23 @@ def _get_content_csv(examples: list[models.DatasetExampleRevision]) -> bytes:
def _get_content_jsonl_openai_ft(examples: list[models.DatasetExampleRevision]) -> bytes:
records = io.BytesIO()
for ex in examples:
records.write(
(
json.dumps(
{
"messages": (
ims if isinstance(ims := ex.input.get("messages"), list) else []
)
+ (oms if isinstance(oms := ex.output.get("messages"), list) else [])
},
ensure_ascii=False,
)
+ "\n"
).encode()
)
input_messages = ex.input.get("messages", [])
if not isinstance(input_messages, list):
input_messages = []
output_messages = ex.output.get("messages", [])
if not isinstance(output_messages, list):
output_messages = []

record_dict = {
"messages": input_messages + output_messages,
}

tools = ex.input.get("tools", [])
if tools:
record_dict["tools"] = tools

records.write((json.dumps(record_dict, ensure_ascii=False) + "\n").encode())

records.seek(0)
return records.read()

Expand Down
17 changes: 1 addition & 16 deletions src/phoenix/server/api/subscriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,7 @@ async def chat_completion_over_dataset(
description=input.experiment_description
or _default_playground_experiment_description(dataset_name=dataset.name),
repetitions=1,
metadata_=input.experiment_metadata
or _default_playground_experiment_metadata(
dataset_name=dataset.name,
dataset_id=input.dataset_id,
version_id=GlobalID(DatasetVersion.__name__, str(resolved_version_id)),
),
metadata_=input.experiment_metadata or dict(),
project_name=PLAYGROUND_PROJECT_NAME,
)
session.add(experiment)
Expand Down Expand Up @@ -581,16 +576,6 @@ def _default_playground_experiment_description(dataset_name: str) -> str:
return f'Playground experiment for dataset "{dataset_name}"'


def _default_playground_experiment_metadata(
dataset_name: str, dataset_id: GlobalID, version_id: GlobalID
) -> dict[str, Any]:
return {
"dataset_name": dataset_name,
"dataset_id": str(dataset_id),
"dataset_version_id": str(version_id),
}


LLM_OUTPUT_MESSAGES = SpanAttributes.LLM_OUTPUT_MESSAGES
LLM_TOKEN_COUNT_COMPLETION = SpanAttributes.LLM_TOKEN_COUNT_COMPLETION
LLM_TOKEN_COUNT_PROMPT = SpanAttributes.LLM_TOKEN_COUNT_PROMPT
Expand Down
6 changes: 1 addition & 5 deletions tests/unit/server/api/test_subscriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,12 +1206,8 @@ async def test_emits_expected_payloads_and_records_expected_spans_and_experiment
assert experiment.pop("name") == "playground-experiment"
assert isinstance(experiment_description := experiment.pop("description"), str)
assert "dataset-name" in experiment_description
assert experiment.pop("metadata") == {
"dataset_name": "dataset-name",
"dataset_id": str(dataset_id),
"dataset_version_id": str(version_id),
}
assert experiment.pop("projectName") == "playground"
assert experiment.pop("metadata") == {}
assert isinstance(created_at := experiment.pop("createdAt"), str)
assert isinstance(updated_at := experiment.pop("updatedAt"), str)
assert created_at == updated_at
Expand Down