From 6c838b6f64b27c7ceaeb1a2ff481e805edd02a90 Mon Sep 17 00:00:00 2001
From: Collin Dutter
Date: Tue, 22 Oct 2024 11:16:07 -0700
Subject: [PATCH] Toolkit Task Improvements (#1268)

---
 CHANGELOG.md                                  |  4 ++
 .../griptape-framework/structures/rulesets.md |  3 -
 griptape/tasks/actions_subtask.py             | 35 +++++++----
 griptape/tasks/toolkit_task.py                |  3 -
 .../templates/tasks/toolkit_task/system.j2    |  6 +-
 tests/unit/tasks/test_actions_subtask.py      | 59 +++++++++++++++++--
 6 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ee9bcee2..e8930532f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,12 +44,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - `Chat.output_fn` now takes an optional kwarg parameter, `stream`.
 - Implemented `SerializableMixin` in `Structure`, `BaseTask`, `BaseTool`, and `TaskMemory`
 - `@activity` decorated functions can now accept kwargs that are defined in the activity schema.
+- Updated `ToolkitTask` system prompt to no longer mention `memory_name` and `artifact_namespace`.
+- Models in `ToolkitTask` with native tool calling no longer need to provide their final answer as `Answer:`.
 - `EventListener.event_types` will now listen on child types of any provided type.
 
 ### Fixed
 
 - Structures not flushing events when not listening for `FinishStructureRunEvent`.
 - `EventListener.event_types` and the argument to `BaseEventListenerDriver.handler` being out of sync.
+- Models occasionally hallucinating `memory_name` and `artifact_namespace` into Tool schemas when using `ToolkitTask`.
+- Models occasionally providing overly succinct final answers when using `ToolkitTask`.
 
 ## \[0.33.1\] - 2024-10-11
 
diff --git a/docs/griptape-framework/structures/rulesets.md b/docs/griptape-framework/structures/rulesets.md
index 97018d8d3..72dfad62a 100644
--- a/docs/griptape-framework/structures/rulesets.md
+++ b/docs/griptape-framework/structures/rulesets.md
@@ -29,9 +29,6 @@ A [Ruleset](../../reference/griptape/rules/ruleset.md) can be used to define [Ru
 
 [JsonSchemaRule](../../reference/griptape/rules/json_schema_rule.md)s define a structured format for the LLM's output by providing a JSON schema. This is particularly useful when you need the LLM to return well-formed data, such as JSON objects, with specific fields and data types.
 
-!!! warning
-    `JsonSchemaRule` may break [ToolkitTask](../structures/tasks.md#toolkit-task) which relies on a specific [output token](https://github.com/griptape-ai/griptape/blob/e6a04c7b88cf9fa5d6bcf4c833ffebfab89a3258/griptape/tasks/toolkit_task.py#L28).
-
 ```python
 --8<-- "docs/griptape-framework/structures/src/json_schema_rule.py"
 ```
 
diff --git a/griptape/tasks/actions_subtask.py b/griptape/tasks/actions_subtask.py
index 8d68ae996..1b4ccfb7d 100644
--- a/griptape/tasks/actions_subtask.py
+++ b/griptape/tasks/actions_subtask.py
@@ -217,14 +217,18 @@ def __init_from_prompt(self, value: str) -> None:
         actions_matches = re.findall(self.ACTIONS_PATTERN, value, re.DOTALL)
         answer_matches = re.findall(self.ANSWER_PATTERN, value, re.MULTILINE)
 
-        if self.thought is None and thought_matches:
-            self.thought = thought_matches[-1]
+        self.actions = self.__parse_actions(actions_matches)
 
-        self.__parse_actions(actions_matches)
+        if thought_matches:
+            self.thought = thought_matches[-1]
 
-        # If there are no actions to take but an answer is provided, set the answer as the output.
-        if len(self.actions) == 0 and self.output is None and answer_matches:
-            self.output = TextArtifact(answer_matches[-1])
+        if not self.actions and self.output is None:
+            if answer_matches:
+                # A direct answer is provided; set it as the output.
+                self.output = TextArtifact(answer_matches[-1])
+            else:
+                # The LLM failed to follow the ReAct prompt; set the LLM's raw response as the output.
+                self.output = TextArtifact(value)
 
     def __init_from_artifacts(self, artifacts: ListArtifact) -> None:
         """Parses the input Artifacts to extract the thought and actions.
@@ -243,23 +247,30 @@ def __init_from_artifacts(self, artifacts: ListArtifact) -> None:
             if isinstance(artifact, ActionArtifact)
         ]
 
-        thoughts = [artifact.value for artifact in artifacts.value if isinstance(artifact, TextArtifact)]
-        if thoughts:
-            self.thought = thoughts[0]
+        # When parsing from Artifacts, we can't determine the thought unless there are also Actions.
+        if self.actions:
+            thoughts = [artifact.value for artifact in artifacts.value if isinstance(artifact, TextArtifact)]
+            if thoughts:
+                self.thought = thoughts[0]
+        else:
+            if self.output is None:
+                self.output = TextArtifact(artifacts.to_text())
 
-    def __parse_actions(self, actions_matches: list[str]) -> None:
+    def __parse_actions(self, actions_matches: list[str]) -> list[ToolAction]:
         if len(actions_matches) == 0:
-            return
+            return []
         try:
             data = actions_matches[-1]
             actions_list: list[dict] = json.loads(data, strict=False)
 
-            self.actions = [self.__process_action_object(action_object) for action_object in actions_list]
+            return [self.__process_action_object(action_object) for action_object in actions_list]
         except json.JSONDecodeError as e:
             logger.exception("Subtask %s\nInvalid actions JSON: %s", self.origin_task.id, e)
 
             self.output = ErrorArtifact(f"Actions JSON decoding error: {e}", exception=e)
+            return []
 
+    def __process_action_object(self, action_object: dict) -> ToolAction:
         # Load action tag; throw exception if the key is not present
         action_tag = action_object["tag"]
 
diff --git a/griptape/tasks/toolkit_task.py b/griptape/tasks/toolkit_task.py
index 2a4a926bd..e2b0e23a5 100644
--- a/griptape/tasks/toolkit_task.py
+++ b/griptape/tasks/toolkit_task.py
@@ -180,9 +180,6 @@ def run(self) -> BaseArtifact:
             if subtask.output is None:
                 if len(self.subtasks) >= self.max_subtasks:
                     subtask.output = ErrorArtifact(f"Exceeded tool limit of {self.max_subtasks} subtasks per task")
-                elif not subtask.actions:
-                    # handle case when the LLM failed to follow the ReAct prompt and didn't return a proper action
-                    subtask.output = subtask.input
                 else:
                     subtask.before_run()
                     subtask.run()
diff --git a/griptape/templates/tasks/toolkit_task/system.j2 b/griptape/templates/tasks/toolkit_task/system.j2
index 730ab85e4..e9ec48739 100644
--- a/griptape/templates/tasks/toolkit_task/system.j2
+++ b/griptape/templates/tasks/toolkit_task/system.j2
@@ -8,16 +8,14 @@ Actions: 
 "Thought", "Actions", "{{ stop_sequence }}" must always start on a new line.
+You must use the following format when providing your final answer:
+Answer: 
 {% endif %}
 Repeat executing actions as many times as you need.
 If an action's output contains an error, you MUST ALWAYS try to fix the error by executing another action.
-You must use the following format when providing your final answer:
-Answer: 
 Be truthful. ALWAYS be proactive and NEVER ask the user for more information input. Keep using actions until you have your final answer.
 NEVER make up actions, action names, or action paths. NEVER make up facts.
NEVER reference tags in other action input values. - -Actions might store their output in memory as artifacts (with `memory_name` and `artifact_namespace`). If action output is stored in memory, ALWAYS try to pass it to another action. NEVER make up memory names or artifact namespaces. {% if meta_memory %} {{ meta_memory }} diff --git a/tests/unit/tasks/test_actions_subtask.py b/tests/unit/tasks/test_actions_subtask.py index e25a42120..6460e7323 100644 --- a/tests/unit/tasks/test_actions_subtask.py +++ b/tests/unit/tasks/test_actions_subtask.py @@ -9,7 +9,7 @@ class TestActionsSubtask: - def test_basic_input(self): + def test_prompt_input(self): valid_input = ( "Thought: need to test\n" 'Actions: [{"tag": "foo", "name": "MockTool", "path": "test", "input": {"values": {"test": "value"}}}]\n' @@ -25,22 +25,31 @@ def test_basic_input(self): assert json_dict[0]["name"] == "MockTool" assert json_dict[0]["path"] == "test" assert json_dict[0]["input"] == {"values": {"test": "value"}} + assert subtask.thought == "need to test" + assert subtask.output is None - def test_action_input(self): - valid_input = ActionArtifact( - ToolAction(tag="foo", name="MockTool", path="test", input={"values": {"test": "value"}}) + def test_artifact_input(self): + valid_input = ListArtifact( + [ + TextArtifact("need to test"), + ActionArtifact( + ToolAction(tag="foo", name="MockTool", path="test", input={"values": {"test": "value"}}) + ), + TextArtifact("answer"), + ] ) task = ToolkitTask(tools=[MockTool()]) Agent().add_task(task) subtask = task.add_subtask(ActionsSubtask(valid_input)) json_dict = json.loads(subtask.actions_to_json()) - assert subtask.thought is None assert json_dict[0]["name"] == "MockTool" assert json_dict[0]["path"] == "test" assert json_dict[0]["input"] == {"values": {"test": "value"}} + assert subtask.thought == "need to test" + assert subtask.output is None - def test_action_and_thought_input(self): + def test_artifact_action_and_thought_input(self): valid_input = ListArtifact( [ TextArtifact("thought"), @@ -59,6 +68,42 @@ def test_action_and_thought_input(self): assert json_dict[0]["path"] == "test" assert json_dict[0]["input"] == {"values": {"test": "value"}} + def test_prompt_answer(self): + valid_input = "Answer: test output" + + task = ToolkitTask(tools=[MockTool()]) + Agent().add_task(task) + subtask = task.add_subtask(ActionsSubtask(valid_input)) + + assert subtask.thought is None + assert subtask.actions == [] + assert subtask.output.value == "test output" + + def test_prompt_implicit_answer(self): + valid_input = "test output" + + task = ToolkitTask(tools=[MockTool()]) + Agent().add_task(task) + subtask = task.add_subtask(ActionsSubtask(valid_input)) + + assert subtask.thought is None + assert subtask.actions == [] + assert subtask.output.value == "test output" + + def test_artifact_answer(self): + valid_input = ListArtifact( + [ + TextArtifact("answer"), + ] + ) + task = ToolkitTask(tools=[MockTool()]) + Agent().add_task(task) + subtask = task.add_subtask(ActionsSubtask(valid_input)) + + assert subtask.thought is None + assert subtask.actions == [] + assert subtask.output.value == "answer" + def test_callable_input(self): valid_input = ListArtifact( [ @@ -146,6 +191,8 @@ def test_invalid_actions(self): assert isinstance(subtask.output, ErrorArtifact) assert "Actions JSON decoding error" in subtask.output.value + assert subtask.thought == "need to test" + assert subtask.actions == [] def test_implicit_values(self): valid_input = (
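
Taken together, the `actions_subtask.py` and `toolkit_task.py` changes move the "model ignored the ReAct format" handling out of the task's run loop and into subtask parsing: a bare response with no `Actions:` or `Answer:` prefix is now accepted as the final answer instead of being bounced back to the model. Below is a minimal sketch of the new behavior, distilled from the tests in this patch; it assumes a checkout with this patch applied, and the `MockTool` import path (borrowed from the repo's test suite) stands in for any real tool.

from griptape.structures import Agent
from griptape.tasks import ActionsSubtask, ToolkitTask
from tests.mocks.mock_tool.tool import MockTool  # test-suite mock; any tool works here

task = ToolkitTask(tools=[MockTool()])
Agent().add_task(task)

# An explicit "Answer:" prefix still yields the final answer, as before.
explicit = task.add_subtask(ActionsSubtask("Answer: test output"))
assert explicit.output.value == "test output"

# New fallback: a response that follows neither the "Actions:" nor the
# "Answer:" format is treated as the answer itself, so the raw text becomes
# the subtask's output during parsing rather than in the task's run loop.
implicit = task.add_subtask(ActionsSubtask("test output"))
assert implicit.actions == []
assert implicit.output.value == "test output"

This is also why the `elif not subtask.actions` branch could be deleted from `toolkit_task.py`: the fallback now happens when the subtask parses the model's response, so the run loop only has to distinguish "already has output" from "needs to run actions".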