From 6907d0dcca64cc4084ae21fc393633418eb09390 Mon Sep 17 00:00:00 2001 From: Dor Zohar Date: Fri, 20 Dec 2024 21:58:33 +0000 Subject: [PATCH 1/6] Add few shot feature and implement it on guideline_proposer --- schematic_generation_test_cache.json | 1 + src/parlant/bin/server.py | 6 + .../core/engines/alpha/guideline_proposer.py | 304 +++++++++++++++++- .../core/engines/alpha/prompt_builder.py | 23 ++ src/parlant/core/shots.py | 45 +++ tests/conftest.py | 5 + .../engines/alpha/test_guideline_proposer.py | 8 +- tests/core/test_shots.py | 76 +++++ 8 files changed, 464 insertions(+), 4 deletions(-) create mode 100644 schematic_generation_test_cache.json create mode 100644 src/parlant/core/shots.py create mode 100644 tests/core/test_shots.py diff --git a/schematic_generation_test_cache.json b/schematic_generation_test_cache.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/schematic_generation_test_cache.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/src/parlant/bin/server.py b/src/parlant/bin/server.py index 1c3d652f3..ff18aedee 100755 --- a/src/parlant/bin/server.py +++ b/src/parlant/bin/server.py @@ -31,7 +31,9 @@ import uvicorn from parlant.adapters.vector_db.chroma import ChromaDatabase +from parlant.core.engines.alpha import guideline_proposer from parlant.core.nlp.service import NLPService +from parlant.core.shots import ShotCollection from parlant.core.tags import TagDocumentStore, TagStore from parlant.api.app import create_api_app, ASGIApplication from parlant.core.background_tasks import BackgroundTaskService @@ -78,6 +80,7 @@ from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema from parlant.core.engines.alpha.guideline_proposer import ( GuidelineProposer, + GuidelinePropositionShot, GuidelinePropositionsSchema, ) from parlant.core.engines.alpha.message_event_generator import ( @@ -345,9 +348,12 @@ async def setup_container(nlp_service_name: str) -> AsyncIterator[Container]: SchematicGenerator[GuidelineConnectionPropositionsSchema] ] = await nlp_service.get_schematic_generator(GuidelineConnectionPropositionsSchema) + c[ShotCollection[GuidelinePropositionShot]] = guideline_proposer.shot_collection + c[GuidelineProposer] = GuidelineProposer( c[Logger], c[SchematicGenerator[GuidelinePropositionsSchema]], + c[ShotCollection[GuidelinePropositionShot]], ) c[GuidelineConnectionProposer] = GuidelineConnectionProposer( c[Logger], diff --git a/src/parlant/core/engines/alpha/guideline_proposer.py b/src/parlant/core/engines/alpha/guideline_proposer.py index e884162bd..07a4fc670 100644 --- a/src/parlant/core/engines/alpha/guideline_proposer.py +++ b/src/parlant/core/engines/alpha/guideline_proposer.py @@ -13,6 +13,7 @@ # limitations under the License. 
from dataclasses import dataclass +from datetime import datetime, timezone from functools import cached_property from itertools import chain import json @@ -31,11 +32,12 @@ ) from parlant.core.engines.alpha.prompt_builder import BuiltInSection, PromptBuilder, SectionStatus from parlant.core.glossary import Term -from parlant.core.guidelines import Guideline -from parlant.core.sessions import Event +from parlant.core.guidelines import Guideline, GuidelineContent +from parlant.core.sessions import Event, EventId, EventSource from parlant.core.emissions import EmittedEvent -from parlant.core.common import DefaultBaseModel +from parlant.core.common import DefaultBaseModel, JSONSerializable from parlant.core.logging import Logger +from parlant.core.shots import Shot, ShotCollection class GuidelinePropositionSchema(DefaultBaseModel): @@ -55,6 +57,60 @@ class GuidelinePropositionsSchema(DefaultBaseModel): checks: Sequence[GuidelinePropositionSchema] +@dataclass +class GuidelinePropositionShot(Shot): + interaction_events: Sequence[Event] + guidelines: Sequence[GuidelineContent] + expected_result: GuidelinePropositionsSchema + + @override + def format(self) -> str: + def adapt_event(e: Event) -> JSONSerializable: + source_map: dict[EventSource, str] = { + "customer": "user", + "customer_ui": "frontend_application", + "human_agent": "human_service_agent", + "human_agent_on_behalf_of_ai_agent": "ai_agent", + "ai_agent": "ai_agent", + "system": "system-provided", + } + + return { + "event_kind": e.kind, + "event_source": source_map[e.source], + "data": e.data, + } + + example = "" + if self.interaction_events: + example += f""" +- Interaction Events: ### +{json.dumps([adapt_event(e) for e in self.interaction_events], indent=2)} +### + +""" + if self.guidelines: + formatted_guidelines = "\n".join( + f"{i}) condition: {g.condition}, action: {g.action}" + for i, g in enumerate(self.guidelines, start=1) + ) + example += f""" +- Guidelines: ### +{formatted_guidelines} +### + +""" + + example += f""" +- **Expected Result**: +```json +{json.dumps(self.expected_result.model_dump(mode="json"), indent=2)} +``` +""" + + return example + + @dataclass(frozen=True) class ConditionApplicabilityEvaluation: guideline_number: int @@ -85,9 +141,11 @@ def __init__( self, logger: Logger, schematic_generator: SchematicGenerator[GuidelinePropositionsSchema], + shot_collection: ShotCollection[GuidelinePropositionShot], ) -> None: self._logger = logger self._schematic_generator = schematic_generator + self._shot_collection = shot_collection async def propose_guidelines( self, @@ -207,6 +265,7 @@ async def _process_guideline_batch( staged_events=staged_events, terms=terms, guidelines=guidelines_dict, + shots=await self._shot_collection.list(), ) with self._logger.operation( @@ -259,6 +318,7 @@ def _format_prompt( staged_events: Sequence[EmittedEvent], terms: Sequence[Term], guidelines: dict[int, Guideline], + shots: Sequence[Shot], ) -> str: assert len(agents) == 1 @@ -338,6 +398,7 @@ def _format_prompt( Examples of Condition Evaluations: ------------------- +<<<<<<< HEAD Example #1: - Interaction Events: ### [{{"id": "11", "kind": "", "source": "customer", @@ -573,8 +634,11 @@ def _format_prompt( ] }} ``` +======= +>>>>>>> ecf3aa96 (Add few shot feature and implement it on guideline_proposer) """ # noqa ) + builder.add_few_shots(shots) builder.add_agent_identity(agents[0]) builder.add_context_variables(context_variables) builder.add_glossary(terms) @@ -606,3 +670,237 @@ def _format_prompt( prompt = builder.build() return 
prompt
+
+
+def _make_event(e_id: str, source: EventSource, message: str) -> Event:
+    return Event(
+        id=EventId(e_id),
+        source=source,
+        kind="message",
+        creation_utc=datetime.now(timezone.utc),
+        offset=0,
+        correlation_id="",
+        data={"message": message},
+        deleted=False,
+    )
+
+
+example_1_events = [
+    _make_event("11", "customer", "Can I purchase a subscription to your software?"),
+    _make_event("23", "ai_agent", "Absolutely, I can assist you with that right now."),
+    _make_event("34", "customer", "Please proceed with the subscription for the Pro plan."),
+    _make_event(
+        "56",
+        "ai_agent",
+        "Your subscription has been successfully activated. Is there anything else I can help you with?",
+    ),
+    _make_event("78", "customer", "Yes, can you tell me more about your data security policies?"),
+]
+
+example_1_guidelines = [
+    GuidelineContent(
+        condition="the customer initiates a purchase.",
+        action="Open a new cart for the customer",
+    ),
+    GuidelineContent(
+        condition="the customer asks about data security",
+        action="Refer the customer to our privacy policy page",
+    ),
+    GuidelineContent(
+        condition="the customer asked to subscribe to our pro plan",
+        action="maintain a helpful tone and thank them for shopping at our store",
+    ),
+]
+
+example_1_expected = GuidelinePropositionsSchema(
+    checks=[
+        GuidelinePropositionSchema(
+            guideline_number=1,
+            condition="the customer initiates a purchase",
+            condition_application_rationale="The purchase-related guideline was initiated earlier, but is currently irrelevant since the customer completed the purchase and the conversation has moved to a new topic.",
+            condition_applies=False,
+            applies_score=3,
+        ),
+        GuidelinePropositionSchema(
+            guideline_number=2,
+            condition="the customer asks about data security",
+            condition_applies=True,
+            condition_application_rationale="The customer specifically inquired about data security policies, making this guideline highly relevant to the ongoing discussion.",
+            action="Refer the customer to our privacy policy page",
+            guideline_previously_applied_rationale="This is the first time data security has been mentioned, and the user has not been referred to the privacy policy page yet",
+            guideline_previously_applied="no",
+            guideline_is_continuous=False,
+            guideline_should_reapply=False,
+            applies_score=9,
+        ),
+        GuidelinePropositionSchema(
+            guideline_number=3,
+            condition="the customer asked to subscribe to our pro plan",
+            condition_applies=True,
+            condition_application_rationale="The customer recently asked to subscribe to the pro plan. The conversation is beginning to drift elsewhere, but still deals with the pro plan",
+            action="maintain a helpful tone and thank them for shopping at our store",
+            guideline_previously_applied_rationale="a helpful tone was maintained, but the agent didn't thank the customer for shopping at our store, making the guideline partially fulfilled. As such, it should be treated as if it was fully followed",
+            guideline_previously_applied="partially",
+            guideline_is_continuous=False,
+            guideline_should_reapply=False,
+            applies_score=6,
+        ),
+    ]
+)
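+
+
+# The second shot covers a job-search conversation. It contrasts a guideline
+# that was already fully applied, one that was only partially applied and
+# should re-apply, and a continuous guideline that keeps applying.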
+example_2_events = [
+    _make_event("11", "customer", "I'm looking for a job, what do you have available?"),
+    _make_event(
+        "23",
+        "ai_agent",
+        "Hi there! we have plenty of opportunities for you, where are you located?",
+    ),
+    _make_event("34", "customer", "I'm looking for anything around the bay area"),
+    _make_event(
+        "56",
+        "ai_agent",
+        "That's great. We have a number of positions available over there. What kind of role are you interested in?",
+    ),
+    _make_event("78", "customer", "Anything to do with training and maintaining AI agents"),
+]
+
+example_2_guidelines = [
+    GuidelineContent(
+        condition="the customer indicates that they are looking for a job.",
+        action="ask the customer for their location",
+    ),
+    GuidelineContent(
+        condition="the customer asks about job openings.",
+        action="emphasize that we have plenty of positions relevant to the customer, and over 10,000 openings overall",
+    ),
+    GuidelineContent(
+        condition="discussing job opportunities.", action="maintain a positive, assuring tone"
+    ),
+]
+
+example_2_expected = GuidelinePropositionsSchema(
+    checks=[
+        GuidelinePropositionSchema(
+            guideline_number=1,
+            condition="the customer indicates that they are looking for a job.",
+            condition_application_rationale="The current discussion is about the type of job the customer is looking for",
+            condition_applies=True,
+            action="ask the customer for their location",
+            guideline_is_continuous=False,
+            guideline_previously_applied_rationale="The assistant asked for the customer's location earlier in the interaction. There is no need to ask for it again, as it is already known.",
+            guideline_previously_applied="fully",
+            guideline_should_reapply=False,
+            applies_score=3,
+        ),
+        GuidelinePropositionSchema(
+            guideline_number=2,
+            condition="the customer asks about job openings.",
+            condition_applies=True,
+            condition_application_rationale="the customer asked about job openings, and the discussion still revolves around this request",
+            action="emphasize that we have plenty of positions relevant to the customer, and over 10,000 openings overall",
+            guideline_is_continuous=False,
+            guideline_previously_applied_rationale="The assistant has already emphasized that we have open positions, but neglected to mention that we offer 10,000 openings overall. The guideline partially applies and should be treated as if it was fully applied. However, since the customer is narrowing down their search, this point should be re-emphasized to clarify that it still holds true.",
+            guideline_previously_applied="partially",
+            guideline_should_reapply=True,
+            applies_score=7,
+        ),
+        GuidelinePropositionSchema(
+            guideline_number=3,
+            condition="discussing job opportunities.",
+            condition_applies=True,
+            condition_application_rationale="the discussion is about job opportunities that are relevant to the customer, so the condition applies.",
+            action="maintain a positive, assuring tone",
+            guideline_is_continuous=True,
+            guideline_previously_applied_rationale="The assistant's tone is positive already. This action describes a continuous action, so the guideline should be re-applied.",
+            guideline_previously_applied="fully",
+            guideline_should_reapply=True,
+            applies_score=9,
+        ),
+    ]
+)
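+
+
+# The third shot exercises tool-backed guidelines: a stock-price check that
+# should re-run because its earlier result may be stale, alongside weather
+# guidelines whose conditions have lapsed as the conversation moved on.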
+example_3_events = [
+    _make_event("11", "customer", "Hi there, what is the S&P500 trading at right now?"),
+    _make_event("23", "ai_agent", "Hello! It's currently priced at just about 6,000$."),
+    _make_event(
+        "34", "customer", "Better than I hoped. And what's the weather looking like today?"
+    ),
+    _make_event("56", "ai_agent", "It's 5 degrees Celsius in London today"),
+    _make_event("78", "customer", "Bummer. Does S&P500 still trade at 6,000$ by the way?"),
+]
+
+example_3_guidelines = [
+    GuidelineContent(
+        condition="the customer asks about the value of a stock.",
+        action="provide the price using the 'check_stock_price' tool",
+    ),
+    GuidelineContent(
+        condition="the weather at a certain location is discussed.",
+        action="check the weather at that location using the 'check_weather' tool",
+    ),
+    GuidelineContent(
+        condition="the customer asked about the weather.",
+        action="provide the customer with the temperature and the chances of precipitation",
+    ),
+]
+
+example_3_expected = GuidelinePropositionsSchema(
+    checks=[
+        GuidelinePropositionSchema(
+            guideline_number=1,
+            condition="the customer asks about the value of a stock.",
+            condition_application_rationale="The customer asked what the S&P500 trades at",
+            condition_applies=True,
+            action="provide the price using the 'check_stock_price' tool",
+            guideline_is_continuous=False,
+            guideline_previously_applied_rationale="The assistant previously reported the price of that stock following the customer's question, but since the price might have changed since then, it should be checked again.",
+            guideline_previously_applied="fully",
+            guideline_should_reapply=True,
+            applies_score=9,
+        ),
+        GuidelinePropositionSchema(
+            guideline_number=2,
+            condition="the weather at a certain location is discussed.",
+            condition_application_rationale="while weather was discussed earlier, the conversation has moved on to an entirely different topic (stock prices)",
+            condition_applies=False,
+            applies_score=3,
+        ),
+        GuidelinePropositionSchema(
+            guideline_number=3,
+            condition="the customer asked about the weather.",
+            condition_application_rationale="The customer asked about the weather earlier, though the conversation has somewhat moved on to a new topic",
+            condition_applies=True,
+            action="provide the customer with the temperature and the chances of precipitation",
+            guideline_is_continuous=False,
+            guideline_previously_applied_rationale="The action was partially fulfilled by reporting the temperature without the chances of precipitation. 
As partially fulfilled guidelines are treated as completed, this guideline is considered applied", + guideline_previously_applied="partially", + guideline_should_reapply=False, + applies_score=4, + ), + ] +) + + +_baseline_shots: Sequence[GuidelinePropositionShot] = [ + GuidelinePropositionShot( + description="Guidelines application when the customer purchases a subscription and asks about data security.", + interaction_events=example_1_events, + guidelines=example_1_guidelines, + expected_result=example_1_expected, + ), + GuidelinePropositionShot( + description="Guidelines application when the customer is looking for a job.", + interaction_events=example_2_events, + guidelines=example_2_guidelines, + expected_result=example_2_expected, + ), + GuidelinePropositionShot( + description="Guidelines application when the customer asks about stock prices and the weather.", + interaction_events=example_3_events, + guidelines=example_3_guidelines, + expected_result=example_3_expected, + ), +] + +shot_collection = ShotCollection[GuidelinePropositionShot](_baseline_shots) diff --git a/src/parlant/core/engines/alpha/prompt_builder.py b/src/parlant/core/engines/alpha/prompt_builder.py index 99963aef5..0177423b9 100644 --- a/src/parlant/core/engines/alpha/prompt_builder.py +++ b/src/parlant/core/engines/alpha/prompt_builder.py @@ -27,6 +27,7 @@ emitted_tool_events_to_dicts, ) from parlant.core.emissions import EmittedEvent +from parlant.core.shots import Shot class BuiltInSection(Enum): @@ -37,6 +38,7 @@ class BuiltInSection(Enum): GUIDELINE_DESCRIPTIONS = auto() GUIDELINES = auto() STAGED_EVENTS = auto() + FEW_SHOTS = auto() class SectionStatus(Enum): @@ -238,3 +240,24 @@ def add_staged_events( ) return self + + def add_few_shots( + self, + shots: Sequence[Shot], + ) -> PromptBuilder: + if shots: + self.add_section( + name=BuiltInSection.FEW_SHOTS, + content="".join( + f""" +Example #{i}: +{shot.format()} + + +""" + for i, shot in enumerate(shots, start=1) + ), + status=SectionStatus.ACTIVE, + ) + + return self diff --git a/src/parlant/core/shots.py b/src/parlant/core/shots.py new file mode 100644 index 000000000..70f97d6ec --- /dev/null +++ b/src/parlant/core/shots.py @@ -0,0 +1,45 @@ +from abc import abstractmethod +from dataclasses import dataclass +from typing import TypeVar, Generic, Sequence + + +@dataclass +class Shot: + description: str + """An explanation of what makes this shot interesting.""" + + @abstractmethod + def format(self) -> str: ... 
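+
+# A minimal usage sketch (illustrative only; the names below are hypothetical,
+# not part of this patch): a consumer defines a Shot subclass with its own
+# payload, implements format(), and seeds a collection it can extend at runtime:
+#
+#     @dataclass
+#     class GreetingShot(Shot):
+#         expected_reply: str
+#
+#         def format(self) -> str:
+#             return f"{self.description}: {self.expected_reply}"
+#
+#     shot_collection = ShotCollection[GreetingShot](
+#         [GreetingShot(description="polite greeting", expected_reply="Hello!")]
+#     )
+#     await shot_collection.append(GreetingShot("casual greeting", "Hey!"))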
+ + +TShot = TypeVar("TShot", bound=Shot) + + +class ShotCollection(Generic[TShot]): + def __init__(self, initial_shots: Sequence[TShot]) -> None: + self._shots: list[TShot] = list(initial_shots) + + async def append( + self, + shot: TShot, + ) -> None: + self._shots.append(shot) + + async def insert( + self, + shot: TShot, + index: int = 0, + ) -> None: + self._shots.insert(index, shot) + + async def list(self) -> Sequence[TShot]: + return self._shots + + async def remove( + self, + shot: TShot, + ) -> None: + self._shots = [s for s in self._shots if s != shot] + + async def clear(self) -> None: + self._shots.clear() diff --git a/tests/conftest.py b/tests/conftest.py index c9b9fa066..49fb69c03 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,6 +31,7 @@ from parlant.core.emission.event_publisher import EventPublisherFactory from parlant.core.emissions import EventEmitterFactory from parlant.core.customers import CustomerDocumentStore, CustomerStore +from parlant.core.engines.alpha import guideline_proposer from parlant.core.evaluations import ( EvaluationListener, PollingEvaluationListener, @@ -61,6 +62,7 @@ from parlant.core.glossary import GlossaryStore, GlossaryVectorStore from parlant.core.engines.alpha.guideline_proposer import ( GuidelineProposer, + GuidelinePropositionShot, GuidelinePropositionsSchema, ) from parlant.core.engines.alpha.message_event_generator import ( @@ -89,6 +91,7 @@ GuidelineToolAssociationDocumentStore, GuidelineToolAssociationStore, ) +from parlant.core.shots import ShotCollection from parlant.core.tags import TagDocumentStore, TagStore from parlant.core.tools import LocalToolService @@ -255,6 +258,8 @@ async def container( generation_schema, ) + container[ShotCollection[GuidelinePropositionShot]] = guideline_proposer.shot_collection + container[GuidelineProposer] = Singleton(GuidelineProposer) container[GuidelineConnectionProposer] = Singleton(GuidelineConnectionProposer) container[CoherenceChecker] = Singleton(CoherenceChecker) diff --git a/tests/core/stable/engines/alpha/test_guideline_proposer.py b/tests/core/stable/engines/alpha/test_guideline_proposer.py index e251ac075..dbd064969 100644 --- a/tests/core/stable/engines/alpha/test_guideline_proposer.py +++ b/tests/core/stable/engines/alpha/test_guideline_proposer.py @@ -35,6 +35,7 @@ from parlant.core.nlp.generation import SchematicGenerator from parlant.core.engines.alpha.guideline_proposer import ( GuidelineProposer, + GuidelinePropositionShot, GuidelinePropositionsSchema, ) from parlant.core.engines.alpha.guideline_proposition import ( @@ -45,6 +46,7 @@ from parlant.core.logging import Logger from parlant.core.glossary import TermId +from parlant.core.shots import ShotCollection from tests.core.common.utils import create_event_message from tests.test_utilities import SyncAwaiter @@ -223,7 +225,11 @@ def propose_guidelines( terms: Sequence[Term] = [], staged_events: Sequence[EmittedEvent] = [], ) -> Sequence[GuidelineProposition]: - guideline_proposer = GuidelineProposer(context.logger, context.schematic_generator) + guideline_proposer = GuidelineProposer( + context.logger, + context.schematic_generator, + context.container[ShotCollection[GuidelinePropositionShot]], + ) interaction_history = [ create_event_message( diff --git a/tests/core/test_shots.py b/tests/core/test_shots.py new file mode 100644 index 000000000..f4c729f2c --- /dev/null +++ b/tests/core/test_shots.py @@ -0,0 +1,76 @@ +from datetime import datetime, timezone +from lagom import Container + +from parlant.core.agents import 
Agent +from parlant.core.customers import Customer +from parlant.core.engines.alpha.guideline_proposer import ( + GuidelineProposer, + GuidelinePropositionSchema, + GuidelinePropositionShot, + GuidelinePropositionsSchema, +) +from parlant.core.guidelines import GuidelineContent +from parlant.core.sessions import Event, EventId +from parlant.core.shots import ShotCollection + + +async def test_that_appended_shot_is_displayed_in_guideline_proposer_prompt( + container: Container, + agent: Agent, + customer: Customer, +) -> None: + guideline_proposer = container[GuidelineProposer] + shot_collection = container[ShotCollection[GuidelinePropositionShot]] + + guideline_content = GuidelineContent( + condition="A user compliments the product", + action="Tell him to stop being a bitch", + ) + + guideline_proposition_shot = GuidelinePropositionShot( + description="Test Shot Description", + interaction_events=[ + Event( + id=EventId("test_id"), + source="customer", + kind="message", + creation_utc=datetime.now(timezone.utc), + offset=0, + correlation_id="", + data={"message": "This is a very cool feature, man!"}, + deleted=False, + ) + ], + guidelines=[guideline_content], + expected_result=GuidelinePropositionsSchema( + checks=[ + GuidelinePropositionSchema( + guideline_number=1, + condition=guideline_content.condition, + condition_application_rationale="Rationale: The user said that the feature is cool", + condition_applies=True, + action=guideline_content.action, + guideline_previously_applied="fully", + applies_score=8, + ) + ] + ), + ) + + await shot_collection.append(guideline_proposition_shot) + + shots = await shot_collection.list() + prompt = guideline_proposer._format_prompt( + agents=[agent], + customer=customer, + context_variables=[], + interaction_history=[], + staged_events=[], + terms=[], + guidelines={}, + shots=shots, + ) + + assert "Rationale: The user said that the feature is cool" in prompt + assert "This is a very cool feature, man!" 
in prompt
+    assert "Tell him to stop being a bitch" in prompt

From 275018e559ccd01b1c264bc40d7ff5d786a04d18 Mon Sep 17 00:00:00 2001
From: Dor Zohar
Date: Sat, 21 Dec 2024 08:20:24 +0000
Subject: [PATCH 2/6] Delete irrelevant files

---
 schematic_generation_test_cache.json | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 schematic_generation_test_cache.json

diff --git a/schematic_generation_test_cache.json b/schematic_generation_test_cache.json
deleted file mode 100644
index 9e26dfeeb..000000000
--- a/schematic_generation_test_cache.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file

From b0ad7742afa8a5b3f22334ec1e41c5398db41fab Mon Sep 17 00:00:00 2001
From: Dor Zohar
Date: Sat, 21 Dec 2024 11:31:36 +0000
Subject: [PATCH 3/6] Polish tests, add shots functionality to GuidelineProposer,
 and move shot formatting from the Shot dataclass into the proposer

---
 src/parlant/bin/server.py                    |   1 -
 .../core/engines/alpha/guideline_proposer.py | 114 ++++++++++--------
 .../core/engines/alpha/prompt_builder.py     |  23 ----
 src/parlant/core/shots.py                    |   8 +-
 .../engines/alpha/test_guideline_proposer.py |   3 -
 tests/core/test_shots.py                     |  22 +---
 6 files changed, 67 insertions(+), 104 deletions(-)

diff --git a/src/parlant/bin/server.py b/src/parlant/bin/server.py
index ff18aedee..876195988 100755
--- a/src/parlant/bin/server.py
+++ b/src/parlant/bin/server.py
@@ -353,7 +353,6 @@ async def setup_container(nlp_service_name: str) -> AsyncIterator[Container]:
     c[GuidelineProposer] = GuidelineProposer(
         c[Logger],
         c[SchematicGenerator[GuidelinePropositionsSchema]],
-        c[ShotCollection[GuidelinePropositionShot]],
     )
     c[GuidelineConnectionProposer] = GuidelineConnectionProposer(
         c[Logger],
diff --git a/src/parlant/core/engines/alpha/guideline_proposer.py b/src/parlant/core/engines/alpha/guideline_proposer.py
index 07a4fc670..8edd3a64d 100644
--- a/src/parlant/core/engines/alpha/guideline_proposer.py
+++ b/src/parlant/core/engines/alpha/guideline_proposer.py
@@ -63,53 +63,6 @@ class GuidelinePropositionShot(Shot):
     guidelines: Sequence[GuidelineContent]
     expected_result: GuidelinePropositionsSchema
 
-    @override
-    def format(self) -> str:
-        def adapt_event(e: Event) -> JSONSerializable:
-            source_map: dict[EventSource, str] = {
-                "customer": "user",
-                "customer_ui": "frontend_application",
-                "human_agent": "human_service_agent",
-                "human_agent_on_behalf_of_ai_agent": "ai_agent",
-                "ai_agent": "ai_agent",
-                "system": "system-provided",
-            }
-
-            return {
-                "event_kind": e.kind,
-                "event_source": source_map[e.source],
-                "data": e.data,
-            }
-
-        example = ""
-        if self.interaction_events:
-            example += f"""
-- Interaction Events: ###
-{json.dumps([adapt_event(e) for e in self.interaction_events], indent=2)}
-###
-
-"""
-        if self.guidelines:
-            formatted_guidelines = "\n".join(
-                f"{i}) condition: {g.condition}, action: {g.action}"
-                for i, g in enumerate(self.guidelines, start=1)
-            )
-            example += f"""
-- Guidelines: ###
-{formatted_guidelines}
-###
-
-"""
-
-        example += f"""
-- **Expected Result**:
-```json
-{json.dumps(self.expected_result.model_dump(mode="json"), indent=2)}
-```
-"""
-
-        return example
-
 
 @dataclass(frozen=True)
 class ConditionApplicabilityEvaluation:
@@ -141,11 +94,9 @@ def __init__(
         self,
         logger: Logger,
         schematic_generator: SchematicGenerator[GuidelinePropositionsSchema],
-        shot_collection: ShotCollection[GuidelinePropositionShot],
     ) -> None:
         self._logger = logger
         self._schematic_generator = schematic_generator
-        self._shot_collection = shot_collection
 
     async def propose_guidelines(
self, @@ -265,7 +216,7 @@ async def _process_guideline_batch( staged_events=staged_events, terms=terms, guidelines=guidelines_dict, - shots=await self._shot_collection.list(), + shots=await shot_collection.list(), ) with self._logger.operation( @@ -309,6 +260,55 @@ async def _process_guideline_batch( return propositions_generation_response.info, propositions + async def shots(self) -> Sequence[GuidelinePropositionShot]: + return await shot_collection.list() + + def _format_shot(self, shot: GuidelinePropositionShot) -> str: + def adapt_event(e: Event) -> JSONSerializable: + source_map: dict[EventSource, str] = { + "customer": "user", + "customer_ui": "frontend_application", + "human_agent": "human_service_agent", + "human_agent_on_behalf_of_ai_agent": "ai_agent", + "ai_agent": "ai_agent", + "system": "system-provided", + } + + return { + "event_kind": e.kind, + "event_source": source_map[e.source], + "data": e.data, + } + + formatted_shot = "" + if shot.interaction_events: + formatted_shot += f""" +- Interaction Events: ### +{json.dumps([adapt_event(e) for e in shot.interaction_events], indent=2)} +### + +""" + if shot.guidelines: + formatted_guidelines = "\n".join( + f"{i}) condition: {g.condition}, action: {g.action}" + for i, g in enumerate(shot.guidelines, start=1) + ) + formatted_shot += f""" +- Guidelines: ### +{formatted_guidelines} +### + +""" + + formatted_shot += f""" +- **Expected Result**: +```json +{json.dumps(shot.expected_result.model_dump(mode="json"), indent=2)} +``` +""" + + return formatted_shot + def _format_prompt( self, agents: Sequence[Agent], @@ -318,7 +318,7 @@ def _format_prompt( staged_events: Sequence[EmittedEvent], terms: Sequence[Term], guidelines: dict[int, Guideline], - shots: Sequence[Shot], + shots: Sequence[GuidelinePropositionShot], ) -> str: assert len(agents) == 1 @@ -638,7 +638,17 @@ def _format_prompt( >>>>>>> ecf3aa96 (Add few shot feature and implement it on guideline_proposer) """ # noqa ) - builder.add_few_shots(shots) + builder.add_section( + "".join( + f""" +Example #{i}: +{self._format_shot(shot)} + +### +""" + for i, shot in enumerate(shots, start=1) + ) + ) builder.add_agent_identity(agents[0]) builder.add_context_variables(context_variables) builder.add_glossary(terms) diff --git a/src/parlant/core/engines/alpha/prompt_builder.py b/src/parlant/core/engines/alpha/prompt_builder.py index 0177423b9..99963aef5 100644 --- a/src/parlant/core/engines/alpha/prompt_builder.py +++ b/src/parlant/core/engines/alpha/prompt_builder.py @@ -27,7 +27,6 @@ emitted_tool_events_to_dicts, ) from parlant.core.emissions import EmittedEvent -from parlant.core.shots import Shot class BuiltInSection(Enum): @@ -38,7 +37,6 @@ class BuiltInSection(Enum): GUIDELINE_DESCRIPTIONS = auto() GUIDELINES = auto() STAGED_EVENTS = auto() - FEW_SHOTS = auto() class SectionStatus(Enum): @@ -240,24 +238,3 @@ def add_staged_events( ) return self - - def add_few_shots( - self, - shots: Sequence[Shot], - ) -> PromptBuilder: - if shots: - self.add_section( - name=BuiltInSection.FEW_SHOTS, - content="".join( - f""" -Example #{i}: -{shot.format()} - - -""" - for i, shot in enumerate(shots, start=1) - ), - status=SectionStatus.ACTIVE, - ) - - return self diff --git a/src/parlant/core/shots.py b/src/parlant/core/shots.py index 70f97d6ec..0381dbb48 100644 --- a/src/parlant/core/shots.py +++ b/src/parlant/core/shots.py @@ -1,4 +1,3 @@ -from abc import abstractmethod from dataclasses import dataclass from typing import TypeVar, Generic, Sequence @@ -6,10 +5,7 @@ @dataclass class Shot: 
description: str - """An explanation of what makes this shot interesting.""" - - @abstractmethod - def format(self) -> str: ... + """An explanation of what makes this shot interesting""" TShot = TypeVar("TShot", bound=Shot) @@ -39,7 +35,7 @@ async def remove( self, shot: TShot, ) -> None: - self._shots = [s for s in self._shots if s != shot] + self._shots.remove(shot) async def clear(self) -> None: self._shots.clear() diff --git a/tests/core/stable/engines/alpha/test_guideline_proposer.py b/tests/core/stable/engines/alpha/test_guideline_proposer.py index dbd064969..6411574a4 100644 --- a/tests/core/stable/engines/alpha/test_guideline_proposer.py +++ b/tests/core/stable/engines/alpha/test_guideline_proposer.py @@ -35,7 +35,6 @@ from parlant.core.nlp.generation import SchematicGenerator from parlant.core.engines.alpha.guideline_proposer import ( GuidelineProposer, - GuidelinePropositionShot, GuidelinePropositionsSchema, ) from parlant.core.engines.alpha.guideline_proposition import ( @@ -46,7 +45,6 @@ from parlant.core.logging import Logger from parlant.core.glossary import TermId -from parlant.core.shots import ShotCollection from tests.core.common.utils import create_event_message from tests.test_utilities import SyncAwaiter @@ -228,7 +226,6 @@ def propose_guidelines( guideline_proposer = GuidelineProposer( context.logger, context.schematic_generator, - context.container[ShotCollection[GuidelinePropositionShot]], ) interaction_history = [ diff --git a/tests/core/test_shots.py b/tests/core/test_shots.py index f4c729f2c..ddb2f73c5 100644 --- a/tests/core/test_shots.py +++ b/tests/core/test_shots.py @@ -1,8 +1,6 @@ from datetime import datetime, timezone from lagom import Container -from parlant.core.agents import Agent -from parlant.core.customers import Customer from parlant.core.engines.alpha.guideline_proposer import ( GuidelineProposer, GuidelinePropositionSchema, @@ -16,15 +14,13 @@ async def test_that_appended_shot_is_displayed_in_guideline_proposer_prompt( container: Container, - agent: Agent, - customer: Customer, ) -> None: guideline_proposer = container[GuidelineProposer] shot_collection = container[ShotCollection[GuidelinePropositionShot]] guideline_content = GuidelineContent( condition="A user compliments the product", - action="Tell him to stop being a bitch", + action="Thank him with in Portuguese", ) guideline_proposition_shot = GuidelinePropositionShot( @@ -59,18 +55,6 @@ async def test_that_appended_shot_is_displayed_in_guideline_proposer_prompt( await shot_collection.append(guideline_proposition_shot) - shots = await shot_collection.list() - prompt = guideline_proposer._format_prompt( - agents=[agent], - customer=customer, - context_variables=[], - interaction_history=[], - staged_events=[], - terms=[], - guidelines={}, - shots=shots, - ) + shots = await guideline_proposer.shots() - assert "Rationale: The user said that the feature is cool" in prompt - assert "This is a very cool feature, man!" 
in prompt - assert "Tell him to stop being a bitch" in prompt + assert guideline_proposition_shot in shots From 588044def61f459e1c6c29353c6293b17651f8e5 Mon Sep 17 00:00:00 2001 From: Dor Zohar Date: Sat, 21 Dec 2024 16:38:49 +0000 Subject: [PATCH 4/6] Implement Shot mechanism in tool caller --- .../core/engines/alpha/guideline_proposer.py | 8 +- src/parlant/core/engines/alpha/tool_caller.py | 549 ++++++++++-------- .../engines/alpha/tool_event_generator.py | 6 +- tests/conftest.py | 4 +- tests/core/test_shots.py | 33 +- 5 files changed, 343 insertions(+), 257 deletions(-) diff --git a/src/parlant/core/engines/alpha/guideline_proposer.py b/src/parlant/core/engines/alpha/guideline_proposer.py index 8edd3a64d..e1fa9d360 100644 --- a/src/parlant/core/engines/alpha/guideline_proposer.py +++ b/src/parlant/core/engines/alpha/guideline_proposer.py @@ -216,7 +216,7 @@ async def _process_guideline_batch( staged_events=staged_events, terms=terms, guidelines=guidelines_dict, - shots=await shot_collection.list(), + shots=await self.shots(), ) with self._logger.operation( @@ -894,19 +894,19 @@ def _make_event(e_id: str, source: EventSource, message: str) -> Event: _baseline_shots: Sequence[GuidelinePropositionShot] = [ GuidelinePropositionShot( - description="Guidelines application when the customer purchases a subscription and asks about data security.", + description="", interaction_events=example_1_events, guidelines=example_1_guidelines, expected_result=example_1_expected, ), GuidelinePropositionShot( - description="Guidelines application when the customer is looking for a job.", + description="", interaction_events=example_2_events, guidelines=example_2_guidelines, expected_result=example_2_expected, ), GuidelinePropositionShot( - description="Guidelines application when the customer asks about stock prices and the weather.", + description="", interaction_events=example_3_events, guidelines=example_3_guidelines, expected_result=example_3_expected, diff --git a/src/parlant/core/engines/alpha/tool_caller.py b/src/parlant/core/engines/alpha/tool_caller.py index 5587b20d9..d95cdb439 100644 --- a/src/parlant/core/engines/alpha/tool_caller.py +++ b/src/parlant/core/engines/alpha/tool_caller.py @@ -21,6 +21,7 @@ from typing import Any, Mapping, NewType, Optional, Sequence, cast from parlant.core import async_utils +from parlant.core.shots import Shot, ShotCollection from parlant.core.tools import Tool, ToolContext from parlant.core.agents import Agent from parlant.core.common import JSONSerializable, generate_id, DefaultBaseModel @@ -68,6 +69,12 @@ class ToolCallInferenceSchema(DefaultBaseModel): tool_calls_for_candidate_tool: list[ToolCallEvaluation] +@dataclass +class ToolCallerInferenceShot(Shot): + context: str + expected_result: ToolCallInferenceSchema + + @dataclass(frozen=True) class ToolCall: id: ToolCallId @@ -185,6 +192,7 @@ async def _infer_tool_call_batch( batch, reference_tools, staged_events, + await self.shots(), ) with self._logger.operation(f"Tool classification for tool_id '{batch[0]}'"): @@ -221,6 +229,22 @@ async def execute_tool_calls( return tool_results + async def shots(self) -> Sequence[ToolCallerInferenceShot]: + return await shot_collection.list() + + def _format_shot( + self, + shot: ToolCallerInferenceShot, + ) -> str: + return f""" +- **Context**: +{shot.context} + +- **Expected Result**: +```json +{json.dumps(shot.expected_result.model_dump(mode="json"), indent=2)} +```""" + def _format_tool_call_inference_prompt( self, agents: Sequence[Agent], @@ -231,6 +255,7 @@ def 
_format_tool_call_inference_prompt( batch: tuple[ToolId, Tool, list[GuidelineProposition]], reference_tools: Sequence[tuple[ToolId, Tool]], staged_events: Sequence[EmittedEvent], + shots: Sequence[ToolCallerInferenceShot], ) -> str: assert len(agents) == 1 @@ -309,253 +334,18 @@ def _format_tool_call_inference_prompt( EXAMPLES ----------------- +""" # noqa + ) + builder.add_section( + "\n".join( + f""" +Example {i}: ### +{self._format_shot(shot)} ### -Example 1: - -Context - the id of the customer is 12345, and check_balance(12345) is already listed as a staged tool call -### -```json -{{ - "last_customer_message": "Do I have enough money in my account to get a taxi from New York to Newark?", - "most_recent_customer_inquiry_or_need": "Checking customer's balance, comparing it to the price of a taxi from New York to Newark, and report the result to the customer", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "check_balance", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "We need the client's current balance to respond to their question", - "applicability_score": 9, - "arguments": {{ - "customer_id": "12345" - }}, - "same_call_is_already_staged": true, - "comparison_with_rejected_tools_including_references_to_subtleties": "There are no tools in the list of rejected tools", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": false, - "should_run": false - }} - ] -}} -``` - -### -Example 2: - -Context - the id of the customer is 12345, and check_balance(12345) is listed as the only staged tool call -### -```json -{{ - "last_customer_message": "Do I have enough money in my account to get a taxi from New York to Newark?", - "most_recent_customer_inquiry_or_need": "Checking customer's balance, comparing it to the price of a taxi from New York to Newark, and report the result to the customer", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "ping_supervisor", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - - "applicability_rationale": "There is no reason to notify the supervisor of anything", - "applicability_score": 1, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "There are no tools in the list of rejected tools", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": false, - "should_run": false - }} - ] -}} -``` - -### -Example 3: - -Context - the id of the customer is 12345, and check_balance(12345) is the only staged tool call, assume some irrelevant reference tools exists -### -```json -{{ - "last_customer_message": "Do I have enough money in my account to get a taxi from New York to Newark?", - "most_recent_customer_inquiry_or_need": "Checking customer's balance, comparing it to the price of a taxi from New York to Newark, and report the result to the customer", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "check_ride_price", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "We need to know the price of a ride from New York to Newark to respond to the customer", - "applicability_score": 9, - "arguments": {{ - "origin": "New York", - "Destination": "Newark" - }}, - "same_call_is_already_staged": false, - 
"comparison_with_rejected_tools_including_references_to_subtleties": "None of the available reference tools are deemed more suitable for the candidate tool’s application", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": false, - "should_run": true - }} - ] -}} -``` - -### -Example 4: -Context - the candidate tool is check_calories(): returns the number of calories in a the product -- one reference tool of check_stock(): returns all menu items that are currently in stock -### -```json -{{ - "last_customer_message": "Which pizza has more calories, the classic margherita or the deep dish?", - "most_recent_customer_inquiry_or_need": "Checking the number of calories in two types of pizza and replying with which one has more", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "check_calories", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "We need to check how many calories are in the margherita pizza", - "applicability_score": 9, - "arguments": {{ - "product_name": "margherita" - }}, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "None of the available reference tools are deemed more suitable for the candidate tool’s application", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": false, - "should_run": true - - }}, - {{ - "applicability_rationale": "We need to check how many calories are in the deep dish pizza", - "applicability_score": 9, - "arguments": {{ - "product_name": "deep dish" - }}, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "None of the available reference tools are deemed more suitable for the candidate tool’s application", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": false, - "should_run": true - }} - ] -}} -``` - -### -Example 5: -Context - the candidate tool is check_vehicle_price(model: str), and reference tool of - check_motorcycle_price(model: str) -### -```json -{{ - "last_customer_message": "What's your price for a Harley-Davidson Street Glide?", - "most_recent_customer_inquiry_or_need": "Checking the price of a Harley-Davidson Street Glide motorcycle", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "check_motorcycle_price", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "we need to check for the price of a specific motorcycle model", - "applicability_score": 9, - "arguments": {{ - "model": "Harley-Davidson Street Glide" - }}, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "candidate tool is more specialized for this use case than the rejected tools", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": false, - "better_rejected_tool_name": "check_motorcycle_price", - "better_rejected_tool_rationale": "the only reference tool is less relevant than the candidate tool, since the candidate tool is designed specifically for motorcycle models, and not just general vehicles.", - "should_run": true - }} - ] -}} -``` - -### -Example 6: -Context - the candidate tool is check_motorcycle_price(model: str), and one 
reference tool of - check_vehicle_price(model: str) -### -```json -{{ - "last_customer_message": "What's your price for a Harley-Davidson Street Glide?", - "most_recent_customer_inquiry_or_need": "Checking the price of a Harley-Davidson Street Glide motorcycle", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "check_vehicle_price", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "we need to check for the price of a specific vehicle - a Harley-Davidson Street Glide", - "applicability_score": 8, - "arguments": {{ - "model": "Harley-Davidson Street Glide" - }}, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "not as good a fit as check_motorcycle_price", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": true, - "better_rejected_tool_name": "check_motorcycle_price", - "better_rejected_tool_rationale": "check_motorcycle_price applies specifically for motorcycles, which is better fitting for this case compared to the more general check_vehicle_price", - "should_run": false - }} - ] -}} -``` -### -Example 7: -Context - the candidate tool is check_indoor_temperature(room: str), and reference tool of check_temperature(location: str, type: str) -### -```json -{{ - "last_customer_message": "What's the temperature in the living room right now?", - "most_recent_customer_inquiry_or_need": "Checking the current temperature in the living room", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "check_indoor_temperature", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "need to check the current temperature in a specific room", - "applicability_score": 7, - "arguments": {{ - "room": "living room" - }}, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "not as good a fit as check_temperature", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": true, - "better_rejected_tool_name": "check_temperature", - "better_rejected_tool_rationale": "check_temperature is more versatile and can handle both indoor and outdoor locations with the type parameter, making it more suitable than the room-specific tool", - "should_run": false - }} - ] -}} -``` -### -Example 8: -Context - the candidate tool is search_product(query: str), and reference tool of search_electronics(query: str, specifications: dict) -### -```json -{{ - "last_customer_message": "I'm looking for a gaming laptop with at least 16GB RAM and an RTX 3080", - "most_recent_customer_inquiry_or_need": "Searching for a gaming laptop with specific technical requirements", - "most_recent_customer_inquiry_or_need_was_already_resolved": false, - "name": "search_product", - "subtleties_to_be_aware_of": "", - "tool_calls_for_candidate_tool": [ - {{ - "applicability_rationale": "need to search for a product with specific technical requirements", - "applicability_score": 6, - "arguments": {{ - "query": "gaming laptop RTX 3080 16GB RAM" - }}, - "same_call_is_already_staged": false, - "comparison_with_rejected_tools_including_references_to_subtleties": "not as good a fit as search_electronics", - "relevant_subtleties": "", - "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": true, - 
"better_rejected_tool_name": "search_electronics", - "better_rejected_tool_rationale": "search_electronics is more appropriate as it allows for structured specification of technical requirements rather than relying on text search, which will provide more accurate results for electronic products", - "should_run": false - }} - ] -}} -``` -""" # noqa +""" + for i, shot in enumerate(shots, start=1) + ) ) builder.add_context_variables(context_variables) builder.add_glossary(terms) @@ -633,7 +423,8 @@ def _format_tool_call_inference_prompt( """ ) - return builder.build() + prompt = builder.build() + return prompt def _add_tool_definitions_section( self, candidate_tool: tuple[ToolId, Tool], reference_tools: Sequence[tuple[ToolId, Tool]] @@ -796,3 +587,269 @@ async def _run_tool( "control": {}, }, ) + + +_baseline_shots: Sequence[ToolCallerInferenceShot] = [ + ToolCallerInferenceShot( + description="", + context="the id of the customer is 12345, and check_balance(12345) is already listed as a staged tool call", + expected_result=ToolCallInferenceSchema( + last_customer_message="Do I have enough money in my account to get a taxi from New York to Newark?", + most_recent_customer_inquiry_or_need=( + "Checking customer's balance, comparing it to the price of a taxi from New York to Newark, " + "and report the result to the customer" + ), + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="check_balance", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "We need the client's current balance to respond to their question", + "applicability_score": 9, + "arguments": {"customer_id": "12345"}, + "same_call_is_already_staged": True, + "comparison_with_rejected_tools_including_references_to_subtleties": ( + "There are no tools in the list of rejected tools" + ), + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": False, + "better_rejected_tool_name": None, + "better_rejected_tool_rationale": None, + "should_run": False, + } + ], + ), + ), + ToolCallerInferenceShot( + description="", + context="the id of the customer is 12345, and check_balance(12345) is listed as the only staged tool call", + expected_result=ToolCallInferenceSchema( + last_customer_message="Do I have enough money in my account to get a taxi from New York to Newark?", + most_recent_customer_inquiry_or_need=( + "Checking customer's balance, comparing it to the price of a taxi from New York to Newark, " + "and report the result to the customer" + ), + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="ping_supervisor", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "There is no reason to notify the supervisor of anything", + "applicability_score": 1, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": "There are no tools in the list of rejected tools", + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": False, + "should_run": False, + } + ], + ), + ), + ToolCallerInferenceShot( + description="", + context=( + "the id of the customer is 12345, and check_balance(12345) is the only staged tool call; " + "some irrelevant reference tools exist" + ), + expected_result=ToolCallInferenceSchema( + last_customer_message="Do I have enough money in my account to get a taxi from New York to Newark?", + 
most_recent_customer_inquiry_or_need=( + "Checking customer's balance, comparing it to the price of a taxi from New York to Newark, " + "and report the result to the customer" + ), + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="check_ride_price", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "We need to know the price of a ride from New York to Newark to respond to the customer", + "applicability_score": 9, + "arguments": {"origin": "New York", "Destination": "Newark"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": ( + "None of the available reference tools are deemed more suitable for the candidate tool’s application" + ), + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": False, + "better_rejected_tool_name": None, + "better_rejected_tool_rationale": None, + "should_run": True, + } + ], + ), + ), + ToolCallerInferenceShot( + description="", + context=( + "the candidate tool is check_calories(): returns the number of calories in a product; " + "one reference tool is check_stock()" + ), + expected_result=ToolCallInferenceSchema( + last_customer_message="Which pizza has more calories, the classic margherita or the deep dish?", + most_recent_customer_inquiry_or_need=( + "Checking the number of calories in two types of pizza and replying with which one has more" + ), + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="check_calories", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "We need to check how many calories are in the margherita pizza", + "applicability_score": 9, + "arguments": {"product_name": "margherita"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": ( + "None of the available reference tools are deemed more suitable for the candidate tool’s application" + ), + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": False, + "better_rejected_tool_name": None, + "better_rejected_tool_rationale": None, + "should_run": True, + }, + { + "applicability_rationale": "We need to check how many calories are in the deep dish pizza", + "applicability_score": 9, + "arguments": {"product_name": "deep dish"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": ( + "None of the available reference tools are deemed more suitable for the candidate tool’s application" + ), + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": False, + "better_rejected_tool_name": None, + "better_rejected_tool_rationale": None, + "should_run": True, + }, + ], + ), + ), + ToolCallerInferenceShot( + description="", + context=( + "the candidate tool is check_vehicle_price(model: str), and reference tool is check_motorcycle_price(model: str)" + ), + expected_result=ToolCallInferenceSchema( + last_customer_message="What's your price for a Harley-Davidson Street Glide?", + most_recent_customer_inquiry_or_need="Checking the price of a Harley-Davidson Street Glide motorcycle", + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="check_motorcycle_price", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "we 
need to check for the price of a specific motorcycle model", + "applicability_score": 9, + "arguments": {"model": "Harley-Davidson Street Glide"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": ( + "candidate tool is more specialized for this use case than the rejected tools" + ), + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": False, + "better_rejected_tool_name": "check_motorcycle_price", + "better_rejected_tool_rationale": ( + "the only reference tool is less relevant than the candidate tool, " + "since the candidate tool is designed specifically for motorcycle models, " + "and not just general vehicles." + ), + "should_run": True, + } + ], + ), + ), + ToolCallerInferenceShot( + description="", + context=( + "the candidate tool is check_motorcycle_price(model: str), and one reference tool is check_vehicle_price(model: str)" + ), + expected_result=ToolCallInferenceSchema( + last_customer_message="What's your price for a Harley-Davidson Street Glide?", + most_recent_customer_inquiry_or_need="Checking the price of a Harley-Davidson Street Glide motorcycle", + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="check_vehicle_price", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "we need to check for the price of a specific vehicle - a Harley-Davidson Street Glide", + "applicability_score": 8, + "arguments": {"model": "Harley-Davidson Street Glide"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": "not as good a fit as check_motorcycle_price", + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": True, + "better_rejected_tool_name": "check_motorcycle_price", + "better_rejected_tool_rationale": ( + "check_motorcycle_price applies specifically for motorcycles, " + "which is better fitting for this case compared to the more general check_vehicle_price" + ), + "should_run": False, + } + ], + ), + ), + ToolCallerInferenceShot( + description="", + context=( + "the candidate tool is check_indoor_temperature(room: str), and reference tool is check_temperature(location: str, type: str)" + ), + expected_result=ToolCallInferenceSchema( + last_customer_message="What's the temperature in the living room right now?", + most_recent_customer_inquiry_or_need="Checking the current temperature in the living room", + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="check_indoor_temperature", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "need to check the current temperature in a specific room", + "applicability_score": 7, + "arguments": {"room": "living room"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": "not as good a fit as check_temperature", + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": True, + "better_rejected_tool_name": "check_temperature", + "better_rejected_tool_rationale": ( + "check_temperature is more versatile and can handle both indoor and outdoor locations " + "with the type parameter, making it more suitable than the room-specific tool" + ), + "should_run": False, + } + ], + ), + ), + ToolCallerInferenceShot( + description="", + 
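+        # Here a generic search_product candidate is outscored by the more
+        # structured search_electronics reference tool, so should_run is False
+        # despite a reasonable applicability score.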
context=( + "the candidate tool is search_product(query: str), and reference tool is " + "search_electronics(query: str, specifications: dict)" + ), + expected_result=ToolCallInferenceSchema( + last_customer_message="I'm looking for a gaming laptop with at least 16GB RAM and an RTX 3080", + most_recent_customer_inquiry_or_need="Searching for a gaming laptop with specific technical requirements", + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="search_product", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[ + { + "applicability_rationale": "need to search for a product with specific technical requirements", + "applicability_score": 6, + "arguments": {"query": "gaming laptop RTX 3080 16GB RAM"}, + "same_call_is_already_staged": False, + "comparison_with_rejected_tools_including_references_to_subtleties": "not as good a fit as search_electronics", + "relevant_subtleties": "", + "a_more_fitting_tool_was_rejected_for_some_reason_and_potentially_despite_a_found_subtlety": True, + "better_rejected_tool_name": "search_electronics", + "better_rejected_tool_rationale": ( + "search_electronics is more appropriate as it allows for structured " + "specification of technical requirements rather than relying on text search, " + "which will provide more accurate results for electronic products" + ), + "should_run": False, + } + ], + ), + ), +] + + +shot_collection = ShotCollection[ToolCallerInferenceShot](_baseline_shots) diff --git a/src/parlant/core/engines/alpha/tool_event_generator.py b/src/parlant/core/engines/alpha/tool_event_generator.py index 4bfef20b3..addcdeade 100644 --- a/src/parlant/core/engines/alpha/tool_event_generator.py +++ b/src/parlant/core/engines/alpha/tool_event_generator.py @@ -50,7 +50,7 @@ def __init__( self._correlator = correlator self._service_registry = service_registry - self._tool_caller = ToolCaller(logger, service_registry, schematic_generator) + self.tool_caller = ToolCaller(logger, service_registry, schematic_generator) async def generate_events( self, @@ -71,7 +71,7 @@ async def generate_events( self._logger.debug("Skipping tool calling; no tools associated with guidelines found") return None - inference_tool_calls_result = await self._tool_caller.infer_tool_calls( + inference_tool_calls_result = await self.tool_caller.infer_tool_calls( agents, context_variables, interaction_history, @@ -85,7 +85,7 @@ async def generate_events( if not tool_calls: return ToolEventGenerationsResult(inference_tool_calls_result.batch_generations, []) - tool_results = await self._tool_caller.execute_tool_calls( + tool_results = await self.tool_caller.execute_tool_calls( ToolContext( agent_id=agents[0].id, session_id=session_id, diff --git a/tests/conftest.py b/tests/conftest.py index 49fb69c03..c69ec21c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,6 +32,7 @@ from parlant.core.emissions import EventEmitterFactory from parlant.core.customers import CustomerDocumentStore, CustomerStore from parlant.core.engines.alpha import guideline_proposer +from parlant.core.engines.alpha import tool_caller from parlant.core.evaluations import ( EvaluationListener, PollingEvaluationListener, @@ -69,7 +70,7 @@ MessageEventGenerator, MessageEventSchema, ) -from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema +from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema, ToolCallerInferenceShot from parlant.core.engines.alpha.tool_event_generator import ToolEventGenerator from parlant.core.engines.types import 
Engine from parlant.core.services.indexing.behavioral_change_evaluation import ( @@ -259,6 +260,7 @@ async def container( ) container[ShotCollection[GuidelinePropositionShot]] = guideline_proposer.shot_collection + container[ShotCollection[ToolCallerInferenceShot]] = tool_caller.shot_collection container[GuidelineProposer] = Singleton(GuidelineProposer) container[GuidelineConnectionProposer] = Singleton(GuidelineConnectionProposer) diff --git a/tests/core/test_shots.py b/tests/core/test_shots.py index ddb2f73c5..e0714a75a 100644 --- a/tests/core/test_shots.py +++ b/tests/core/test_shots.py @@ -7,6 +7,8 @@ GuidelinePropositionShot, GuidelinePropositionsSchema, ) +from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema, ToolCallerInferenceShot +from parlant.core.engines.alpha.tool_event_generator import ToolEventGenerator from parlant.core.guidelines import GuidelineContent from parlant.core.sessions import Event, EventId from parlant.core.shots import ShotCollection @@ -23,7 +25,7 @@ async def test_that_appended_shot_is_displayed_in_guideline_proposer_prompt( action="Thank him with in Portuguese", ) - guideline_proposition_shot = GuidelinePropositionShot( + new_shot = GuidelinePropositionShot( description="Test Shot Description", interaction_events=[ Event( @@ -53,8 +55,33 @@ async def test_that_appended_shot_is_displayed_in_guideline_proposer_prompt( ), ) - await shot_collection.append(guideline_proposition_shot) + await shot_collection.append(new_shot) shots = await guideline_proposer.shots() - assert guideline_proposition_shot in shots + assert new_shot in shots + + +async def test_that_appended_shot_is_displayed_in_tool_caller_shots( + container: Container, +) -> None: + tool_caller = container[ToolEventGenerator].tool_caller + shot_collection = container[ShotCollection[ToolCallerInferenceShot]] + + new_shot = ToolCallerInferenceShot( + description="Test Shot Description", + context="Test shot - checking if appended shot is reflected by tool_caller.shots()", + expected_result=ToolCallInferenceSchema( + last_customer_message="Testing shot append", + most_recent_customer_inquiry_or_need="Verifying tool caller logic", + most_recent_customer_inquiry_or_need_was_already_resolved=False, + name="test_tool", + subtleties_to_be_aware_of="", + tool_calls_for_candidate_tool=[], + ), + ) + + await shot_collection.append(new_shot) + + shots = await tool_caller.shots() + assert new_shot in shots From 3f7040ba889b3116462582b608a7222c4dde52fc Mon Sep 17 00:00:00 2001 From: Dor Zohar Date: Sat, 21 Dec 2024 17:02:27 +0000 Subject: [PATCH 5/6] Implement Shot mechanism in message generator --- src/parlant/bin/server.py | 7 +- .../core/engines/alpha/guideline_proposer.py | 9 +- .../engines/alpha/message_event_generator.py | 660 ++++++++++-------- src/parlant/core/engines/alpha/tool_caller.py | 8 +- tests/conftest.py | 5 + tests/core/test_shots.py | 38 + 6 files changed, 426 insertions(+), 301 deletions(-) diff --git a/src/parlant/bin/server.py b/src/parlant/bin/server.py index 876195988..41a726487 100755 --- a/src/parlant/bin/server.py +++ b/src/parlant/bin/server.py @@ -32,6 +32,8 @@ from parlant.adapters.vector_db.chroma import ChromaDatabase from parlant.core.engines.alpha import guideline_proposer +from parlant.core.engines.alpha import tool_caller +from parlant.core.engines.alpha import message_event_generator from parlant.core.nlp.service import NLPService from parlant.core.shots import ShotCollection from parlant.core.tags import TagDocumentStore, TagStore @@ -77,7 +79,7 @@ 
GuidelineToolAssociationDocumentStore,
     GuidelineToolAssociationStore,
 )
-from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema
+from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema, ToolCallerInferenceShot
 from parlant.core.engines.alpha.guideline_proposer import (
     GuidelineProposer,
     GuidelinePropositionShot,
@@ -85,6 +87,7 @@
 )
 from parlant.core.engines.alpha.message_event_generator import (
     MessageEventGenerator,
+    MessageEventGeneratorShot,
     MessageEventSchema,
 )
 from parlant.core.engines.alpha.tool_event_generator import ToolEventGenerator
@@ -349,6 +352,8 @@ async def setup_container(nlp_service_name: str) -> AsyncIterator[Container]:
     ] = await nlp_service.get_schematic_generator(GuidelineConnectionPropositionsSchema)
 
     c[ShotCollection[GuidelinePropositionShot]] = guideline_proposer.shot_collection
+    c[ShotCollection[ToolCallerInferenceShot]] = tool_caller.shot_collection
+    c[ShotCollection[MessageEventGeneratorShot]] = message_event_generator.shot_collection
 
     c[GuidelineProposer] = GuidelineProposer(
         c[Logger],
diff --git a/src/parlant/core/engines/alpha/guideline_proposer.py b/src/parlant/core/engines/alpha/guideline_proposer.py
index e1fa9d360..aeb19c974 100644
--- a/src/parlant/core/engines/alpha/guideline_proposer.py
+++ b/src/parlant/core/engines/alpha/guideline_proposer.py
@@ -396,7 +397,3 @@ def _format_prompt(
 IMPORTANT: Some guidelines include multiple actions. If only a portion of those actions were fulfilled earlier in the conversation,
 output "fully" for guideline_previously_applied, and treat the guideline as though it has been fully executed.
 In such cases, re-apply the guideline only if its condition becomes true again later in the conversation, unless it is marked as continuous.
-
-Examples of Condition Evaluations:
--------------------
-<<<<<<< HEAD
@@ -636,10 +633,12 @@ def _format_prompt(
 ```
-=======
->>>>>>> ecf3aa96 (Add few shot feature and implement it on guideline_proposer)
 """  # noqa
     )
     builder.add_section(
-        "".join(
+        """
+Examples of Condition Evaluations:
+-------------------
+"""
+        + "".join(
             f"""
 Example #{i}:
 {self._format_shot(shot)}
diff --git a/src/parlant/core/engines/alpha/message_event_generator.py b/src/parlant/core/engines/alpha/message_event_generator.py
index ffa40b964..fce416b6f 100644
--- a/src/parlant/core/engines/alpha/message_event_generator.py
+++ b/src/parlant/core/engines/alpha/message_event_generator.py
@@ -30,6 +30,7 @@
 from parlant.core.sessions import Event
 from parlant.core.common import DefaultBaseModel
 from parlant.core.logging import Logger
+from parlant.core.shots import Shot, ShotCollection
 from parlant.core.tools import ToolId
 
 
@@ -74,6 +75,11 @@ class MessageEventSchema(DefaultBaseModel):
     revisions: list[Revision]
 
 
+@dataclass
+class MessageEventGeneratorShot(Shot):
+    expected_result: MessageEventSchema
+
+
 class MessageEventGenerator:
     def __init__(
         self,
@@ -85,6 +91,9 @@ def __init__(
         self._correlator = correlator
         self._schematic_generator = schematic_generator
 
+    async def shots(self) -> Sequence[MessageEventGeneratorShot]:
+        return await shot_collection.list()
+
     async def generate_events(
         self,
         event_emitter: EventEmitter,
@@ -130,6 +139,7 @@ async def generate_events(
             ordinary_guideline_propositions=ordinary_guideline_propositions,
             tool_enabled_guideline_propositions=tool_enabled_guideline_propositions,
             staged_events=staged_events,
+            shots=await self.shots(),
         )
 
         self._logger.debug(f"Message production 
prompt:\n{prompt}") @@ -220,6 +230,16 @@ def get_guideline_propositions_text( ### """ + def _format_shot( + self, + shot: MessageEventGeneratorShot, + ) -> str: + return f""" +- **Expected Result**: +```json +{json.dumps(shot.expected_result.model_dump(mode="json"), indent=2)} +```""" + def _format_prompt( self, agents: Sequence[Agent], @@ -230,6 +250,7 @@ def _format_prompt( ordinary_guideline_propositions: Sequence[GuidelineProposition], tool_enabled_guideline_propositions: Mapping[GuidelineProposition, Sequence[ToolId]], staged_events: Sequence[EmittedEvent], + shots: Sequence[MessageEventGeneratorShot], ) -> str: assert len(agents) == 1 builder = PromptBuilder() @@ -333,306 +354,22 @@ def _format_prompt( In cases of conflict, prioritize the business's values and ensure your decisions align with their overarching goals. +""" # noqa + ) + builder.add_section( + """ EXAMPLES ----------------- +""" + + "\n".join( + f""" +Example {i} - {shot.description}: ### +{self._format_shot(shot)} ### -Example 1: A reply that took critique in a few revisions to get right: ### -{{ - "last_message_of_customer": "Hi, I'd like to know the schedule for the next trains to Boston, please.", - "produced_reply": true, - "guidelines": [ - "When the customer asks for train schedules, provide them accurately and concisely." - ], - "insights": [ - "Use markdown format when applicable." - ], - "evaluation_for_each_instruction": [ - {{ - "number": 1, - "instruction": "When the customer asks for train schedules, provide them accurately and concisely.", - "evaluation": "The customer requested train schedules, so I need to respond with accurate timing information.", - "data_available": "Yes, the train schedule data is available." - }}, - {{ - "number": 2, - "instruction": "Use markdown format when applicable.", - "evaluation": "Markdown formatting makes the schedule clearer and more readable.", - "data_available": "Not specifically needed, but markdown format can be applied to any response." - }} - ], - "revisions": [ - {{ - "revision_number": 1, - "content": "Train Schedule:\nTrain 101 departs at 10:00 AM and arrives at 12:30 PM.\nTrain 205 departs at 1:00 PM and arrives at 3:45 PM.", - "instructions_followed": [ - "#1; When the customer asks for train schedules, provide them accurately and concisely." - ], - "instructions_broken": [ - "#2; Did not use markdown format when applicable." - ], - "is_repeat_message": false, - "followed_all_instructions": false, - "instructions_broken_due_to_missing_data": false, - "instructions_broken_only_due_to_prioritization": false - }}, - {{ - "revision_number": 2, - "content": "| Train | Departure | Arrival |\n|-------|-----------|---------|\n| 101 | 10:00 AM | 12:30 PM |\n| 205 | 1:00 PM | 3:45 PM |", - "instructions_followed": [ - "#1; When the customer asks for train schedules, provide them accurately and concisely.", - "#2; Use markdown format when applicable." 
- ], - "instructions_broken": [], - "is_repeat_message": false, - "followed_all_instructions": true - }} - ] -}} -### - -Example 2: A reply where one instruction was prioritized over another: ### -{{ - "last_message_of_customer": "", - "guidelines": [ - "When the customer chooses and orders a burger, then provide it", - "When the customer chooses specific ingredients on the burger, only provide those ingredients if we have them fresh in stock; otherwise, reject the order" - ], - "insights": [], - "evaluation_for_each_instruction": [ - {{ - "number": 1, - "instruction": "When the customer chooses and orders a burger, then provide it", - "evaluation": "This guideline currently applies, so I need to provide the customer with a burger.", - "data_available": "The burger choice is available in the interaction" - }}, - {{ - "number": 2, - "instruction": "When the customer chooses specific ingredients on the burger, only provide those ingredients if we have them fresh in stock; otherwise, reject the order.", - "evaluation": "The customer chose cheese on the burger, but all of the cheese we currently have is expired", - "data_available": "The relevant stock availability is given in the tool calls' data. Our cheese has expired." - }} - ], - "revisions": [ - {{ - "revision_number": 1, - "content": "I'd be happy to prepare your burger as soon as we restock the requested toppings.", - "instructions_followed": [ - "#2; upheld food quality and did not go on to preparing the burger without fresh toppings." - ], - "instructions_broken": [ - "#1; did not provide the burger with requested toppings immediately due to the unavailability of fresh ingredients." - ], - "is_repeat_message": false, - "followed_all_instructions": false, - "instructions_broken_only_due_to_prioritization": true, - "prioritization_rationale": "Given the higher priority score of guideline 2, maintaining food quality standards before serving the burger is prioritized over immediate service.", - "instructions_broken_due_to_missing_data": false - }} - ] -}} -### - - -Example 3: Non-Adherence Due to Missing Data: ### -{{ - "last_message_of_customer": "Hi there, can I get something to drink? What do you have on tap?", - "guidelines": [ - "When the customer asks for a drink, check the menu and offer what's on it" - ], - "insights": [ - "Do not state factual information that you do not know or are not sure about." - ], - "evaluation_for_each_instruction": [ - {{ - "number": 1, - "instruction": "When the customer asks for a drink, check the menu and offer what's on it", - "evaluation": "The customer did ask for a drink, so I should check the menu to see what's available.", - "data_available": "No, I don't have the menu info in the interaction or tool calls" - }}, - {{ - "number": 2, - "instruction": "Do not state factual information that you do not know or are not sure about", - "evaluation": "There's no information about what we have on tap, so I should not offer any specific option.", - "data_available": "No, the list of available drinks is not available to me" - }} - ], - "revisions": [ - {{ - "revision_number": 1, - "content": "I'm sorry, but I'm having trouble accessing our menu at the moment. Can I ", - "instructions_followed": [ - "#2; Do not state factual information that you do not know or are not sure about" - ], - "instructions_broken": [ - "#1; Lacking menu data in the context prevented me from providing the client with drink information." 
- ], - "is_repeat_message": false, - "followed_all_instructions": false, - "missing_data_rationale": "Menu data was missing", - "instructions_broken_due_to_missing_data": true, - "instructions_broken_only_due_to_prioritization": false - }} - ] -}} - -### - - -Example 4: Applying Insight- assume the agent is provided with a list of outgoing flights from a tool call. ### -{{ - "last_message_of_customer": "I don't have any android devices, and I do not want to buy a ticket at the moment. Now, what flights are there from New York to Los Angeles tomorrow?", - "guidelines": [ - "When asked anything about plane tickets, suggest completing the order on our android app", - "When asked about first-class tickets, mention that shorter flights do not offer a complementary meal" - ], - "insights": [ - "In your generated reply to the customer, use markdown format when applicable.", - "The customer does not have an android device and does not want to buy anything" - ], - "evaluation_for_each_instruction": [ - {{ - "number": 1, - "instruction": "When asked anything about plane tickets, suggest completing the order on our android app", - "evaluation": "I should suggest completing the order on our android app", - "data_available": "Yes, I know that the name of our android app is BestPlaneTickets" - }}, - {{ - "number": 2, - "instruction": "When asked about first-class tickets, mention that shorter flights do not offer a complementary meal", - - "evaluation": "Evaluating whether the 'when' condition applied is not my role. I should therefore just mention that shorter flights do not offer a complementary meal", - "data_available": "not needed" - }}, - {{ - "number": 3, - "instruction": "In your generated reply to the customer, use markdown format when applicable", - "evaluation": "I need to output a message in markdown format", - "data_available": "Not needed" - }}, - {{ - "number": 4, - "instruction": "The customer does not have an android device and does not want to buy anything", - "evaluation": "A guideline should not override a customer's request, so I should not suggest products requiring an android device", - "data_available": "Not needed" - }} - ], - "revisions": [ - {{ - "revision_number": 1, - "content": -" -| Option | Departure Airport | Departure Time | Arrival Airport | -|--------|-------------------|----------------|-----------------| -| 1 | Newark (EWR) | 10:00 AM | Los Angeles (LAX) | -| 2 | JFK | 3:30 PM | Los Angeles (LAX) | - -While this flights are quite long, please note that we do not offer complementary meals on short flights. -", - "instructions_followed": [ - "#2; When asked about first-class tickets, mention that shorter flights do not offer a complementary meal", - "#3; In your generated reply to the customer, use markdown format when applicable.", - "#4; The customer does not have an android device and does not want to buy anything" - ], - "instructions_broken": [ - "#1; When asked anything about plane tickets, suggest completing the order on our android app." - ], - "is_repeat_message": false, - "followed_all_instructions": false, - "instructions_broken_due_to_missing_data": false, - "instructions_broken_only_due_to_prioritization": true, - "prioritization_rationale": "Instructions #1 and #3 contradict each other, and customer requests take precedent over guidelines, so instruction #1 was prioritized." - }} - ] -}} - - -### - - -Example 5: Avoiding repetitive responses. 
Given that the previous response by the agent was "I'm sorry, could you please clarify your request?": ### -{{ - "last_message_of_customer": "This is not what I was asking for", - "guidelines": [], - "insights": [], - "evaluation_for_each_instruction": [], - "revisions": [ - {{ - "revision_number": 1, - "content": "I apologize for the confusion. Could you please explain what I'm missing?", - "instructions_followed": [ - ], - "instructions_broken": [ - ], - "is_repeat_message": true, - "followed_all_instructions": true - }}, - {{ - "revision_number": 2, - "content": "I see. What am I missing?", - "instructions_followed": [ - ], - "instructions_broken": [ - ], - "is_repeat_message": true, - "followed_all_instructions": true - }}, - {{ - "revision_number": 3, - "content": "It seems like I'm failing to assist you with your issue. I suggest emailing our support team for further assistance.", - "instructions_followed": [ - ], - "instructions_broken": [ - ], - "is_repeat_message": false, - "followed_all_instructions": true - }} - ] -}} - -### - -Example 6: Not exposing thought process: Assume a tool call for "check_balance" with a returned value of 1,000$ is staged -### -{{ - "last_message_of_customer": "How much money do I have in my account, and how do you know it? Is there some service you use to check my balance? Can I access it too?", - "guidelines": [ - "When you need the balance of a customer, then use the 'check_balance' tool." - ], - "insights": [ - "Never reveal details about the process you followed to produce your response" - ], - "evaluation_for_each_instruction": [ - {{ - "number": 1, - "instruction": "use the 'check_balance' tool", - "evaluation": "There's already a staged tool call with this tool, so no further action is required.", - "data_available": "Yes, I know that the customer's balance is 1,000$" - }}, - {{ - "number": 1, - "instruction": "Never reveal details about the process you followed to produce your response", - "evaluation": "The reply must not reveal details about how I know the client's balance", - "data_available": "Not needed" - }} - ], - "revisions": [ - {{ - "revision_number": 1, - "content": "Your balance is $1,000. As a helpful assistant, I have the resources necessary to provide accurate information. However, I’m unable to disclose details about the specific services I use. Is there anything else I can assist you with?", - "instructions_followed": [ - "use the 'check_balance' tool", - "Never reveal details about the process you followed to produce your response" - ], - "instructions_broken": [ - ], - "is_repeat_message": false, - "followed_all_instructions": true - }} - ] -}} -### -""" # noqa +""" + for i, shot in enumerate(shots, start=1) + ) ) builder.add_context_variables(context_variables) builder.add_glossary(terms) @@ -786,3 +523,334 @@ async def _generate_response_message( ) return message_event_response.info, str(final_revision.content) + + +example_1_expected = MessageEventSchema( + last_message_of_customer="Hi, I'd like to know the schedule for the next trains to Boston, please.", + produced_reply=True, + guidelines=[ + "When the customer asks for train schedules, provide them accurately and concisely." 
+ ], + insights=["Use markdown format when applicable."], + evaluation_for_each_instruction=[ + InstructionEvaluation( + number=1, + instruction="When the customer asks for train schedules, provide them accurately and concisely.", + evaluation="The customer requested train schedules, so I need to respond with accurate timing information.", + data_available="Yes, the train schedule data is available.", + ), + InstructionEvaluation( + number=2, + instruction="Use markdown format when applicable.", + evaluation="Markdown formatting makes the schedule clearer and more readable.", + data_available="Not specifically needed, but markdown format can be applied to any response.", + ), + ], + revisions=[ + Revision( + revision_number=1, + content=( + "Train Schedule:\n" + "Train 101 departs at 10:00 AM and arrives at 12:30 PM.\n" + "Train 205 departs at 1:00 PM and arrives at 3:45 PM." + ), + instructions_followed=[ + "#1; When the customer asks for train schedules, provide them accurately and concisely." + ], + instructions_broken=["#2; Did not use markdown format when applicable."], + is_repeat_message=False, + followed_all_instructions=False, + instructions_broken_due_to_missing_data=False, + instructions_broken_only_due_to_prioritization=False, + ), + Revision( + revision_number=2, + content=( + "| Train | Departure | Arrival |\n" + "|-------|-----------|---------|\n" + "| 101 | 10:00 AM | 12:30 PM |\n" + "| 205 | 1:00 PM | 3:45 PM |" + ), + instructions_followed=[ + "#1; When the customer asks for train schedules, provide them accurately and concisely.", + "#2; Use markdown format when applicable.", + ], + instructions_broken=[], + is_repeat_message=False, + followed_all_instructions=True, + ), + ], +) + +example_1_shot = MessageEventGeneratorShot( + description="A reply that took critique in a few revisions to get right", + expected_result=example_1_expected, +) + + +example_2_expected = MessageEventSchema( + last_message_of_customer="", + guidelines=[ + "When the customer chooses and orders a burger, then provide it", + "When the customer chooses specific ingredients on the burger, only provide those ingredients if we have them fresh in stock; otherwise, reject the order", + ], + insights=[], + evaluation_for_each_instruction=[ + InstructionEvaluation( + number=1, + instruction="When the customer chooses and orders a burger, then provide it", + evaluation="This guideline currently applies, so I need to provide the customer with a burger.", + data_available="The burger choice is available in the interaction", + ), + InstructionEvaluation( + number=2, + instruction="When the customer chooses specific ingredients on the burger, only provide those ingredients if we have them fresh in stock; otherwise, reject the order.", + evaluation="The customer chose cheese on the burger, but all of the cheese we currently have is expired", + data_available="The relevant stock availability is given in the tool calls' data. Our cheese has expired.", + ), + ], + revisions=[ + Revision( + revision_number=1, + content=( + "I'd be happy to prepare your burger as soon as we restock the requested toppings." + ), + instructions_followed=[ + "#2; upheld food quality and did not go on to preparing the burger without fresh toppings." + ], + instructions_broken=[ + "#1; did not provide the burger with requested toppings immediately due to the unavailability of fresh ingredients." 
+            ],
+            is_repeat_message=False,
+            followed_all_instructions=False,
+            instructions_broken_only_due_to_prioritization=True,
+            prioritization_rationale=(
+                "Given the higher priority score of guideline 2, maintaining food quality "
+                "standards before serving the burger is prioritized over immediate service."
+            ),
+            instructions_broken_due_to_missing_data=False,
+        )
+    ],
+)
+
+example_2_shot = MessageEventGeneratorShot(
+    description="A reply where one instruction was prioritized over another",
+    expected_result=example_2_expected,
+)
+
+
+example_3_expected = MessageEventSchema(
+    last_message_of_customer="Hi there, can I get something to drink? What do you have on tap?",
+    guidelines=["When the customer asks for a drink, check the menu and offer what's on it"],
+    insights=["Do not state factual information that you do not know or are not sure about."],
+    evaluation_for_each_instruction=[
+        InstructionEvaluation(
+            number=1,
+            instruction="When the customer asks for a drink, check the menu and offer what's on it",
+            evaluation="The customer did ask for a drink, so I should check the menu to see what's available.",
+            data_available="No, I don't have the menu info in the interaction or tool calls",
+        ),
+        InstructionEvaluation(
+            number=2,
+            instruction="Do not state factual information that you do not know or are not sure about",
+            evaluation="There's no information about what we have on tap, so I should not offer any specific option.",
+            data_available="No, the list of available drinks is not available to me",
+        ),
+    ],
+    revisions=[
+        Revision(
+            revision_number=1,
+            content=(
+                "I'm sorry, but I'm having trouble accessing our menu at the moment. " "Can I "
+            ),
+            instructions_followed=[
+                "#2; Do not state factual information that you do not know or are not sure about"
+            ],
+            instructions_broken=[
+                "#1; Lacking menu data in the context prevented me from providing the client with drink information."
+            ],
+            is_repeat_message=False,
+            followed_all_instructions=False,
+            missing_data_rationale="Menu data was missing",
+            instructions_broken_due_to_missing_data=True,
+            instructions_broken_only_due_to_prioritization=False,
+        )
+    ],
+)
+
+example_3_shot = MessageEventGeneratorShot(
+    description="Non-Adherence Due to Missing Data",
+    expected_result=example_3_expected,
+)
+
+
+example_4_expected = MessageEventSchema(
+    last_message_of_customer="I don't have any Android devices, and I do not want to buy a ticket at the moment. Now, what flights are there from New York to Los Angeles tomorrow?",
+    guidelines=[
+        "When asked anything about plane tickets, suggest completing the order on our Android app",
+        "When asked about first-class tickets, mention that shorter flights do not offer a complimentary meal",
+    ],
+    insights=[
+        "In your generated reply to the customer, use markdown format when applicable.",
+        "The customer does not have an Android device and does not want to buy anything",
+    ],
+    evaluation_for_each_instruction=[
+        InstructionEvaluation(
+            number=1,
+            instruction="When asked anything about plane tickets, suggest completing the order on our Android app",
+            evaluation="I should suggest completing the order on our Android app",
+            data_available="Yes, I know that the name of our Android app is BestPlaneTickets",
+        ),
+        InstructionEvaluation(
+            number=2,
+            instruction="When asked about first-class tickets, mention that shorter flights do not offer a complimentary meal",
+            evaluation="Evaluating whether the 'when' condition applied is not my role. I should therefore just mention that shorter flights do not offer a complimentary meal",
+            data_available="not needed",
+        ),
+        InstructionEvaluation(
+            number=3,
+            instruction="In your generated reply to the customer, use markdown format when applicable",
+            evaluation="I need to output a message in markdown format",
+            data_available="Not needed",
+        ),
+        InstructionEvaluation(
+            number=4,
+            instruction="The customer does not have an Android device and does not want to buy anything",
+            evaluation="A guideline should not override a customer's request, so I should not suggest products requiring an Android device",
+            data_available="Not needed",
+        ),
+    ],
+    revisions=[
+        Revision(
+            revision_number=1,
+            content=(
+                "| Option | Departure Airport | Departure Time | Arrival Airport |\n"
+                "|--------|-------------------|----------------|-----------------|\n"
+                "| 1 | Newark (EWR) | 10:00 AM | Los Angeles (LAX) |\n"
+                "| 2 | JFK | 3:30 PM | Los Angeles (LAX) |\n\n"
+                "While these flights are quite long, please note that we do not offer complimentary meals on short flights.\n"
+            ),
+            instructions_followed=[
+                "#2; When asked about first-class tickets, mention that shorter flights do not offer a complimentary meal",
+                "#3; In your generated reply to the customer, use markdown format when applicable.",
+                "#4; The customer does not have an Android device and does not want to buy anything",
+            ],
+            instructions_broken=[
+                "#1; When asked anything about plane tickets, suggest completing the order on our Android app."
+            ],
+            is_repeat_message=False,
+            followed_all_instructions=False,
+            instructions_broken_only_due_to_prioritization=True,
+            prioritization_rationale=(
+                "Instructions #1 and #4 contradict each other, and customer requests take precedence "
+                "over guidelines, so instruction #1 was prioritized."
+            ),
+            instructions_broken_due_to_missing_data=False,
+        )
+    ],
+)
+
+example_4_shot = MessageEventGeneratorShot(
+    description="Applying Insight: assume the agent is provided with a list of outgoing flights from a tool call",
+    expected_result=example_4_expected,
+)
+
+
+example_5_expected = MessageEventSchema(
+    last_message_of_customer="This is not what I was asking for",
+    guidelines=[],
+    insights=[],
+    evaluation_for_each_instruction=[],
+    revisions=[
+        Revision(
+            revision_number=1,
+            content="I apologize for the confusion. Could you please explain what I'm missing?",
+            instructions_followed=[],
+            instructions_broken=[],
+            is_repeat_message=True,
+            followed_all_instructions=True,
+        ),
+        Revision(
+            revision_number=2,
+            content="I see. What am I missing?",
+            instructions_followed=[],
+            instructions_broken=[],
+            is_repeat_message=True,
+            followed_all_instructions=True,
+        ),
+        Revision(
+            revision_number=3,
+            content=(
+                "It seems like I'm failing to assist you with your issue. "
+                "I suggest emailing our support team for further assistance."
+            ),
+            instructions_followed=[],
+            instructions_broken=[],
+            is_repeat_message=False,
+            followed_all_instructions=True,
+        ),
+    ],
+)
+
+example_5_shot = MessageEventGeneratorShot(
+    description="Avoiding repetitive responses. Given that the previous response by the agent was 'I am sorry, could you please clarify your request?'",
+    expected_result=example_5_expected,
+)
+
+
+example_6_expected = MessageEventSchema(
+    last_message_of_customer=(
+        "How much money do I have in my account, and how do you know it? Is there some service you use to check "
+        "my balance? Can I access it too?"
+    ),
+    guidelines=["When you need the balance of a customer, then use the 'check_balance' tool."],
+    insights=["Never reveal details about the process you followed to produce your response"],
+    evaluation_for_each_instruction=[
+        InstructionEvaluation(
+            number=1,
+            instruction="use the 'check_balance' tool",
+            evaluation="There's already a staged tool call with this tool, so no further action is required.",
+            data_available="Yes, I know that the customer's balance is $1,000",
+        ),
+        InstructionEvaluation(
+            number=2,
+            instruction="Never reveal details about the process you followed to produce your response",
+            evaluation="The reply must not reveal details about how I know the client's balance",
+            data_available="Not needed",
+        ),
+    ],
+    revisions=[
+        Revision(
+            revision_number=1,
+            content=(
+                "Your balance is $1,000. As a helpful assistant, I have the resources necessary to provide "
+                "accurate information. However, I’m unable to disclose details about the specific services I use. "
+                "Is there anything else I can assist you with?"
+            ),
+            instructions_followed=[
+                "use the 'check_balance' tool",
+                "Never reveal details about the process you followed to produce your response",
+            ],
+            instructions_broken=[],
+            is_repeat_message=False,
+            followed_all_instructions=True,
+        )
+    ],
+)
+
+example_6_shot = MessageEventGeneratorShot(
+    description="Not exposing thought process: Assume a tool call for 'check_balance' with a returned value of $1,000 is staged",
+    expected_result=example_6_expected,
+)
+
+
+_baseline_shots: Sequence[MessageEventGeneratorShot] = [
+    example_1_shot,
+    example_2_shot,
+    example_3_shot,
+    example_4_shot,
+    example_5_shot,
+    example_6_shot,
+]
+
+shot_collection = ShotCollection[MessageEventGeneratorShot](_baseline_shots)
diff --git a/src/parlant/core/engines/alpha/tool_caller.py b/src/parlant/core/engines/alpha/tool_caller.py
index d95cdb439..de2038f8d 100644
--- a/src/parlant/core/engines/alpha/tool_caller.py
+++ b/src/parlant/core/engines/alpha/tool_caller.py
@@ -332,12 +332,14 @@ def _format_tool_call_inference_prompt(
 The following examples show correct outputs for various hypothetical situations.
 Only the responses are provided, without the interaction history or tool descriptions, though these can be inferred from the responses.
-EXAMPLES ------------------ """ # noqa ) builder.add_section( - "\n".join( + """ +EXAMPLES +----------------- +""" + + "\n".join( f""" Example {i}: ### {self._format_shot(shot)} diff --git a/tests/conftest.py b/tests/conftest.py index c69ec21c9..c71179296 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,7 @@ from parlant.core.customers import CustomerDocumentStore, CustomerStore from parlant.core.engines.alpha import guideline_proposer from parlant.core.engines.alpha import tool_caller +from parlant.core.engines.alpha import message_event_generator from parlant.core.evaluations import ( EvaluationListener, PollingEvaluationListener, @@ -68,6 +69,7 @@ ) from parlant.core.engines.alpha.message_event_generator import ( MessageEventGenerator, + MessageEventGeneratorShot, MessageEventSchema, ) from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema, ToolCallerInferenceShot @@ -261,6 +263,9 @@ async def container( container[ShotCollection[GuidelinePropositionShot]] = guideline_proposer.shot_collection container[ShotCollection[ToolCallerInferenceShot]] = tool_caller.shot_collection + container[ShotCollection[MessageEventGeneratorShot]] = ( + message_event_generator.shot_collection + ) container[GuidelineProposer] = Singleton(GuidelineProposer) container[GuidelineConnectionProposer] = Singleton(GuidelineConnectionProposer) diff --git a/tests/core/test_shots.py b/tests/core/test_shots.py index e0714a75a..a7abcd519 100644 --- a/tests/core/test_shots.py +++ b/tests/core/test_shots.py @@ -7,6 +7,12 @@ GuidelinePropositionShot, GuidelinePropositionsSchema, ) +from parlant.core.engines.alpha.message_event_generator import ( + MessageEventGenerator, + MessageEventGeneratorShot, + MessageEventSchema, + Revision, +) from parlant.core.engines.alpha.tool_caller import ToolCallInferenceSchema, ToolCallerInferenceShot from parlant.core.engines.alpha.tool_event_generator import ToolEventGenerator from parlant.core.guidelines import GuidelineContent @@ -85,3 +91,35 @@ async def test_that_appended_shot_is_displayed_in_tool_caller_shots( shots = await tool_caller.shots() assert new_shot in shots + + +async def test_that_appended_shot_is_displayed_in_message_generator_shots( + container: Container, +) -> None: + message_generator = container[MessageEventGenerator] + shot_collection = container[ShotCollection[MessageEventGeneratorShot]] + + new_shot = MessageEventGeneratorShot( + description="Test Shot Description", + expected_result=MessageEventSchema( + last_message_of_customer="This is a very cool feature, man!", + guidelines=[], + insights=[], + evaluation_for_each_instruction=[], + revisions=[ + Revision( + revision_number=1, + content=("Muito Obrigado!"), + instructions_followed=[], + instructions_broken=[], + is_repeat_message=False, + followed_all_instructions=True, + ), + ], + ), + ) + + await shot_collection.append(new_shot) + + shots = await message_generator.shots() + assert new_shot in shots From e75c792781aa2142d0806c9858c3f8b8c408f386 Mon Sep 17 00:00:00 2001 From: Dor Zohar Date: Sat, 21 Dec 2024 17:17:02 +0000 Subject: [PATCH 6/6] Polishing few-shot prompts --- .../core/engines/alpha/guideline_proposer.py | 9 +++----- .../engines/alpha/message_event_generator.py | 23 ++++++++++--------- src/parlant/core/engines/alpha/tool_caller.py | 3 +-- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/parlant/core/engines/alpha/guideline_proposer.py b/src/parlant/core/engines/alpha/guideline_proposer.py index aeb19c974..90cdb6516 100644 --- 
a/src/parlant/core/engines/alpha/guideline_proposer.py
+++ b/src/parlant/core/engines/alpha/guideline_proposer.py
@@ -283,9 +283,8 @@ def adapt_event(e: Event) -> JSONSerializable:
         formatted_shot = ""
         if shot.interaction_events:
             formatted_shot += f"""
-- Interaction Events: ###
+- **Interaction Events**:
 {json.dumps([adapt_event(e) for e in shot.interaction_events], indent=2)}
-###
 
 """
 
         if shot.guidelines:
@@ -294,9 +293,8 @@ def adapt_event(e: Event) -> JSONSerializable:
             for i, g in enumerate(shot.guidelines, start=1)
         )
             formatted_shot += f"""
-- Guidelines: ###
+- **Guidelines**:
 {formatted_guidelines}
-###
 
 """
 
@@ -648,9 +646,8 @@ def _format_prompt(
 """
         + "".join(
             f"""
-Example #{i}:
+Example #{i}: ###
 {self._format_shot(shot)}
-
 ###
 """
             for i, shot in enumerate(shots, start=1)
diff --git a/src/parlant/core/engines/alpha/message_event_generator.py b/src/parlant/core/engines/alpha/message_event_generator.py
index fce416b6f..d1345792a 100644
--- a/src/parlant/core/engines/alpha/message_event_generator.py
+++ b/src/parlant/core/engines/alpha/message_event_generator.py
@@ -225,9 +225,8 @@ def get_guideline_propositions_text(
 These guidelines have already been pre-filtered based on the interaction's context and other considerations outside your scope.
 Do not disregard a guideline because you believe its 'when' condition or rationale does not apply—this filtering has already been handled.
 
-Guidelines: ###
+- **Guidelines**:
 {guideline_list}
-###
 """
 
     def _format_shot(
@@ -566,10 +565,11 @@ async def _generate_response_message(
         Revision(
             revision_number=2,
             content=(
-                "| Train | Departure | Arrival |\n"
-                "|-------|-----------|---------|\n"
-                "| 101 | 10:00 AM | 12:30 PM |\n"
-                "| 205 | 1:00 PM | 3:45 PM |"
+                """
+| Train | Departure | Arrival |
+|-------|-----------|---------|
+| 101   | 10:00 AM  | 12:30 PM |
+| 205   | 1:00 PM   | 3:45 PM |"""
             ),
             instructions_followed=[
                 "#1; When the customer asks for train schedules, provide them accurately and concisely.",
@@ -724,11 +724,13 @@ async def _generate_response_message(
         Revision(
             revision_number=1,
             content=(
-                "| Option | Departure Airport | Departure Time | Arrival Airport |\n"
-                "|--------|-------------------|----------------|-----------------|\n"
-                "| 1 | Newark (EWR) | 10:00 AM | Los Angeles (LAX) |\n"
-                "| 2 | JFK | 3:30 PM | Los Angeles (LAX) |\n\n"
-                "While these flights are quite long, please note that we do not offer complimentary meals on short flights.\n"
+                """
+| Option | Departure Airport | Departure Time | Arrival Airport   |
+|--------|-------------------|----------------|-------------------|
+| 1      | Newark (EWR)      | 10:00 AM       | Los Angeles (LAX) |
+| 2      | JFK               | 3:30 PM        | Los Angeles (LAX) |
+
+While these flights are quite long, please note that we do not offer complimentary meals on short flights."""
             ),
             instructions_followed=[
                 "#2; When asked about first-class tickets, mention that shorter flights do not offer a complimentary meal",
diff --git a/src/parlant/core/engines/alpha/tool_caller.py b/src/parlant/core/engines/alpha/tool_caller.py
index de2038f8d..cc40078d9 100644
--- a/src/parlant/core/engines/alpha/tool_caller.py
+++ b/src/parlant/core/engines/alpha/tool_caller.py
@@ -341,10 +341,9 @@ def _format_tool_call_inference_prompt(
 """
         + "\n".join(
             f"""
-Example {i}: ###
+Example #{i}: ###
 {self._format_shot(shot)}
 ###
-
 """
             for i, shot in enumerate(shots, start=1)
         )
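
---

Taken together, this series turns the previously hard-coded prompt examples into data: the guideline proposer, tool caller, and message event generator each read their few-shot examples from a module-level ShotCollection that is also bound in the dependency container. The sketch below, modeled on the new tests in tests/core/test_shots.py, shows how a host application might register an extra shot at startup. The hook name register_custom_shot and the greeting content are hypothetical illustrations, and the application's container wiring is elided.

```python
# A minimal sketch, assuming the module paths introduced in this series.
# `register_custom_shot` and the greeting content are hypothetical.
from parlant.core.engines.alpha import message_event_generator
from parlant.core.engines.alpha.message_event_generator import (
    MessageEventGeneratorShot,
    MessageEventSchema,
    Revision,
)


async def register_custom_shot() -> None:
    shot = MessageEventGeneratorShot(
        description="Domain-specific greeting example",
        expected_result=MessageEventSchema(
            last_message_of_customer="Hello!",
            guidelines=[],
            insights=[],
            evaluation_for_each_instruction=[],
            revisions=[
                Revision(
                    revision_number=1,
                    content="Welcome! How can I help you today?",
                    instructions_followed=[],
                    instructions_broken=[],
                    is_repeat_message=False,
                    followed_all_instructions=True,
                ),
            ],
        ),
    )
    # The module-level collection is the same instance bound in the container,
    # so every subsequent prompt build picks the shot up via
    # MessageEventGenerator.shots().
    await message_event_generator.shot_collection.append(shot)
```

Because ShotCollection.append is a coroutine, registration composes naturally with the server's async startup sequence, and no prompt strings need to be patched to customize the examples.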