From 88fe9ec11e5a490d5ebe381ef585a4268887830d Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Thu, 31 Oct 2024 05:46:04 +0200 Subject: [PATCH 1/8] feat(llm): add ChatContext incomplete implementation, notably missing tool call hooks (see comment for more info). --- agents/src/llm/chat_context.ts | 105 +++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 agents/src/llm/chat_context.ts diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts new file mode 100644 index 00000000..9b58ced1 --- /dev/null +++ b/agents/src/llm/chat_context.ts @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { AudioFrame } from '@livekit/rtc-node'; + +export enum ChatRole { + SYSTEM, + USER, + ASSISTANT, + TOOL, +} + +export interface ChatImage { + image: string | AudioFrame; + inferenceWidth?: number; + inferenceHeight?: number; + /** Used by LLM implementations to store a processed version of the image for later use. */ + cache: { [id: string | number | symbol]: any }; +} + +export interface ChatAudio { + frame: AudioFrame | AudioFrame[]; +} + +export type ChatContent = string | ChatImage | ChatAudio; + +export class ChatMessage { + readonly role: ChatRole; + readonly id?: string; + readonly name?: string; + readonly content?: ChatContent | ChatContent[]; + readonly toolCallId?: string; + readonly toolException?: Error; + + /** @internal */ + constructor({ + role, + id, + name, + content, + toolCallId, + toolException, + }: { + role: ChatRole; + id?: string; + name?: string; + content?: ChatContent | ChatContent[]; + toolCallId?: string; + toolException?: Error; + }) { + this.role = role; + this.id = id; + this.name = name; + this.content = content; + this.toolCallId = toolCallId; + this.toolException = toolException; + } + + // TODO(nbsp): tool call functions. + // the system defined in function_context.ts is fundamentally different (and much, much simpler) + // than the one in Python Agents. + // pair with theo to figure out what to do here (and later in MultimodalAgent/RealtimeModel) + + static create({ + text = '', + images = [], + role = ChatRole.SYSTEM, + }: { + text?: string; + images: ChatImage[]; + role: ChatRole; + }): ChatMessage { + if (!images.length) { + return new ChatMessage({ + role: ChatRole.ASSISTANT, + content: text, + }); + } else { + return new ChatMessage({ + role, + content: [...(text ? [text] : []), ...images], + }); + } + } + + /** Returns a structured clone of this message. */ + copy(): ChatMessage { + return structuredClone(this); + } +} + +export class ChatContext { + messages: ChatMessage[] = []; + metadata: { [id: string]: any } = {}; + + append(msg: { text?: string; images: ChatImage[]; role: ChatRole }): ChatContext { + this.messages.push(ChatMessage.create(msg)); + return this; + } + + /** Returns a structured clone of this context. 
*/ + copy(): ChatContext { + return structuredClone(this); + } +} From 0d07e8579d9bc29b249791c78d4cc5c8d8523d61 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Fri, 1 Nov 2024 07:03:20 +0200 Subject: [PATCH 2/8] =?UTF-8?q?na=C3=AFve=20function=20calling=20in=20Chat?= =?UTF-8?q?Message=20impl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- agents/src/llm/chat_context.ts | 30 ++++++++++++++++++++++++++---- agents/src/llm/function_context.ts | 8 ++++++++ agents/src/llm/index.ts | 12 ++++++++++-- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 9b58ced1..6d3ae35b 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import type { AudioFrame } from '@livekit/rtc-node'; +import type { CallableFunctionResult, FunctionContext } from './function_context.js'; export enum ChatRole { SYSTEM, @@ -29,6 +30,7 @@ export class ChatMessage { readonly id?: string; readonly name?: string; readonly content?: ChatContent | ChatContent[]; + readonly toolCalls?: FunctionContext; readonly toolCallId?: string; readonly toolException?: Error; @@ -38,6 +40,7 @@ export class ChatMessage { id, name, content, + toolCalls, toolCallId, toolException, }: { @@ -45,6 +48,7 @@ export class ChatMessage { id?: string; name?: string; content?: ChatContent | ChatContent[]; + toolCalls?: FunctionContext; toolCallId?: string; toolException?: Error; }) { @@ -52,14 +56,32 @@ export class ChatMessage { this.id = id; this.name = name; this.content = content; + this.toolCalls = toolCalls; this.toolCallId = toolCallId; this.toolException = toolException; } - // TODO(nbsp): tool call functions. - // the system defined in function_context.ts is fundamentally different (and much, much simpler) - // than the one in Python Agents. - // pair with theo to figure out what to do here (and later in MultimodalAgent/RealtimeModel) + static createToolFromFunctionResult(func: CallableFunctionResult): ChatMessage { + if (!func.result && !func.error) { + throw new TypeError('CallableFunctionResult must include result or error'); + } + + return new ChatMessage({ + role: ChatRole.TOOL, + name: func.name, + content: func.result || `Error: ${func.error}`, + toolCallId: func.toolCallId, + toolException: func.error, + }); + } + + static createToolCalls(toolCalls: FunctionContext, text = '') { + return new ChatMessage({ + role: ChatRole.ASSISTANT, + toolCalls, + content: text, + }); + } static create({ text = '', diff --git a/agents/src/llm/function_context.ts b/agents/src/llm/function_context.ts index e6673abb..af193b78 100644 --- a/agents/src/llm/function_context.ts +++ b/agents/src/llm/function_context.ts @@ -18,6 +18,14 @@ export interface CallableFunction
<P extends z.ZodTypeAny = any, R = any> {
   execute: (args: inferParameters<P>
) => PromiseLike; } +/** A currently-running function call, called by the LLM. */ +export interface CallableFunctionResult { + name: string; + toolCallId: string; + result?: any; + error?: any; +} + /** An object containing callable functions and their names */ export type FunctionContext = { [name: string]: CallableFunction; diff --git a/agents/src/llm/index.ts b/agents/src/llm/index.ts index 80336ecb..a2672fde 100644 --- a/agents/src/llm/index.ts +++ b/agents/src/llm/index.ts @@ -1,11 +1,19 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { +export { type CallableFunction, + type CallableFunctionResult, type FunctionContext, type inferParameters, oaiParams, } from './function_context.js'; -export { CallableFunction, FunctionContext, inferParameters, oaiParams }; +export { + type ChatImage, + type ChatAudio, + type ChatContent, + ChatRole, + ChatMessage, + ChatContext, +} from './chat_context.js'; From d72c5bafc6af7d7c3ef829ac3a227e2ca22c4310 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Fri, 1 Nov 2024 07:03:54 +0200 Subject: [PATCH 3/8] update mma and oai.realtime to use ChatContext --- agents/src/multimodal/multimodal_agent.ts | 8 +- plugins/openai/src/realtime/realtime_model.ts | 167 ++++++++++++++++-- 2 files changed, 159 insertions(+), 16 deletions(-) diff --git a/agents/src/multimodal/multimodal_agent.ts b/agents/src/multimodal/multimodal_agent.ts index 045ef868..d7cf1395 100644 --- a/agents/src/multimodal/multimodal_agent.ts +++ b/agents/src/multimodal/multimodal_agent.ts @@ -64,13 +64,16 @@ export class MultimodalAgent extends EventEmitter { constructor({ model, + chatCtx, fncCtx, }: { model: RealtimeModel; - fncCtx?: llm.FunctionContext | undefined; + chatCtx?: llm.ChatContext; + fncCtx?: llm.FunctionContext; }) { super(); this.model = model; + this.#chatCtx = chatCtx; this.#fncCtx = fncCtx; } @@ -83,6 +86,7 @@ export class MultimodalAgent extends EventEmitter { #logger = log(); #session: RealtimeSession | null = null; #fncCtx: llm.FunctionContext | undefined = undefined; + #chatCtx: llm.ChatContext | undefined = undefined; #_started: boolean = false; #_pendingFunctionCalls: Set = new Set(); @@ -200,7 +204,7 @@ export class MultimodalAgent extends EventEmitter { } } - this.#session = this.model.session({ fncCtx: this.#fncCtx }); + this.#session = this.model.session({ fncCtx: this.#fncCtx, chatCtx: this.#chatCtx }); this.#started = true; // eslint-disable-next-line @typescript-eslint/no-explicit-any diff --git a/plugins/openai/src/realtime/realtime_model.ts b/plugins/openai/src/realtime/realtime_model.ts index 23b16e5a..a8d2ef93 100644 --- a/plugins/openai/src/realtime/realtime_model.ts +++ b/plugins/openai/src/realtime/realtime_model.ts @@ -1,7 +1,16 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
// // SPDX-License-Identifier: Apache-2.0 -import { AsyncIterableQueue, Future, Queue, llm, log, multimodal } from '@livekit/agents'; +import { + AsyncIterableQueue, + Future, + Queue, + llm, + log, + mergeFrames, + multimodal, +} from '@livekit/agents'; +import { ChatRole } from '@livekit/agents/src/llm/chat_context.js'; import { AudioFrame } from '@livekit/rtc-node'; import { once } from 'node:events'; import { WebSocket } from 'ws'; @@ -108,6 +117,7 @@ class InputAudioBuffer { class ConversationItem { #session: RealtimeSession; + #logger = log(); constructor(session: RealtimeSession) { this.#session = session; @@ -129,12 +139,127 @@ class ConversationItem { }); } - create(item: api_proto.ConversationItemCreateContent, previousItemId?: string): void { - this.#session.queueMsg({ - type: 'conversation.item.create', - item, - previous_item_id: previousItemId, - }); + // create(item: api_proto.ConversationItemCreateContent, previousItemId?: string): void { + create(message: llm.ChatMessage, previousItemId?: string): void { + if (!message.content) { + return; + } + + let event: api_proto.ConversationItemCreateEvent; + + if (message.toolCallId) { + if (typeof message.content !== 'string') { + throw new TypeError('message.content must be a string'); + } + + event = { + type: 'conversation.item.create', + previous_item_id: previousItemId, + item: { + type: 'function_call_output', + call_id: message.toolCallId, + output: message.content, + }, + }; + } else { + let content = message.content; + if (!Array.isArray(content)) { + content = [content]; + } + + if (message.role === ChatRole.USER) { + const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = []; + for (const c of content) { + if (typeof c === 'string') { + contents.push({ + type: 'input_text', + text: c, + }); + } else if ( + // typescript type guard for determining ChatAudio vs ChatImage + ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => { + return (c as llm.ChatAudio).frame !== undefined; + })(c) + ) { + contents.push({ + type: 'input_audio', + audio: Buffer.from(mergeFrames(c.frame).data.buffer).toString('base64'), + }); + } + } + + event = { + type: 'conversation.item.create', + previous_item_id: previousItemId, + item: { + type: 'message', + role: 'user', + content: contents, + }, + }; + } else if (message.role === ChatRole.ASSISTANT) { + const contents: api_proto.TextContent[] = []; + for (const c of content) { + if (typeof c === 'string') { + contents.push({ + type: 'text', + text: c, + }); + } else if ( + // typescript type guard for determining ChatAudio vs ChatImage + ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => { + return (c as llm.ChatAudio).frame !== undefined; + })(c) + ) { + this.#logger.warn('audio content in assistant message is not supported'); + } + } + + event = { + type: 'conversation.item.create', + previous_item_id: previousItemId, + item: { + type: 'message', + role: 'assistant', + content: contents, + }, + }; + } else if (message.role === ChatRole.SYSTEM) { + const contents: api_proto.InputTextContent[] = []; + for (const c of content) { + if (typeof c === 'string') { + contents.push({ + type: 'input_text', + text: c, + }); + } else if ( + // typescript type guard for determining ChatAudio vs ChatImage + ((c: llm.ChatAudio | llm.ChatImage): c is llm.ChatAudio => { + return (c as llm.ChatAudio).frame !== undefined; + })(c) + ) { + this.#logger.warn('audio content in system message is not supported'); + } + } + + event = { + type: 'conversation.item.create', + 
previous_item_id: previousItemId, + item: { + type: 'message', + role: 'system', + content: contents, + }, + }; + } else { + this.#logger + .child({ message }) + .warn('chat message is not supported inside the realtime API'); + return; + } + } + + this.#session.queueMsg(event); } } @@ -302,6 +427,7 @@ export class RealtimeModel extends multimodal.RealtimeModel { session({ fncCtx, + chatCtx, modalities = this.#defaultOpts.modalities, instructions = this.#defaultOpts.instructions, voice = this.#defaultOpts.voice, @@ -313,6 +439,7 @@ export class RealtimeModel extends multimodal.RealtimeModel { maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens, }: { fncCtx?: llm.FunctionContext; + chatCtx?: llm.ChatContext; modalities?: ['text', 'audio'] | ['text']; instructions?: string; voice?: api_proto.Voice; @@ -341,7 +468,10 @@ export class RealtimeModel extends multimodal.RealtimeModel { entraToken: this.#defaultOpts.entraToken, }; - const newSession = new RealtimeSession(opts, fncCtx); + const newSession = new RealtimeSession(opts, { + chatCtx: chatCtx || new llm.ChatContext(), + fncCtx, + }); this.#sessions.push(newSession); return newSession; } @@ -352,6 +482,7 @@ export class RealtimeModel extends multimodal.RealtimeModel { } export class RealtimeSession extends multimodal.RealtimeSession { + #chatCtx: llm.ChatContext | undefined = undefined; #fncCtx: llm.FunctionContext | undefined = undefined; #opts: ModelOptions; #pendingResponses: { [id: string]: RealtimeResponse } = {}; @@ -363,10 +494,14 @@ export class RealtimeSession extends multimodal.RealtimeSession { #closing = true; #sendQueue = new Queue(); - constructor(opts: ModelOptions, fncCtx?: llm.FunctionContext | undefined) { + constructor( + opts: ModelOptions, + { fncCtx, chatCtx }: { fncCtx?: llm.FunctionContext; chatCtx?: llm.ChatContext }, + ) { super(); this.#opts = opts; + this.#chatCtx = chatCtx; this.#fncCtx = fncCtx; this.#task = this.#start(); @@ -385,6 +520,10 @@ export class RealtimeSession extends multimodal.RealtimeSession { }); } + get chatCtx(): llm.ChatContext | undefined { + return this.#chatCtx; + } + get fncCtx(): llm.FunctionContext | undefined { return this.#fncCtx; } @@ -869,11 +1008,11 @@ export class RealtimeSession extends multimodal.RealtimeSession { callId: item.call_id, }); this.conversation.item.create( - { - type: 'function_call_output', - call_id: item.call_id, - output: content, - }, + llm.ChatMessage.createToolFromFunctionResult({ + name: item.name, + toolCallId: item.call_id, + result: content, + }), output.itemId, ); this.response.create(); From f6eac6052f7d11641f1c67728bb2f8927a9ae6be Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Fri, 1 Nov 2024 07:14:57 +0200 Subject: [PATCH 4/8] fix example --- agents/src/llm/chat_context.ts | 24 +++++++++++++++--------- examples/src/minimal_assistant.ts | 13 +++++++------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 6d3ae35b..de208a44 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -25,6 +25,12 @@ export interface ChatAudio { export type ChatContent = string | ChatImage | ChatAudio; +const defaultCreateChatMessage = { + text: '', + images: [], + role: ChatRole.SYSTEM, +}; + export class ChatMessage { readonly role: ChatRole; readonly id?: string; @@ -83,15 +89,15 @@ export class ChatMessage { }); } - static create({ - text = '', - images = [], - role = ChatRole.SYSTEM, - }: { - text?: string; - images: ChatImage[]; - role: 
ChatRole; - }): ChatMessage { + static create( + options: Partial<{ + text?: string; + images: ChatImage[]; + role: ChatRole; + }>, + ): ChatMessage { + const { text, images, role } = { ...defaultCreateChatMessage, ...options }; + if (!images.length) { return new ChatMessage({ role: ChatRole.ASSISTANT, diff --git a/examples/src/minimal_assistant.ts b/examples/src/minimal_assistant.ts index 765f118a..fcfca73d 100644 --- a/examples/src/minimal_assistant.ts +++ b/examples/src/minimal_assistant.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { type JobContext, WorkerOptions, cli, defineAgent, multimodal } from '@livekit/agents'; +import { type JobContext, WorkerOptions, cli, defineAgent, llm, multimodal } from '@livekit/agents'; import * as openai from '@livekit/agents-plugin-openai'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -52,11 +52,12 @@ export default defineAgent({ .start(ctx.room, participant) .then((session) => session as openai.realtime.RealtimeSession); - session.conversation.item.create({ - type: 'message', - role: 'user', - content: [{ type: 'input_text', text: 'Say "How can I help you today?"' }], - }); + session.conversation.item.create( + llm.ChatMessage.create({ + role: llm.ChatRole.USER, + text: 'Say "How can I help you today?"', + }), + ); session.response.create(); }, }); From f66438990de05943cee431f182f3ed4993adde81 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Fri, 1 Nov 2024 07:20:31 +0200 Subject: [PATCH 5/8] whoops --- plugins/openai/src/realtime/realtime_model.ts | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/plugins/openai/src/realtime/realtime_model.ts b/plugins/openai/src/realtime/realtime_model.ts index a8d2ef93..af33a185 100644 --- a/plugins/openai/src/realtime/realtime_model.ts +++ b/plugins/openai/src/realtime/realtime_model.ts @@ -10,7 +10,6 @@ import { mergeFrames, multimodal, } from '@livekit/agents'; -import { ChatRole } from '@livekit/agents/src/llm/chat_context.js'; import { AudioFrame } from '@livekit/rtc-node'; import { once } from 'node:events'; import { WebSocket } from 'ws'; @@ -167,7 +166,7 @@ class ConversationItem { content = [content]; } - if (message.role === ChatRole.USER) { + if (message.role === llm.ChatRole.USER) { const contents: (api_proto.InputTextContent | api_proto.InputAudioContent)[] = []; for (const c of content) { if (typeof c === 'string') { @@ -197,7 +196,7 @@ class ConversationItem { content: contents, }, }; - } else if (message.role === ChatRole.ASSISTANT) { + } else if (message.role === llm.ChatRole.ASSISTANT) { const contents: api_proto.TextContent[] = []; for (const c of content) { if (typeof c === 'string') { @@ -224,7 +223,7 @@ class ConversationItem { content: contents, }, }; - } else if (message.role === ChatRole.SYSTEM) { + } else if (message.role === llm.ChatRole.SYSTEM) { const contents: api_proto.InputTextContent[] = []; for (const c of content) { if (typeof c === 'string') { From 343e5de91c22ae7b4c2fad160f872376ca980c7a Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Sat, 2 Nov 2024 04:34:27 +0200 Subject: [PATCH 6/8] Create purple-beds-clean.md --- .changeset/purple-beds-clean.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .changeset/purple-beds-clean.md diff --git a/.changeset/purple-beds-clean.md b/.changeset/purple-beds-clean.md new file mode 100644 index 00000000..e746bf37 --- /dev/null +++ b/.changeset/purple-beds-clean.md @@ -0,0 +1,7 @@ +--- +"@livekit/agents": 
patch +"@livekit/agents-plugin-openai": patch +"livekit-agents-examples": patch +--- + +add ChatContext From 8b3b143af8291884e0ad279091b5896e2e3e0d9c Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Sat, 2 Nov 2024 04:35:51 +0200 Subject: [PATCH 7/8] mark as internal --- agents/src/llm/chat_context.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index de208a44..5abccef8 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -15,7 +15,10 @@ export interface ChatImage { image: string | AudioFrame; inferenceWidth?: number; inferenceHeight?: number; - /** Used by LLM implementations to store a processed version of the image for later use. */ + /** + * @internal + * Used by LLM implementations to store a processed version of the image for later use. + */ cache: { [id: string | number | symbol]: any }; } From c4cfdeea504b280c1742abb4ac111f9c54b517c8 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Sat, 2 Nov 2024 04:38:26 +0200 Subject: [PATCH 8/8] fmt --- agents/src/llm/chat_context.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 5abccef8..0043e2e6 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -15,7 +15,7 @@ export interface ChatImage { image: string | AudioFrame; inferenceWidth?: number; inferenceHeight?: number; - /** + /** * @internal * Used by LLM implementations to store a processed version of the image for later use. */