Skip to content

Commit

Permalink
708 move logic for detecting output defence bot filtering (#740)
Browse files Browse the repository at this point in the history
* Renamed method to be clear what defences are being checked

* Moved detection of output defences

* Using await rather than then

* Clearer use of the input defence report

* WIP: openai file doesn't know about the defence report

* WIP: Using new pushMessageToHistory method

* Fixed chat history

* Simpler combining of defence reports

* Consistent blocking rules

* Not mutating chatResponse in the performToolCalls method

* Better loop

* Not mutating chatResponse in the chatGptChatCompletion method

* Simplified return

* Method to add the user messages to chat history

* Better output defence report

* Moved combineChatDefenceReports to chat controller

* No longer exporting getFilterList and detectFilterList

* Fixed test build errors

* detectTriggeredOutputDefences unit tests

* Fixed chat controller tests

* Removed output filtering integration tests

This code is now covered by the unit tests

* Moved utils method to new file

* Fixed remaining tests

* pushMessageToHistory unit tests

* WIP: Now using the updated chat response

* WIP: Fixed chat utils tests

* WIP: Fixed remaining tests

* Fix for response not being set properly

* No longer adding transformed message twice

* Nicer chat while loop

* Only sending back sent emails, not total emails

* Fixed tests

* Using flatMap

* const updatedChatHistory in low level chat

* Constructing chat response at the end of high level chat

Like what is done in low level chat

* Removed wrong comment

* Fixed tests

* Better function name

* Better promise name

* Not setting sent emails if the message was blocked

* refactor chathistory code to reduce mutation

* change test names and add comment

* adds history check to first test

* added second history check

* removed some comments

* correct some tests in integration/chatController.test

* adds unit test for chatController to make sure history is updated properly

* fixes defence trigger tests that were broken by mocks

* refactors reused mocking code

* added unit test to check history update in sandbox

* update first test to include existing history

* makes second test use existing history

* adds comment that points out some weirdness

* polishes off those tests

* fixes weirdness about combining the empty defence report

* fixes problem of not getting updated chat history

* respond to chris - makes chatHistoryWithNewUsermessages more concise

* respond to chris - adds back useful comment

* simplify transformed message ternary expression

* refactors transformMessage and only calls combineTransformedMessage once

---------

Co-authored-by: Peter Marsh <[email protected]>
  • Loading branch information
gsproston-scottlogic and pmarsh-scottlogic authored Jan 18, 2024
1 parent 45e2a41 commit b2f1a42
Show file tree
Hide file tree
Showing 12 changed files with 907 additions and 367 deletions.
215 changes: 129 additions & 86 deletions backend/src/controller/chatController.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@ import { Response } from 'express';

import {
transformMessage,
detectTriggeredDefences,
detectTriggeredInputDefences,
combineTransformedMessage,
detectTriggeredOutputDefences,
} from '@src/defence';
import { OpenAiAddHistoryRequest } from '@src/models/api/OpenAiAddHistoryRequest';
import { OpenAiChatRequest } from '@src/models/api/OpenAiChatRequest';
import { OpenAiClearRequest } from '@src/models/api/OpenAiClearRequest';
import { OpenAiGetHistoryRequest } from '@src/models/api/OpenAiGetHistoryRequest';
import {
CHAT_MESSAGE_TYPE,
ChatDefenceReport,
ChatHistoryMessage,
ChatHttpResponse,
ChatModel,
Expand All @@ -21,27 +23,81 @@ import { Defence } from '@src/models/defence';
import { EmailInfo } from '@src/models/email';
import { LEVEL_NAMES } from '@src/models/level';
import { chatGptSendMessage } from '@src/openai';
import { pushMessageToHistory } from '@src/utils/chat';

import { handleChatError } from './handleError';

/**
 * Merges several defence reports (e.g. the input-detection report and the
 * output-detection report) into a single report for the chat response.
 *
 * @param reports - the individual reports to combine; may be empty.
 * @returns a report that is blocked if any input report is blocked, with all
 *   alerted/triggered defences concatenated. `blockedReason` is the
 *   newline-joined reasons, or `null` when no report carried a reason —
 *   matching the `blockedReason: null` convention used by the initial
 *   chat response elsewhere in this controller.
 */
function combineChatDefenceReports(
	reports: ChatDefenceReport[]
): ChatDefenceReport {
	const blockedReasons = reports
		.filter((report) => report.blockedReason !== null)
		.map((report) => report.blockedReason);

	return {
		// null (not '') when nothing was blocked, so consumers can use a
		// simple null check instead of testing for an empty string
		blockedReason: blockedReasons.length ? blockedReasons.join('\n') : null,
		isBlocked: reports.some((report) => report.isBlocked),
		alertedDefences: reports.flatMap((report) => report.alertedDefences),
		triggeredDefences: reports.flatMap((report) => report.triggeredDefences),
	};
}

/**
 * Builds the chat-history entries for an incoming user message.
 *
 * @param message - the raw message the user typed.
 * @param transformedMessage - the defence-transformed version of the message,
 *   or `null` when no transformation was applied.
 * @returns one entry (the plain user completion) when untransformed; two
 *   entries when transformed — the original as an info-only record plus the
 *   transformed text as the completion that is actually sent to the model.
 */
function createNewUserMessages(
	message: string,
	transformedMessage: string | null
): ChatHistoryMessage[] {
	if (!transformedMessage) {
		// untransformed: record the message as a normal user completion
		return [
			{
				completion: {
					role: 'user',
					content: message,
				},
				chatMessageType: CHAT_MESSAGE_TYPE.USER,
			},
		];
	}

	// transformed: keep the original for display only (no completion), and
	// store the transformed text as the completion the model will see
	const originalEntry: ChatHistoryMessage = {
		completion: null,
		chatMessageType: CHAT_MESSAGE_TYPE.USER,
		infoMessage: message,
	};
	const transformedEntry: ChatHistoryMessage = {
		completion: {
			role: 'user',
			content: transformedMessage,
		},
		chatMessageType: CHAT_MESSAGE_TYPE.USER_TRANSFORMED,
	};
	return [originalEntry, transformedEntry];
}

// handle the chat logic for level 1 and 2 with no defences applied
async function handleLowLevelChat(
message: string,
chatResponse: ChatHttpResponse,
currentLevel: LEVEL_NAMES,
chatModel: ChatModel,
chatHistory: ChatHistoryMessage[],
defences: Defence[],
sentEmails: EmailInfo[]
defences: Defence[]
): Promise<LevelHandlerResponse> {
const updatedChatHistory = createNewUserMessages(message, null).reduce(
pushMessageToHistory,
chatHistory
);

// get the chatGPT reply
const openAiReply = await chatGptSendMessage(
chatHistory,
updatedChatHistory,
defences,
chatModel,
message,
false,
sentEmails,
currentLevel
);

Expand All @@ -65,82 +121,71 @@ async function handleHigherLevelChat(
currentLevel: LEVEL_NAMES,
chatModel: ChatModel,
chatHistory: ChatHistoryMessage[],
defences: Defence[],
sentEmails: EmailInfo[]
defences: Defence[]
): Promise<LevelHandlerResponse> {
let updatedChatHistory = [...chatHistory];
let updatedChatResponse = {
...chatResponse,
};
// transform the message according to active defences
const transformedMessage = transformMessage(message, defences);
const transformedMessageCombined = transformedMessage
? combineTransformedMessage(transformedMessage)
: null;
const chatHistoryWithNewUserMessages = createNewUserMessages(
message,
transformedMessageCombined ?? null
).reduce(pushMessageToHistory, chatHistory);

if (transformedMessage) {
// if message has been transformed then add the original to chat history and send transformed to chatGPT
updatedChatHistory = [
...updatedChatHistory,
{
completion: null,
chatMessageType: CHAT_MESSAGE_TYPE.USER,
infoMessage: message,
},
];

updatedChatResponse = {
...updatedChatResponse,
transformedMessage,
};
}
// detect defences on input message
const triggeredDefencesPromise = detectTriggeredDefences(message, defences);
const triggeredInputDefencesPromise = detectTriggeredInputDefences(
message,
defences
);

// get the chatGPT reply
const openAiReplyPromise = chatGptSendMessage(
updatedChatHistory,
chatHistoryWithNewUserMessages,
defences,
chatModel,
transformedMessage
? combineTransformedMessage(transformedMessage)
: message,
transformedMessage ? true : false,
sentEmails,
transformedMessageCombined ?? message,
currentLevel
);

// run defence detection and chatGPT concurrently
const [defenceReport, openAiReply] = await Promise.all([
triggeredDefencesPromise,
const [inputDefenceReport, openAiReply] = await Promise.all([
triggeredInputDefencesPromise,
openAiReplyPromise,
]);

// if input message is blocked, restore the original chat history and add user message (not as completion)
if (defenceReport.isBlocked) {
updatedChatHistory = [
...updatedChatHistory,
{
const botReply = openAiReply.chatResponse.completion?.content?.toString();
const outputDefenceReport = botReply
? detectTriggeredOutputDefences(botReply, defences)
: null;

const defenceReports = outputDefenceReport
? [inputDefenceReport, outputDefenceReport]
: [inputDefenceReport];
const combinedDefenceReport = combineChatDefenceReports(defenceReports);

// if blocked, restore original chat history and add user message to chat history without completion
const updatedChatHistory = combinedDefenceReport.isBlocked
? pushMessageToHistory(chatHistory, {
completion: null,
chatMessageType: CHAT_MESSAGE_TYPE.USER,
infoMessage: message,
},
];
updatedChatResponse = {
...updatedChatResponse,
defenceReport,
};
} else {
updatedChatHistory = openAiReply.chatHistory;
updatedChatResponse = {
...updatedChatResponse,
reply: openAiReply.chatResponse.completion?.content?.toString() ?? '',
wonLevel: openAiReply.chatResponse.wonLevel,
openAIErrorMessage: openAiReply.chatResponse.openAIErrorMessage,
defenceReport,
};
}
})
: openAiReply.chatHistory;

const updatedChatResponse: ChatHttpResponse = {
...chatResponse,
defenceReport: combinedDefenceReport,
openAIErrorMessage: openAiReply.chatResponse.openAIErrorMessage,
reply: !combinedDefenceReport.isBlocked && botReply ? botReply : '',
transformedMessage: transformedMessage ?? undefined,
wonLevel:
openAiReply.chatResponse.wonLevel && !combinedDefenceReport.isBlocked,
};
return {
chatResponse: updatedChatResponse,
chatHistory: updatedChatHistory,
sentEmails: openAiReply.sentEmails,
sentEmails: combinedDefenceReport.isBlocked ? [] : openAiReply.sentEmails,
};
}

Expand All @@ -149,7 +194,7 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
const initChatResponse: ChatHttpResponse = {
reply: '',
defenceReport: {
blockedReason: '',
blockedReason: null,
isBlocked: false,
alertedDefences: [],
triggeredDefences: [],
Expand Down Expand Up @@ -184,8 +229,6 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
const totalSentEmails: EmailInfo[] = [
...req.session.levelState[currentLevel].sentEmails,
];
// keep track of the number of sent emails
const numSentEmails = totalSentEmails.length;

// use default model for levels, allow user to select in sandbox
const chatModel =
Expand All @@ -208,8 +251,7 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
currentLevel,
chatModel,
currentChatHistory,
defences,
totalSentEmails
defences
);
} else {
// apply the defence detection for level 3 and sandbox
Expand All @@ -219,8 +261,7 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
currentLevel,
chatModel,
currentChatHistory,
defences,
totalSentEmails
defences
);
}
} catch (error) {
Expand All @@ -234,7 +275,7 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
return;
}

const updatedChatHistory = levelResult.chatHistory;
let updatedChatHistory = levelResult.chatHistory;
totalSentEmails.push(...levelResult.sentEmails);

// update chat response
Expand All @@ -243,22 +284,19 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
reply: levelResult.chatResponse.reply,
wonLevel: levelResult.chatResponse.wonLevel,
openAIErrorMessage: levelResult.chatResponse.openAIErrorMessage,
sentEmails: levelResult.sentEmails.slice(numSentEmails),
sentEmails: levelResult.sentEmails,
defenceReport: levelResult.chatResponse.defenceReport,
transformedMessage: levelResult.chatResponse.transformedMessage,
};

if (updatedChatResponse.defenceReport.isBlocked) {
// chatReponse.reply is empty if blocked
updatedChatHistory.push({
updatedChatHistory = pushMessageToHistory(updatedChatHistory, {
completion: null,
chatMessageType: CHAT_MESSAGE_TYPE.BOT_BLOCKED,
infoMessage: updatedChatResponse.defenceReport.blockedReason,
});
}

// more error handling
else if (updatedChatResponse.openAIErrorMessage) {
} else if (updatedChatResponse.openAIErrorMessage) {
const errorMsg = simplifyOpenAIErrorMessage(
updatedChatResponse.openAIErrorMessage
);
Expand All @@ -276,6 +314,15 @@ async function handleChatToGPT(req: OpenAiChatRequest, res: Response) {
);
handleChatError(res, updatedChatResponse, errorMsg, 500);
return;
} else {
// add bot message to chat history
updatedChatHistory = pushMessageToHistory(updatedChatHistory, {
completion: {
role: 'assistant',
content: updatedChatResponse.reply,
},
chatMessageType: CHAT_MESSAGE_TYPE.BOT,
});
}

// update state
Expand Down Expand Up @@ -303,16 +350,12 @@ function addErrorToChatHistory(
chatHistory: ChatHistoryMessage[],
errorMessage: string
): ChatHistoryMessage[] {
const updatedChatHistory = [
...chatHistory,
{
completion: null,
chatMessageType: CHAT_MESSAGE_TYPE.ERROR_MSG,
infoMessage: errorMessage,
},
];
console.error(errorMessage);
return updatedChatHistory;
return pushMessageToHistory(chatHistory, {
completion: null,
chatMessageType: CHAT_MESSAGE_TYPE.ERROR_MSG,
infoMessage: errorMessage,
});
}

function handleGetChatHistory(req: OpenAiGetHistoryRequest, res: Response) {
Expand All @@ -335,14 +378,14 @@ function handleAddToChatHistory(req: OpenAiAddHistoryRequest, res: Response) {
level !== undefined &&
level >= LEVEL_NAMES.LEVEL_1
) {
req.session.levelState[level].chatHistory = [
...req.session.levelState[level].chatHistory,
req.session.levelState[level].chatHistory = pushMessageToHistory(
req.session.levelState[level].chatHistory,
{
completion: null,
chatMessageType,
infoMessage,
},
];
}
);
res.send();
} else {
res.status(400);
Expand Down
Loading

0 comments on commit b2f1a42

Please sign in to comment.