From f84a8953b6cbffad9917ef353d1c565c9b059d63 Mon Sep 17 00:00:00 2001 From: Reza Rahemtola Date: Tue, 6 Aug 2024 20:05:00 +0200 Subject: [PATCH] feat(kb): Chunks generation and basic search results --- package-lock.json | 1 + package.json | 1 + src/pages/Chat.vue | 65 ++++++++++++-------------------- src/pages/KnowledgeBase.vue | 50 +++++++++++++++++------- src/pages/KnowledgeBasesList.vue | 2 +- src/stores/chats.ts | 11 ------ src/stores/knowledge.ts | 12 +++++- src/stores/old-knowledge.ts | 56 --------------------------- src/types/knowledge.ts | 5 +++ src/utils/knowledge/default.ts | 33 ---------------- src/utils/knowledge/document.ts | 2 +- src/utils/knowledge/embedding.ts | 56 ++++++++++++++++++++------- 12 files changed, 123 insertions(+), 171 deletions(-) delete mode 100644 src/stores/old-knowledge.ts delete mode 100644 src/utils/knowledge/default.ts diff --git a/package-lock.json b/package-lock.json index a40c32b..56c6b27 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,6 +27,7 @@ "marked": "^13.0.3", "marked-highlight": "^2.1.3", "mime": "^4.0.4", + "ml-distance": "^4.0.1", "pdfjs-dist": "^4.5.136", "pinia": "^2.2.1", "pinia-plugin-persistedstate": "^3.2.1", diff --git a/package.json b/package.json index d6b54b3..4a2425b 100644 --- a/package.json +++ b/package.json @@ -38,6 +38,7 @@ "marked": "^13.0.3", "marked-highlight": "^2.1.3", "mime": "^4.0.4", + "ml-distance": "^4.0.1", "pdfjs-dist": "^4.5.136", "pinia": "^2.2.1", "pinia-plugin-persistedstate": "^3.2.1", diff --git a/src/pages/Chat.vue b/src/pages/Chat.vue index dd74ccd..a37e9dc 100644 --- a/src/pages/Chat.vue +++ b/src/pages/Chat.vue @@ -119,6 +119,8 @@ import { useSettingsStore } from 'stores/settings'; import { Chat, SendMessageParams, UIMessage } from 'src/types/chats'; import dayjs from 'dayjs'; import LtaiIcon from 'components/libertai/LtaiIcon.vue'; +import { searchDocuments } from 'src/utils/knowledge/embedding'; +import { useKnowledgeStore } from 'stores/knowledge'; const $q = 
useQuasar(); const route = useRoute(); @@ -128,6 +130,7 @@ const router = useRouter(); const chatsStore = useChatsStore(); const modelsStore = useModelsStore(); const settingsStore = useSettingsStore(); +const knowledgeStore = useKnowledgeStore(); // Local page state const isLoadingRef = ref(false); @@ -185,7 +188,8 @@ async function generatePersonaMessage() { const chatId = chatRef.value.id; const username = chatRef.value.username; - const messages = JSON.parse(JSON.stringify(chatRef.value.messages)); + const messages: UIMessage[] = JSON.parse(JSON.stringify(chatRef.value.messages)); + const knowledgeBaseIds = chatRef.value.knowledgeBases; const persona = chatRef.value.persona; const modelId = chatRef.value.modelId; @@ -211,17 +215,21 @@ async function generatePersonaMessage() { // Set loading state isLoadingRef.value = true; - // NOTE: assuming last message is guaranteed to be non-empty and the user's last message - // Get the last message from the user - // const lastMessage = messages[messages.length - 1]; - // const searchResultMessages: Message[] = []; - // const searchResults = await knowledgeStore.searchDocuments(lastMessage.content, chatTags); - // searchResults.forEach((result) => { - // searchResultMessages.push({ - // role: 'search-result', - // content: result.content, - // }); - // }); + let searchResultMessages: Message[] = []; + + // Finding related knowledge document chunks + if (knowledgeBaseIds.length > 0) { + const documents = knowledgeStore.getDocumentsFrom(knowledgeBaseIds); + const lastUserMessage = messages.findLast((message) => message.author === 'user')!; + const searchResults = await searchDocuments(lastUserMessage.content, documents); + console.log(searchResults); + searchResultMessages = searchResults.map( + (result): Message => ({ + role: 'search-result', + content: result.content, + }), + ); + } // Expand all the messages to inline any compatible attachments const expandedMessages = messages @@ -229,35 +237,12 @@ async function 
generatePersonaMessage() { const ret = []; // Push any attachments as messages ahead of the message itself message.attachments?.forEach((attachment) => { - if (attachment.content) { - ret.push({ - role: 'attachment', - content: `[${attachment.title}](${attachment.content})`, - }); - } - // else if (attachment.documentId) { - // ret.push({ - // role: 'attachment', - // content: `[${attachment.title}](document-id-${attachment.documentId})`, - // }); - // } + ret.push({ + role: 'attachment', + content: `[${attachment.title}](${attachment.content})`, + }); }); - // Push what search results we found based on the message - // TODO: this should probably be a more generic tool-call or llm-chain-link - // TODO: this should probably link back to the document id - // TODO: I should probably write these below messages in the log - // Really these search results should get attached to the message that - // lead to them being queried - // if (message.searchResults) { - // message.searchResults.forEach((result: Message) => { - // ret.push({ - // role: 'search-result', - // content: result.content, - // }); - // }); - // } - // Push the message itself ret.push(message); return ret; @@ -265,7 +250,7 @@ async function generatePersonaMessage() { .flat(); // Append the search results to the messages - const allMessages: Message[] = [...expandedMessages /*...searchResultMessages */]; + const allMessages: Message[] = [...expandedMessages, ...searchResultMessages]; // Generate a stream of responses from the AI for await (const output of inferenceEngine.generateAnswer(allMessages, model, persona, username, false)) { diff --git a/src/pages/KnowledgeBase.vue b/src/pages/KnowledgeBase.vue index a5ebfe3..8c1070a 100644 --- a/src/pages/KnowledgeBase.vue +++ b/src/pages/KnowledgeBase.vue @@ -43,7 +43,16 @@ - + @@ -51,7 +60,16 @@ Rename - + @@ -62,19 +80,22 @@ - - - - - - Are you sure you want to delete the the document {{ document.name }}? 
- - + + + + + Are you sure you want to delete the document {{ selectedDocument!.name }}? + + @@ -103,6 +124,7 @@ const knowledgeStore = useKnowledgeStore(); const knowledgeBaseRef = ref(undefined); const knowledgeBaseIdentifierRef = ref(undefined); +const selectedDocument = ref(undefined); const showRenameDocument = ref(false); const showDeleteDocumentConfirmation = ref(false); diff --git a/src/pages/KnowledgeBasesList.vue b/src/pages/KnowledgeBasesList.vue index 49e5b60..ceba983 100644 --- a/src/pages/KnowledgeBasesList.vue +++ b/src/pages/KnowledgeBasesList.vue @@ -22,7 +22,7 @@

{{ knowledgeBase.name }}

{{ knowledgeBase.documents.length }} File{{ knowledgeBase.documents.length !== 1 ? 's' : '' }}

-

Last updated: {{ dayjs(knowledgeBase.lastUpdatedAt).format('LL') }}

+

Last updated: {{ dayjs(knowledgeBase.lastUpdatedAt).format('LL') }}

diff --git a/src/stores/chats.ts b/src/stores/chats.ts index 14c1ee6..34847d3 100644 --- a/src/stores/chats.ts +++ b/src/stores/chats.ts @@ -8,17 +8,6 @@ import localforage from 'localforage'; const CHATS_STORE_NAME = 'chats-store'; const CHATS_STORE_PINIA_KEY = 'chats-store-pinia-key'; -// TODO: Search results are not yet implemented -/** - * Representation of a search result: - * interface SearchResult { - * // embedding document id - * documentId: string; - * // embedding content - * content: string; - * } - */ - type ChatsStoreState = { version: number; chats: Chat[]; diff --git a/src/stores/knowledge.ts b/src/stores/knowledge.ts index 2e74410..d3c8391 100644 --- a/src/stores/knowledge.ts +++ b/src/stores/knowledge.ts @@ -1,7 +1,7 @@ import { defineStore } from 'pinia'; import { v4 as uuidv4 } from 'uuid'; -import { KnowledgeBase, KnowledgeBaseIdentifier } from 'src/types/knowledge'; +import { KnowledgeBase, KnowledgeBaseIdentifier, KnowledgeDocument } from 'src/types/knowledge'; import { useAccountStore } from 'stores/account'; type KnowledgeStoreState = { @@ -16,6 +16,16 @@ export const useKnowledgeStore = defineStore('knowledge', { knowledgeBaseIdentifiers: [], isLoaded: false, }), + getters: { + getDocumentsFrom: (state) => { + return (ids: string[]): KnowledgeDocument[] => { + return state.knowledgeBases + .filter((kb) => ids.includes(kb.id)) + .map((kb) => kb.documents) + .flat(); + }; + }, + }, actions: { async load() { const { alephStorage } = useAccountStore(); diff --git a/src/stores/old-knowledge.ts b/src/stores/old-knowledge.ts deleted file mode 100644 index de06121..0000000 --- a/src/stores/old-knowledge.ts +++ /dev/null @@ -1,56 +0,0 @@ -import { defineStore } from 'pinia'; -import { Document, KnowledgeStore } from '@libertai/libertai-js'; -import { defaultKnowledge } from '../utils/knowledge/default'; -import { v4 as uuidv4 } from 'uuid'; - -export const DEFAULT_KNOWLEDGE_TAG = 'default'; -export const KNOWLEDGE_STORE_PINIA_KEY = 
'knowledge-store-pinia-key'; - -export const useKnowledgeStore = defineStore(KNOWLEDGE_STORE_PINIA_KEY, { - state: () => ({ - documents: [] as Document[], - knowledgeStore: new KnowledgeStore(), - }), - actions: { - async load() { - await this.knowledgeStore.load(); - await this.knowledgeStore.prune(); - const defaultDocumentTitles = defaultKnowledge.map((doc) => doc.title); - const docs: Document[] = Array.from(this.knowledgeStore.documents.values()); - const documentTitles = docs.map((doc) => doc.title); - // Check if default documents are already in the store - const missingDocuments = defaultDocumentTitles.filter((title) => !documentTitles.includes(title)); - // Add missing documents - const addedDocuments: Promise[] = []; - for (const title of missingDocuments) { - const doc = defaultKnowledge.find((doc) => doc.title === title)!; - const tags = doc.tags ? doc.tags : []; - tags.push(DEFAULT_KNOWLEDGE_TAG); - addedDocuments.push(this.addDocument(doc.title, doc.content, tags)); - docs.push({ ...doc, id: uuidv4() }); - } - await Promise.all(addedDocuments); - this.documents = docs; - }, - async addDocument(title: string, content: string, tags: string[] = []): Promise { - const doc = await this.knowledgeStore.addDocument(title, content, tags); - this.documents.push(doc); - return doc; - }, - async removeDocument(documentId: string) { - await this.knowledgeStore.removeDocument(documentId); - this.documents = this.documents.filter((doc) => doc.id !== documentId); - }, - async searchDocuments(query: string, tags: string[] = []) { - // If tags aren't empty, add the default tag - // Otherwise, if tags is empty, we'll just search - // with no filters, and the default tag will be included - if (tags.length > 0) { - tags.push(DEFAULT_KNOWLEDGE_TAG); - } - - // TODO: this should probably be none - return await this.knowledgeStore.searchDocuments(query, 3, 20, tags); - }, - }, -}); diff --git a/src/types/knowledge.ts b/src/types/knowledge.ts index 9c3d391..0056c21 100644 --- 
a/src/types/knowledge.ts +++ b/src/types/knowledge.ts @@ -1,5 +1,10 @@ import { z } from 'zod'; +export type KnowledgeSearchResult = { + content: string; + distance: number; +}; + const knowledgeDocumentChunk = z.object({ content: z.string(), vector: z.array(z.number()), diff --git a/src/utils/knowledge/default.ts b/src/utils/knowledge/default.ts deleted file mode 100644 index 4411717..0000000 --- a/src/utils/knowledge/default.ts +++ /dev/null @@ -1,33 +0,0 @@ -// Sample knowledge data about Aleph and Libertai (AI generated hehe) -export const defaultKnowledge = [ - { - title: 'Aleph.Im Background', - content: - 'Aleph.im is a decentralized cloud computing platform that aims ' + - 'to provide an alternative to traditional centralized cloud providers ' + - 'like Amazon Web Services (AWS) or Google Cloud. By leveraging blockchain ' + - 'technology and decentralized infrastructure, Aleph.im offers a range of ' + - 'benefits that set it apart from its centralized counterparts. Aleph.im offers ' + - 'improved scalability and fault tolerance. The decentralized nature of the platform means ' + - 'that it can handle increased demand and traffic without experiencing the same bottlenecks ' + - 'and performance issues that can plague centralized cloud providers. If one node in the network ' + - 'goes down, the system can automatically route requests to other available nodes, ensuring ' + - 'ensuring continuous uptime and availability. Another key feature of Aleph.im is its ' + - 'support for AI and machine learning workloads. The platform provides a range of tools ' + - 'and services that enable developers to deploy and run AI models in a decentralized manner. 
' + - 'This includes the ability to train and deploy large language models, which are a critical ' + - 'component of many modern AI applications.', - tags: ['Aleph.Im', 'Blockchain', 'Decentralized Cloud Computing', 'DePin', 'Aleph', 'web3'], - }, - { - title: 'Libertai.io Background', - content: - "Libertai.io is a decentralized AI platform that is built on top of Aleph.im's cloud computing platform. " + - 'Unlike traditional centralized AI platforms like OpenAI, Libertai.io is designed to operate in a decentralized manner, ' + - "leveraging the security and scalability of Aleph.im's infrastructure to provide both privacy and performance. " + - "Since the AI models that power the assistant are deployed and run on Aleph.im's network, rather than on a centralized server, " + - 'users can be confident that their data and interactions are not being monitored or exploited by a single entity. This is particularly ' + - 'important for individuals and organizations that are concerned about the privacy implications of using traditional AI assistants. 
', - tags: ['Libertai', 'AI', 'Decentralized AI', 'Aleph.Im', 'Privacy', 'Security', 'depin'], - }, -]; diff --git a/src/utils/knowledge/document.ts b/src/utils/knowledge/document.ts index aa9459a..83a2f6b 100644 --- a/src/utils/knowledge/document.ts +++ b/src/utils/knowledge/document.ts @@ -7,7 +7,7 @@ import { generateChunks } from 'src/utils/knowledge/embedding'; export const processDocument = async (file: File): Promise> => { const fileInfo = await extractFileContent(file); - const chunks = await generateChunks(file.name, fileInfo.content); + const chunks = await generateChunks(fileInfo.content); return { ...fileInfo, id: uuidv4(), name: file.name, size: file.size, chunks }; }; diff --git a/src/utils/knowledge/embedding.ts b/src/utils/knowledge/embedding.ts index 53dcd7c..dcfb213 100644 --- a/src/utils/knowledge/embedding.ts +++ b/src/utils/knowledge/embedding.ts @@ -1,12 +1,12 @@ import axios from 'axios'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; -import { KnowledgeDocumentChunk } from 'src/types/knowledge'; +import { KnowledgeDocument, KnowledgeDocumentChunk, KnowledgeSearchResult } from 'src/types/knowledge'; +import { distance } from 'ml-distance'; const DEFAULT_EMBEDDING_API_URL = 'https://curated.aleph.cloud/vm/ee1b2a8e5bd645447739d8b234ef495c9a2b4d0b98317d510a3ccf822808ebe5/embedding'; export const generateChunks = async ( - title: string, content: string, chunkSize = 500, overlapSize = 100, @@ -18,23 +18,51 @@ export const generateChunks = async ( }); // Split into a list of LangChain documents - const documents = await splitter.createDocuments( + const documentChunks = await splitter.createDocuments( [content], - // TODO: include metadata + // TODO: include metadata ? 
[], { - chunkHeader: `DOCUMENT TITLE: ${title}\n\n---\n\n`, - appendChunkOverlapHeader: true, + appendChunkOverlapHeader: false, }, ); - return await Promise.all( - documents.map( - async (d): Promise => ({ - content: d.pageContent, - vector: await embed(d.pageContent), - }), - ), - ); + const result: KnowledgeDocumentChunk[] = []; + + // Need to do this synchronously to avoid timeout on the embedding model API + for (const chunk of documentChunks) { + const embedding_vector = await embed(chunk.pageContent); + result.push({ + content: chunk.pageContent, + vector: embedding_vector, + }); + } + + return result; +}; + +export const searchDocuments = async ( + query: string, + documents: KnowledgeDocument[], + max_chunks = 5, + max_distance = 15, +): Promise => { + const query_vector = await embed(query); + const matches: KnowledgeSearchResult[] = []; + + // Iterate over all embeddings + documents.forEach((document) => { + document.chunks.forEach((chunk) => { + const euclidean_distance = distance.euclidean(query_vector, chunk.vector); + + // If the distance is greater than the max_distance, skip it + if (euclidean_distance > max_distance) return; + matches.push({ content: chunk.content, distance: euclidean_distance }); + }); + }); + + matches.sort((a, b) => a.distance - b.distance); + + return matches.slice(0, max_chunks); }; async function embed(content: string): Promise {