From f84a8953b6cbffad9917ef353d1c565c9b059d63 Mon Sep 17 00:00:00 2001 From: Reza Rahemtola Date: Tue, 6 Aug 2024 20:05:00 +0200 Subject: [PATCH] feat(kb): Chunks generation and basic search results --- package-lock.json | 1 + package.json | 1 + src/pages/Chat.vue | 65 ++++++++++++-------------------- src/pages/KnowledgeBase.vue | 50 +++++++++++++++++------- src/pages/KnowledgeBasesList.vue | 2 +- src/stores/chats.ts | 11 ------ src/stores/knowledge.ts | 12 +++++- src/stores/old-knowledge.ts | 56 --------------------------- src/types/knowledge.ts | 5 +++ src/utils/knowledge/default.ts | 33 ---------------- src/utils/knowledge/document.ts | 2 +- src/utils/knowledge/embedding.ts | 56 ++++++++++++++++++++------- 12 files changed, 123 insertions(+), 171 deletions(-) delete mode 100644 src/stores/old-knowledge.ts delete mode 100644 src/utils/knowledge/default.ts diff --git a/package-lock.json b/package-lock.json index a40c32b..56c6b27 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,6 +27,7 @@ "marked": "^13.0.3", "marked-highlight": "^2.1.3", "mime": "^4.0.4", + "ml-distance": "^4.0.1", "pdfjs-dist": "^4.5.136", "pinia": "^2.2.1", "pinia-plugin-persistedstate": "^3.2.1", diff --git a/package.json b/package.json index d6b54b3..4a2425b 100644 --- a/package.json +++ b/package.json @@ -38,6 +38,7 @@ "marked": "^13.0.3", "marked-highlight": "^2.1.3", "mime": "^4.0.4", + "ml-distance": "^4.0.1", "pdfjs-dist": "^4.5.136", "pinia": "^2.2.1", "pinia-plugin-persistedstate": "^3.2.1", diff --git a/src/pages/Chat.vue b/src/pages/Chat.vue index dd74ccd..a37e9dc 100644 --- a/src/pages/Chat.vue +++ b/src/pages/Chat.vue @@ -119,6 +119,8 @@ import { useSettingsStore } from 'stores/settings'; import { Chat, SendMessageParams, UIMessage } from 'src/types/chats'; import dayjs from 'dayjs'; import LtaiIcon from 'components/libertai/LtaiIcon.vue'; +import { searchDocuments } from 'src/utils/knowledge/embedding'; +import { useKnowledgeStore } from 'stores/knowledge'; const $q = 
useQuasar(); const route = useRoute(); @@ -128,6 +130,7 @@ const router = useRouter(); const chatsStore = useChatsStore(); const modelsStore = useModelsStore(); const settingsStore = useSettingsStore(); +const knowledgeStore = useKnowledgeStore(); // Local page state const isLoadingRef = ref(false); @@ -185,7 +188,8 @@ async function generatePersonaMessage() { const chatId = chatRef.value.id; const username = chatRef.value.username; - const messages = JSON.parse(JSON.stringify(chatRef.value.messages)); + const messages: UIMessage[] = JSON.parse(JSON.stringify(chatRef.value.messages)); + const knowledgeBaseIds = chatRef.value.knowledgeBases; const persona = chatRef.value.persona; const modelId = chatRef.value.modelId; @@ -211,17 +215,21 @@ async function generatePersonaMessage() { // Set loading state isLoadingRef.value = true; - // NOTE: assuming last message is guaranteed to be non-empty and the user's last message - // Get the last message from the user - // const lastMessage = messages[messages.length - 1]; - // const searchResultMessages: Message[] = []; - // const searchResults = await knowledgeStore.searchDocuments(lastMessage.content, chatTags); - // searchResults.forEach((result) => { - // searchResultMessages.push({ - // role: 'search-result', - // content: result.content, - // }); - // }); + let searchResultMessages: Message[] = []; + + // Finding related knowledge document chunks + if (knowledgeBaseIds.length > 0) { + const documents = knowledgeStore.getDocumentsFrom(knowledgeBaseIds); + const lastUserMessage = messages.findLast((message) => message.author === 'user')!; + const searchResults = await searchDocuments(lastUserMessage.content, documents); + console.log(searchResults); + searchResultMessages = searchResults.map( + (result): Message => ({ + role: 'search-result', + content: result.content, + }), + ); + } // Expand all the messages to inline any compatible attachments const expandedMessages = messages @@ -229,35 +237,12 @@ async function 
generatePersonaMessage() { const ret = []; // Push any attachments as messages ahead of the message itself message.attachments?.forEach((attachment) => { - if (attachment.content) { - ret.push({ - role: 'attachment', - content: `[${attachment.title}](${attachment.content})`, - }); - } - // else if (attachment.documentId) { - // ret.push({ - // role: 'attachment', - // content: `[${attachment.title}](document-id-${attachment.documentId})`, - // }); - // } + ret.push({ + role: 'attachment', + content: `[${attachment.title}](${attachment.content})`, + }); }); - // Push what search results we found based on the message - // TODO: this should probably be a more generic tool-call or llm-chain-link - // TODO: this should probably link back to the document id - // TODO: I should probably write these below messages in the log - // Really these search results should get attached to the message that - // lead to them being queried - // if (message.searchResults) { - // message.searchResults.forEach((result: Message) => { - // ret.push({ - // role: 'search-result', - // content: result.content, - // }); - // }); - // } - // Push the message itself ret.push(message); return ret; @@ -265,7 +250,7 @@ async function generatePersonaMessage() { .flat(); // Append the search results to the messages - const allMessages: Message[] = [...expandedMessages /*...searchResultMessages */]; + const allMessages: Message[] = [...expandedMessages, ...searchResultMessages]; // Generate a stream of responses from the AI for await (const output of inferenceEngine.generateAnswer(allMessages, model, persona, username, false)) { diff --git a/src/pages/KnowledgeBase.vue b/src/pages/KnowledgeBase.vue index a5ebfe3..8c1070a 100644 --- a/src/pages/KnowledgeBase.vue +++ b/src/pages/KnowledgeBase.vue @@ -43,7 +43,16 @@ - + @@ -51,7 +60,16 @@ Rename - + @@ -62,19 +80,22 @@ - - - - - - Are you sure you want to delete the the document {{ document.name }}? 
- - + + + + + Are you sure you want to delete the document {{ selectedDocument!.name }}? + + @@ -103,6 +124,7 @@ const knowledgeStore = useKnowledgeStore(); const knowledgeBaseRef = ref(undefined); const knowledgeBaseIdentifierRef = ref(undefined); +const selectedDocument = ref(undefined); const showRenameDocument = ref(false); const showDeleteDocumentConfirmation = ref(false); diff --git a/src/pages/KnowledgeBasesList.vue b/src/pages/KnowledgeBasesList.vue index 49e5b60..ceba983 100644 --- a/src/pages/KnowledgeBasesList.vue +++ b/src/pages/KnowledgeBasesList.vue @@ -22,7 +22,7 @@

{{ knowledgeBase.name }}

{{ knowledgeBase.documents.length }} File{{ knowledgeBase.documents.length !== 1 ? 's' : '' }}

-

Last updated: {{ dayjs(knowledgeBase.lastUpdatedAt).format('LL') }}

+

Last updated: {{ dayjs(knowledgeBase.lastUpdatedAt).format('LL') }}

diff --git a/src/stores/chats.ts b/src/stores/chats.ts index 14c1ee6..34847d3 100644 --- a/src/stores/chats.ts +++ b/src/stores/chats.ts @@ -8,17 +8,6 @@ import localforage from 'localforage'; const CHATS_STORE_NAME = 'chats-store'; const CHATS_STORE_PINIA_KEY = 'chats-store-pinia-key'; -// TODO: Search results are not yet implemented -/** - * Representation of a search result: - * interface SearchResult { - * // embedding document id - * documentId: string; - * // embedding content - * content: string; - * } - */ - type ChatsStoreState = { version: number; chats: Chat[]; diff --git a/src/stores/knowledge.ts b/src/stores/knowledge.ts index 2e74410..d3c8391 100644 --- a/src/stores/knowledge.ts +++ b/src/stores/knowledge.ts @@ -1,7 +1,7 @@ import { defineStore } from 'pinia'; import { v4 as uuidv4 } from 'uuid'; -import { KnowledgeBase, KnowledgeBaseIdentifier } from 'src/types/knowledge'; +import { KnowledgeBase, KnowledgeBaseIdentifier, KnowledgeDocument } from 'src/types/knowledge'; import { useAccountStore } from 'stores/account'; type KnowledgeStoreState = { @@ -16,6 +16,16 @@ export const useKnowledgeStore = defineStore('knowledge', { knowledgeBaseIdentifiers: [], isLoaded: false, }), + getters: { + getDocumentsFrom: (state) => { + return (ids: string[]): KnowledgeDocument[] => { + return state.knowledgeBases + .filter((kb) => ids.includes(kb.id)) + .map((kb) => kb.documents) + .flat(); + }; + }, + }, actions: { async load() { const { alephStorage } = useAccountStore(); diff --git a/src/stores/old-knowledge.ts b/src/stores/old-knowledge.ts deleted file mode 100644 index de06121..0000000 --- a/src/stores/old-knowledge.ts +++ /dev/null @@ -1,56 +0,0 @@ -import { defineStore } from 'pinia'; -import { Document, KnowledgeStore } from '@libertai/libertai-js'; -import { defaultKnowledge } from '../utils/knowledge/default'; -import { v4 as uuidv4 } from 'uuid'; - -export const DEFAULT_KNOWLEDGE_TAG = 'default'; -export const KNOWLEDGE_STORE_PINIA_KEY = 
'knowledge-store-pinia-key'; - -export const useKnowledgeStore = defineStore(KNOWLEDGE_STORE_PINIA_KEY, { - state: () => ({ - documents: [] as Document[], - knowledgeStore: new KnowledgeStore(), - }), - actions: { - async load() { - await this.knowledgeStore.load(); - await this.knowledgeStore.prune(); - const defaultDocumentTitles = defaultKnowledge.map((doc) => doc.title); - const docs: Document[] = Array.from(this.knowledgeStore.documents.values()); - const documentTitles = docs.map((doc) => doc.title); - // Check if default documents are already in the store - const missingDocuments = defaultDocumentTitles.filter((title) => !documentTitles.includes(title)); - // Add missing documents - const addedDocuments: Promise[] = []; - for (const title of missingDocuments) { - const doc = defaultKnowledge.find((doc) => doc.title === title)!; - const tags = doc.tags ? doc.tags : []; - tags.push(DEFAULT_KNOWLEDGE_TAG); - addedDocuments.push(this.addDocument(doc.title, doc.content, tags)); - docs.push({ ...doc, id: uuidv4() }); - } - await Promise.all(addedDocuments); - this.documents = docs; - }, - async addDocument(title: string, content: string, tags: string[] = []): Promise { - const doc = await this.knowledgeStore.addDocument(title, content, tags); - this.documents.push(doc); - return doc; - }, - async removeDocument(documentId: string) { - await this.knowledgeStore.removeDocument(documentId); - this.documents = this.documents.filter((doc) => doc.id !== documentId); - }, - async searchDocuments(query: string, tags: string[] = []) { - // If tags aren't empty, add the default tag - // Otherwise, if tags is empty, we'll just search - // with no filters, and the default tag will be included - if (tags.length > 0) { - tags.push(DEFAULT_KNOWLEDGE_TAG); - } - - // TODO: this should probably be none - return await this.knowledgeStore.searchDocuments(query, 3, 20, tags); - }, - }, -}); diff --git a/src/types/knowledge.ts b/src/types/knowledge.ts index 9c3d391..0056c21 100644 --- 
a/src/types/knowledge.ts +++ b/src/types/knowledge.ts @@ -1,5 +1,10 @@ import { z } from 'zod'; +export type KnowledgeSearchResult = { + content: string; + distance: number; +}; + const knowledgeDocumentChunk = z.object({ content: z.string(), vector: z.array(z.number()), diff --git a/src/utils/knowledge/default.ts b/src/utils/knowledge/default.ts deleted file mode 100644 index 4411717..0000000 --- a/src/utils/knowledge/default.ts +++ /dev/null @@ -1,33 +0,0 @@ -// Sample knowledge data about Aleph and Libertai (AI generated hehe) -export const defaultKnowledge = [ - { - title: 'Aleph.Im Background', - content: - 'Aleph.im is a decentralized cloud computing platform that aims ' + - 'to provide an alternative to traditional centralized cloud providers ' + - 'like Amazon Web Services (AWS) or Google Cloud. By leveraging blockchain ' + - 'technology and decentralized infrastructure, Aleph.im offers a range of ' + - 'benefits that set it apart from its centralized counterparts. Aleph.im offers ' + - 'improved scalability and fault tolerance. The decentralized nature of the platform means ' + - 'that it can handle increased demand and traffic without experiencing the same bottlenecks ' + - 'and performance issues that can plague centralized cloud providers. If one node in the network ' + - 'goes down, the system can automatically route requests to other available nodes, ensuring ' + - 'ensuring continuous uptime and availability. Another key feature of Aleph.im is its ' + - 'support for AI and machine learning workloads. The platform provides a range of tools ' + - 'and services that enable developers to deploy and run AI models in a decentralized manner. 
' + - 'This includes the ability to train and deploy large language models, which are a critical ' + - 'component of many modern AI applications.', - tags: ['Aleph.Im', 'Blockchain', 'Decentralized Cloud Computing', 'DePin', 'Aleph', 'web3'], - }, - { - title: 'Libertai.io Background', - content: - "Libertai.io is a decentralized AI platform that is built on top of Aleph.im's cloud computing platform. " + - 'Unlike traditional centralized AI platforms like OpenAI, Libertai.io is designed to operate in a decentralized manner, ' + - "leveraging the security and scalability of Aleph.im's infrastructure to provide both privacy and performance. " + - "Since the AI models that power the assistant are deployed and run on Aleph.im's network, rather than on a centralized server, " + - 'users can be confident that their data and interactions are not being monitored or exploited by a single entity. This is particularly ' + - 'important for individuals and organizations that are concerned about the privacy implications of using traditional AI assistants. 
', - tags: ['Libertai', 'AI', 'Decentralized AI', 'Aleph.Im', 'Privacy', 'Security', 'depin'], - }, -]; diff --git a/src/utils/knowledge/document.ts b/src/utils/knowledge/document.ts index aa9459a..83a2f6b 100644 --- a/src/utils/knowledge/document.ts +++ b/src/utils/knowledge/document.ts @@ -7,7 +7,7 @@ import { generateChunks } from 'src/utils/knowledge/embedding'; export const processDocument = async (file: File): Promise> => { const fileInfo = await extractFileContent(file); - const chunks = await generateChunks(file.name, fileInfo.content); + const chunks = await generateChunks(fileInfo.content); return { ...fileInfo, id: uuidv4(), name: file.name, size: file.size, chunks }; }; diff --git a/src/utils/knowledge/embedding.ts b/src/utils/knowledge/embedding.ts index 53dcd7c..dcfb213 100644 --- a/src/utils/knowledge/embedding.ts +++ b/src/utils/knowledge/embedding.ts @@ -1,12 +1,12 @@ import axios from 'axios'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; -import { KnowledgeDocumentChunk } from 'src/types/knowledge'; +import { KnowledgeDocument, KnowledgeDocumentChunk, KnowledgeSearchResult } from 'src/types/knowledge'; +import { distance } from 'ml-distance'; const DEFAULT_EMBEDDING_API_URL = 'https://curated.aleph.cloud/vm/ee1b2a8e5bd645447739d8b234ef495c9a2b4d0b98317d510a3ccf822808ebe5/embedding'; export const generateChunks = async ( - title: string, content: string, chunkSize = 500, overlapSize = 100, @@ -18,23 +18,51 @@ export const generateChunks = async ( }); // Split into a list of LangChain documents - const documents = await splitter.createDocuments( + const documentChunks = await splitter.createDocuments( [content], - // TODO: include metadata + // TODO: include metadata ? 
[], { - chunkHeader: `DOCUMENT TITLE: ${title}\n\n---\n\n`, - appendChunkOverlapHeader: true, + appendChunkOverlapHeader: false, }, ); - return await Promise.all( - documents.map( - async (d): Promise => ({ - content: d.pageContent, - vector: await embed(d.pageContent), - }), - ), - ); + const result: KnowledgeDocumentChunk[] = []; + + // Need to do this synchronously to avoid timeout on the embedding model API + for (const chunk of documentChunks) { + const embedding_vector = await embed(chunk.pageContent); + result.push({ + content: chunk.pageContent, + vector: embedding_vector, + }); + } + + return result; +}; + +export const searchDocuments = async ( + query: string, + documents: KnowledgeDocument[], + max_chunks = 5, + max_distance = 15, +): Promise => { + const query_vector = await embed(query); + const matches: KnowledgeSearchResult[] = []; + + // Iterate over all embeddings + documents.forEach((document) => { + document.chunks.forEach((chunk) => { + const euclidean_distance = distance.euclidean(query_vector, chunk.vector); + + // If the distance is greater than the max_distance, skip it + if (euclidean_distance > max_distance) return; + matches.push({ content: chunk.content, distance: euclidean_distance }); + }); + }); + + matches.sort((a, b) => a.distance - b.distance); + + return matches.slice(0, max_chunks); }; async function embed(content: string): Promise {