ValMobYKang · ValMobYKang · Dec 11, 2023 · Nov 23, 2023 · Nov 27, 2023 · Nov 30, 2023
diff --git a/.env.example b/.env.example
@@ -2,3 +2,5 @@ TOKENIZERS_PARALLELISM=false
-CONFLUENCE_URL=YOUR_CONFLUNECE_URL
+CONFLUENCE_URL=YOUR_CONFLUENCE_URL
 CONFLUENCE_SPACE=YOUR_SPACE_NAME
 MODEL=YOUR_GGUF_MODEL
+BITBUCKET_URL=YOUR_BITBUCKET_URL
+BITBUCKET_PROJECT=YOUR_BITBUCKET_PROJECT
diff --git a/.github/workflows/check_code.yml b/.github/workflows/check_code.yml
@@ -5,8 +5,6 @@ on:
     branches:
       - main
   pull_request:
-    branches:
-      - main
 
 jobs:
   ruff:

diff --git a/.gitignore b/.gitignore
@@ -161,6 +161,8 @@ cython_debug/
 
 
 # data 
-store/
+*store/
+store*/
 *cookies.txt
-auth.yaml
+auth.yaml
+*.local
diff --git a/src/backend.py b/src/backend.py
@@ -1,6 +1,4 @@
 import os
-import phoenix as px
-import llama_index
 from llama_index import (
     VectorStoreIndex,
     ServiceContext,
@@ -10,83 +8,161 @@
 from llama_index.llms import OpenAI
 from llama_index.prompts import PromptTemplate
 from llama_index.embeddings import HuggingFaceEmbedding
-from llama_index.node_parser import SimpleNodeParser
-from llama_index.text_splitter import TokenTextSplitter
 from llama_index.indices.prompt_helper import PromptHelper
-from utils import ConfluenceReader, SentenceTransformerRerank
+from llama_index.query_engine import CustomQueryEngine
+from llama_index.retrievers import BaseRetriever
+from llama_index.response_synthesizers import (
+    get_response_synthesizer,
+    BaseSynthesizer,
+)
+from llama_index.postprocessor.types import BaseNodePostprocessor
+from llama_index.schema import QueryBundle
+from llama_index.callbacks import CallbackManager
+
+import phoenix as px
+from phoenix.trace.llama_index import (
+    OpenInferenceTraceCallbackHandler,
+)
+
+from typing import Literal
+from utils import ConfluenceReader, SentenceTransformerRerank, BitbucketReader
 
+# The OpenAI client is configured through the environment: the API base is a
+# localhost endpoint and the API key is a placeholder string.
 os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
 os.environ["OPENAI_API_BASE"] = "http://localhost:8000/v1"
 
-LLM = OpenAI(temperature=0.1, max_tokens=2048)
-EMBEDDING = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
-RERANK = SentenceTransformerRerank(
-    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
-)
+# NOTE(review): this runs at import time, so importing the module launches
+# the Phoenix app as a side effect.
+session = px.launch_app()
+# Shared callback manager whose only handler is Phoenix's OpenInference
+# tracer; it is handed to the LLM, embedding model, reranker and query engine.
+cb_manager = CallbackManager(handlers=[OpenInferenceTraceCallbackHandler()])
+
+
+class QueryMultiEngine(CustomQueryEngine):
+    """Query engine that answers a single query from several retrievers.
+
+    The nodes returned by every retriever are concatenated, passed through
+    each postprocessor in order, and handed to the response synthesizer.
+    """
+
+    # Retrievers queried in order; their results are concatenated.
+    retrievers: list[BaseRetriever]
+    # Builds the final response from the query string and the remaining nodes.
+    response_synthesizer: BaseSynthesizer
+    # Applied sequentially to the merged node list.
+    node_postprocessors: list[BaseNodePostprocessor]
+
+    def custom_query(self, query_str: str):
+        """Retrieve, post-process and synthesize a response for ``query_str``.
+
+        Returns whatever ``response_synthesizer.synthesize`` returns.
+        """
+        nodes = []
+        for retriever in self.retrievers:
+            nodes.extend(retriever.retrieve(query_str))
+
+        # The bundle does not depend on the postprocessor, so build it once.
+        query_bundle = QueryBundle(query_str)
+        for postprocessor in self.node_postprocessors:
+            nodes = postprocessor.postprocess_nodes(
+                nodes=nodes, query_bundle=query_bundle
+            )
+
+        return self.response_synthesizer.synthesize(query_str, nodes)
+
+
+_SERVICE_CONTEXT = None
+
+
+def service_context():
+    """Return the shared ``ServiceContext`` used for indexing and querying.
+
+    The context is built on first use and reused afterwards, so the LLM
+    client and the embedding model are instantiated once per process rather
+    than on every call.
+    """
+    global _SERVICE_CONTEXT
+    if _SERVICE_CONTEXT is None:
+        llm = OpenAI(temperature=0.1, max_tokens=2048, callback_manager=cb_manager)
+        embed_model = HuggingFaceEmbedding(
+            model_name="BAAI/bge-base-en-v1.5", callback_manager=cb_manager
+        )
+        _SERVICE_CONTEXT = ServiceContext.from_defaults(
+            llm=llm,
+            chunk_size=512,
+            chunk_overlap=20,
+            embed_model=embed_model,
+            prompt_helper=PromptHelper(chunk_size_limit=1000),
+            callback_manager=cb_manager,
+        )
+    return _SERVICE_CONTEXT
 
 
-def init_index():
-    if os.path.exists("store"):
-        index = load_index_from_storage(
-            storage_context=StorageContext.from_defaults(persist_dir="store"),
-            service_context=ServiceContext.from_defaults(
-                llm=LLM,
-                embed_model=EMBEDDING,
-                prompt_helper=PromptHelper(chunk_size_limit=2000),
-            ),
+def init_index(persist_dir: Literal["confluence_store", "bitbucket_store"]):
+    """Load the index persisted in ``persist_dir``, or build and persist it.
+
+    If ``persist_dir`` already exists on disk, the stored index is loaded
+    from it. Otherwise documents are read from Bitbucket or Confluence
+    (selected by the directory name and configured through environment
+    variables), a ``VectorStoreIndex`` is built from them and persisted to
+    ``persist_dir``.
+
+    Raises:
+        ValueError: if the directory does not exist and ``persist_dir`` is
+            not one of the two supported store names.
+        KeyError: if a required environment variable is not set.
+    """
+    if os.path.exists(persist_dir):
+        print(f"Loading {persist_dir} ...")
+        return load_index_from_storage(
+            storage_context=StorageContext.from_defaults(persist_dir=persist_dir),
+            service_context=service_context(),
         )
-    else:
-        index = VectorStoreIndex.from_documents(
-            documents=ConfluenceReader(base_url=os.environ["CONFLUENCE_URL"]).load_data(
-                space_key=os.environ["CONFLUENCE_SPACE"],
-                page_status="current",
-                include_attachments=False,
-                max_num_results=10,
-            ),
-            service_context=ServiceContext.from_defaults(
-                llm=LLM,
-                node_parser=SimpleNodeParser.from_defaults(
-                    text_splitter=TokenTextSplitter(
-                        separator=" ",
-                        chunk_size=512,
-                        chunk_overlap=20,
-                        backup_separators=["\n"],
-                    )
-                ),
-                embed_model=EMBEDDING,
-            ),
-            show_progress=True,
+
+    if persist_dir == "bitbucket_store":
+        documents = BitbucketReader(
+            project_key=os.environ["BITBUCKET_PROJECT"],
+            base_url=os.environ["BITBUCKET_URL"],
+            branch="master",
+            # NOTE(review): the list mixes what look like repository or
+            # directory names with file extensions, and only the first entry
+            # has a leading dot -- confirm against BitbucketReader's matching.
+            extensions_to_skip=[
+                ".VIN-decoding",
+                "URL-generalization",
+                "scraping",
+                "FizzBuzz",
+                "Driver-Behaviour",
+                "VIN-OCR",
+                "Sensor-Log",
+                "png",
+                "jpg",
+                "ppm",
+            ],
+        ).load_data()
+    elif persist_dir == "confluence_store":
+        documents = ConfluenceReader(base_url=os.environ["CONFLUENCE_URL"]).load_data(
+            space_key=os.environ["CONFLUENCE_SPACE"],
+            page_status="current",
+            include_attachments=False,
+            max_num_results=10,
         )
-        index.storage_context.persist(persist_dir="store")
+    else:
+        raise ValueError(
+            f"Unknown persist_dir {persist_dir!r}: expected "
+            "'confluence_store' or 'bitbucket_store'"
+        )
+
+    index = VectorStoreIndex.from_documents(
+        documents=documents,
+        service_context=service_context(),
+        show_progress=True,
+    )
+    # Persist so the next call with the same directory takes the load path.
+    index.storage_context.persist(persist_dir=persist_dir)
+
     return index
 
 
-def get_query_engine(index):
-    return index.as_query_engine(
-        similarity_top_k=5,
-        response_mode="compact",
+def get_query_engine(indices: list):
+    """Build a query engine over one or more indices.
+
+    With exactly one index, that index's own ``as_query_engine`` is used.
+    For any other length a ``QueryMultiEngine`` is returned that retrieves
+    from every index. Both paths share the same reranker, prompt template,
+    top-k of 5 and "compact" response mode.
+
+    NOTE(review): an empty list also takes the ``QueryMultiEngine`` path,
+    producing an engine with no retrievers -- confirm whether that is wanted.
+    """
+    RERANK = SentenceTransformerRerank(
+        model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
+    )
+    # The callback manager is attached after construction rather than passed
+    # to the constructor.
+    RERANK.callback_manager = cb_manager
+    # Prompt with <|im_start|>/<|im_end|> role markers; {context_str} and
+    # {query_str} are the template placeholders.
+    dolphin_qa_prompt = PromptTemplate(
+        "<|im_start|>system \n"
+        "You will be presented with context. Your task is to answer the query only based on the context. "
+        "If the context cannot answer the query, you responses 'I don't know' directly without any more responses. \n"
+        "Approach this task step-by-step, take your time. "
+        "This is very important to my career.\n"
+        "The Context information is below. \n"
+        "---------------------\n{context_str}\n--------------------- <|im_end|>\n"
+        "<|im_start|>user \n"
+        "{query_str}<|im_end|> \n"
+        "<|im_start|>assistant"
+    )
+
+    # Single index: let the index build its own query engine.
+    if len(indices) == 1:
+        return indices[0].as_query_engine(
+            similarity_top_k=5,
+            service_context=service_context(),
+            response_mode="compact",
+            node_postprocessors=[RERANK],
+            text_qa_template=dolphin_qa_prompt,
+        )
+
+    # Several indices: one retriever per index, merged by QueryMultiEngine.
+    return QueryMultiEngine(
+        retrievers=[index.as_retriever(similarity_top_k=5) for index in indices],
         node_postprocessors=[RERANK],
-        text_qa_template=PromptTemplate(
-            "<|im_start|>system \n"
-            "You will be presented with context. You task is to answer the query only based on the context. "
-            "If the context cannot answer the query, you responses 'I don't know'. \n"
-            "Approach this task step-by-step, take your time. \n"
-            "This is very important to my career.<|im_end|>\n"
-            "The Context information is below. \n"
-            "---------------------\n{context_str}\n---------------------\n"
-            "<|im_start|>user \n"
-            "{query_str}<|im_end|> \n"
-            "<|im_start|>assistant"
+        response_synthesizer=get_response_synthesizer(
+            service_context=service_context(),
+            response_mode="compact",
+            text_qa_template=dolphin_qa_prompt,
         ),
+        callback_manager=cb_manager,
     )
 
 
 if __name__ == "__main__":
-    session = px.launch_app()
-    llama_index.set_global_handler("arize_phoenix")
-
-    query_engine = get_query_engine(index=init_index())
     print("[Develop mode]")
+
+    # Despite its name, this engine is built over both the Bitbucket and the
+    # Confluence index.
+    query_engine_bitbucket = get_query_engine(
+        indices=[
+            init_index(persist_dir="bitbucket_store"),
+            init_index(persist_dir="confluence_store"),
+        ]
+    )
+
+    # Interactive loop with no exit condition: read a question from stdin and
+    # print the engine's response.
     while 1:
         question = input("Question: ")
-        print(query_engine.query(question))
+        print(query_engine_bitbucket.query(question))
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,8 +5,6 @@ on: @@
         branches:
           - main
       pull_request:
-        branches:
-          - main
     jobs:
       ruff:
@@ Expand Down @@