-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add Dockerfile and ingest-guru-cards.py
- Loading branch information
Showing
16 changed files
with
597 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,8 @@ chroma_db/ | |
*.log | ||
log/ | ||
|
||
# MacOS files | ||
*.DS_STORE | ||
|
||
# .env contains secret API keys | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Container image for the Chainlit chatbot. The Guru cards export is downloaded
# and the ingest script is run while the image is being built.
FROM python:3.11-slim

WORKDIR /app

# Install Python dependencies before copying the sources so this layer stays
# cached until requirements.txt changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# curl is needed for the download below and for HEALTHCHECK; unzip unpacks the archive.
RUN apt-get update && apt-get install -y \
    curl unzip \
    && rm -rf /var/lib/apt/lists/*

# URL of the zipped Guru cards export; supply it with --build-arg GURU_CARDS_URL=...
# NOTE(review): the URL is echoed to the build log and recorded in image history;
# confirm it carries no secret token.
ARG GURU_CARDS_URL
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as download.zip.
RUN echo "Downloading from ${GURU_CARDS_URL}" \
    && curl -fL "${GURU_CARDS_URL}" > download.zip \
    && unzip -o download.zip \
    && rm download.zip \
    && mv guru_cards_for_nava--Multi-benefit.json guru_cards_for_nava.json

# Update .dockerignore to prevent files from being copied into the image
COPY . .

# Requires ingest-guru-cards.py to have its executable bit set in the build context.
RUN ./ingest-guru-cards.py

EXPOSE 8000
# --fail so that HTTP error responses (status >= 400) mark the container unhealthy;
# without it curl exits 0 for any response it receives.
HEALTHCHECK CMD curl -f http://localhost:8000 || exit 1
ENTRYPOINT ["chainlit", "run", "--port", "8000", "-h", "chatbot-chainlit.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!/usr/bin/env python | ||
import json | ||
import os | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
|
||
class GuruCardsProcessor:
    """Load a Guru cards JSON export and extract question/answer text from it."""

    def __init__(
        self,
        file_path="./guru_cards_for_nava.json",
        question_key="preferredPhrase",
        content_key="content",
    ):
        """Remember where the export lives and which card fields to read.

        - file_path is the path of the JSON file holding the cards
        - question_key is the card field holding the question text
        - content_key is the card field holding the HTML answer
        """
        self.file_path = file_path
        self.question_key = question_key
        self.content_key = content_key

    def extract_qa_text_from_guru(self):
        """Return a dict mapping each question to its plain-text answer."""
        return self._extract_question_answers(self.cards_as_json())

    def cards_as_json(self):
        """Return the parsed contents of the cards file."""
        with open(self.file_path, encoding="utf-8") as cards_file:
            cards = json.load(cards_file)
        return cards

    def _extract_question_answers(self, json_data):
        """Build the question -> answer mapping from the parsed cards.

        Only cards whose stripped question ends with "?" are kept; the HTML
        content is reduced to text with one line per text node.
        """
        qa_pairs = {}
        for card in json_data:
            question = card[self.question_key].strip()
            if question.endswith("?"):
                html = BeautifulSoup(card[self.content_key], "html.parser")
                qa_pairs[question] = html.get_text(separator="\n", strip=True)
        return qa_pairs
|
||
|
||
def save_simplified_json(gc_processor):
    """Save a simplified version of the Guru cards JSON file for easier review.

    The output is written next to the input as "<name>_simplified<ext>" and
    holds, for each card, the question, the comma-joined tag values, the
    comma-joined board titles, and the card content reduced to plain text.

    - gc_processor supplies cards_as_json(), file_path, question_key and content_key
    """
    question_key = gc_processor.question_key
    content_key = gc_processor.content_key

    # Build the whole result before opening the output file, so that a failure
    # on any card does not leave a truncated or empty file behind.
    simplified_json = []
    for card in gc_processor.cards_as_json():
        tags = [tag.get("value") for tag in card.get("tags", [])]
        boards = [board.get("title") for board in card.get("boards", [])]
        soup = BeautifulSoup(card[content_key], "html.parser")
        simplified_json.append(
            {
                # Use the processor's configured question field, in line with
                # how the content field is handled.
                question_key: card[question_key],
                "tags": ",".join(tags),
                "boards": ",".join(boards),
                content_key: soup.get_text(separator="\n", strip=True),
            }
        )

    name, ext = os.path.splitext(gc_processor.file_path)
    with open(f"{name}_simplified{ext}", "w", encoding="utf-8") as f:
        json.dump(simplified_json, f, indent=4)
|
||
|
||
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default cards file path.
    _cli_args = sys.argv[1:]
    _processor_kwargs = {"file_path": _cli_args[0]} if _cli_args else {}
    _gc_processor = GuruCardsProcessor(**_processor_kwargs)

    save_simplified_json(_gc_processor)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import logging | ||
|
||
from langchain.docstore.document import Document | ||
from langchain_text_splitters import (NLTKTextSplitter, | ||
RecursiveCharacterTextSplitter, | ||
SpacyTextSplitter) | ||
|
||
logger = logging.getLogger(__name__)


class TextSplitter:
    """Split card text into chunks that fit within an embedding model's token limit."""

    def __init__(self, llm_client, token_limit, text_splitter_name, **text_splitter_args):
        """
        - llm_client is used to get the number of tokens in a text
        - token_limit is the maximum number of tokens allowed by the embedding model
        - text_splitter_name selects the splitter; see create_text_splitter
        - text_splitter_args are forwarded to create_text_splitter
        """
        self.llm_client = llm_client
        self.token_limit = token_limit
        self.text_splitter = self.create_text_splitter(text_splitter_name, **text_splitter_args)

    def create_text_splitter(self, choice, **kwargs):
        """
        Return the splitter named by choice.

        - choice is "NLTKTextSplitter", "SpacyTextSplitter" or "RecursiveCharacterTextSplitter"
        - kwargs must contain chunk_size and chunk_overlap for
          RecursiveCharacterTextSplitter; they are ignored by the other two

        Raises ValueError when choice is not one of the supported names.
        """
        logger.info("Creating %s", choice)
        if choice == "NLTKTextSplitter":
            logger.warning("  Not using arguments: %s", kwargs)
            return NLTKTextSplitter()
        if choice == "SpacyTextSplitter":
            logger.warning("  Not using arguments: %s", kwargs)
            return SpacyTextSplitter()
        if choice == "RecursiveCharacterTextSplitter":
            logger.info("  Using arguments: %s", kwargs)
            return RecursiveCharacterTextSplitter(
                chunk_size=kwargs["chunk_size"], chunk_overlap=kwargs["chunk_overlap"]
            )
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(f"Unknown text splitter: {choice}")

    def split_into_chunks(self, title, text):
        """
        - title is the title to be used as the source of the text
        - text is the text to split

        Returns one Document per chunk; each carries the stripped title as
        "source" and the full title-plus-text as "entire_card" in its metadata.
        Raises AssertionError when a chunk has more tokens than token_limit.
        """
        entire_text = title + "\n\n" + text
        texts = self.text_splitter.split_text(entire_text)

        logger.info("  Split into %s", len(texts))
        for t in texts:
            token_count = self.llm_client.get_num_tokens(t)
            if token_count > self.token_limit:
                # Raised explicitly (same exception type as the former assert)
                # so the check still runs when Python is started with -O.
                raise AssertionError(f"Exceeded token limit of {self.token_limit}: {token_count}")

        return [Document(page_content=t, metadata={"source": title.strip(), "entire_card": entire_text}) for t in texts]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.