Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

working es #354

Merged
merged 5 commits into from
Sep 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions application/api/answer/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@


from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator
from application.llm.llm_creator import LLMCreator
from application.vectorstore.faiss import FaissStore
from application.error import bad_request


Expand Down Expand Up @@ -226,7 +226,7 @@ def stream():
vectorstore = get_vectorstore({"active_docs": data["active_docs"]})
else:
vectorstore = ""
docsearch = FaissStore(vectorstore, embeddings_key)
docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)

return Response(
complete_stream(question, docsearch,
Expand Down Expand Up @@ -260,7 +260,7 @@ def api_answer():
vectorstore = get_vectorstore(data)
# loading the index and the store and the prompt template
# Note: if you have used embeddings other than OpenAI, you need to change the embeddings
docsearch = FaissStore(vectorstore, embeddings_key)
docsearch = VectorCreator.create_vectorstore(settings.VECTOR_STORE, vectorstore, embeddings_key)


llm = LLMCreator.create_llm(settings.LLM_NAME, api_key=api_key)
Expand Down
37 changes: 19 additions & 18 deletions application/api/internal/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,25 +34,26 @@ def upload_index_files():
if "name" not in request.form:
return {"status": "no name"}
job_name = secure_filename(request.form["name"])
if "file_faiss" not in request.files:
print("No file part")
return {"status": "no file"}
file_faiss = request.files["file_faiss"]
if file_faiss.filename == "":
return {"status": "no file name"}
if "file_pkl" not in request.files:
print("No file part")
return {"status": "no file"}
file_pkl = request.files["file_pkl"]
if file_pkl.filename == "":
return {"status": "no file name"}

# saves index files
save_dir = os.path.join(current_dir, "indexes", user, job_name)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
file_faiss.save(os.path.join(save_dir, "index.faiss"))
file_pkl.save(os.path.join(save_dir, "index.pkl"))
if settings.VECTOR_STORE == "faiss":
if "file_faiss" not in request.files:
print("No file part")
return {"status": "no file"}
file_faiss = request.files["file_faiss"]
if file_faiss.filename == "":
return {"status": "no file name"}
if "file_pkl" not in request.files:
print("No file part")
return {"status": "no file"}
file_pkl = request.files["file_pkl"]
if file_pkl.filename == "":
return {"status": "no file name"}
# saves index files

if not os.path.exists(save_dir):
os.makedirs(save_dir)
file_faiss.save(os.path.join(save_dir, "index.faiss"))
file_pkl.save(os.path.join(save_dir, "index.pkl"))
# create entry in vectors_collection
vectors_collection.insert_one(
{
Expand Down
27 changes: 18 additions & 9 deletions application/api/user/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from application.api.user.tasks import ingest

from application.core.settings import settings
from application.vectorstore.vector_creator import VectorCreator

mongo = MongoClient(settings.MONGO_URI)
db = mongo["docsgpt"]
conversations_collection = db["conversations"]
Expand Down Expand Up @@ -90,10 +92,17 @@ def delete_old():
return {"status": "error"}
path_clean = "/".join(dirs)
vectors_collection.delete_one({"location": path})
try:
shutil.rmtree(path_clean)
except FileNotFoundError:
pass
if settings.VECTOR_STORE == "faiss":
try:
shutil.rmtree(os.path.join(current_dir, path_clean))

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression

This path depends on a user-provided value.
except FileNotFoundError:
pass
else:
vetorstore = VectorCreator.create_vectorstore(
settings.VECTOR_STORE, path=os.path.join(current_dir, path_clean)
)
vetorstore.delete_index()

return {"status": "ok"}

@user.route("/api/upload", methods=["POST"])
Expand Down Expand Up @@ -173,11 +182,11 @@ def combined_json():
"location": "local",
}
)

data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
for index in data_remote:
index["location"] = "remote"
data.append(index)
if settings.VECTOR_STORE == "faiss":
data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
for index in data_remote:
index["location"] = "remote"
data.append(index)

return jsonify(data)

Expand Down
8 changes: 8 additions & 0 deletions application/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class Settings(BaseSettings):
TOKENS_MAX_HISTORY: int = 150
SELF_HOSTED_MODEL: bool = False
UPLOAD_FOLDER: str = "inputs"
VECTOR_STORE: str = "elasticsearch" # "faiss" or "elasticsearch"

API_URL: str = "http://localhost:7091" # backend url for celery worker

Expand All @@ -23,6 +24,13 @@ class Settings(BaseSettings):
AZURE_DEPLOYMENT_NAME: str = None # azure deployment name for answering
AZURE_EMBEDDINGS_DEPLOYMENT_NAME: str = None # azure deployment name for embeddings

# elasticsearch
ELASTIC_CLOUD_ID: str = None # cloud id for elasticsearch
ELASTIC_USERNAME: str = None # username for elasticsearch
ELASTIC_PASSWORD: str = None # password for elasticsearch
ELASTIC_URL: str = None # url for elasticsearch
ELASTIC_INDEX: str = "docsgpt" # index name for elasticsearch


path = Path(__file__).parent.parent.absolute()
settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8")
28 changes: 20 additions & 8 deletions application/parser/open_ai_func.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os

import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from application.vectorstore.vector_creator import VectorCreator
from application.core.settings import settings
from retry import retry


Expand Down Expand Up @@ -33,12 +33,23 @@ def call_openai_api(docs, folder_name, task_status):
os.makedirs(f"{folder_name}")

from tqdm import tqdm
docs_test = [docs[0]]
docs.pop(0)
c1 = 0

store = FAISS.from_documents(docs_test, OpenAIEmbeddings(openai_api_key=os.getenv("EMBEDDINGS_KEY")))

if settings.VECTOR_STORE == "faiss":
docs_init = [docs[0]]
docs.pop(0)

store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
docs_init = docs_init,
path=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY")
)
else:
store = VectorCreator.create_vectorstore(
settings.VECTOR_STORE,
path=f"{folder_name}",
embeddings_key=os.getenv("EMBEDDINGS_KEY")
)
# Uncomment for MPNet embeddings
# model_name = "sentence-transformers/all-mpnet-base-v2"
# hf = HuggingFaceEmbeddings(model_name=model_name)
Expand All @@ -57,7 +68,8 @@ def call_openai_api(docs, folder_name, task_status):
store.save_local(f"{folder_name}")
break
c1 += 1
store.save_local(f"{folder_name}")
if settings.VECTOR_STORE == "faiss":
store.save_local(f"{folder_name}")


def get_user_permission(docs, folder_name):
Expand Down
1 change: 1 addition & 0 deletions application/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ decorator==5.1.1
dill==0.3.6
dnspython==2.3.0
ecdsa==0.18.0
elasticsearch==8.9.0
entrypoints==0.4
faiss-cpu==1.7.3
filelock==3.9.0
Expand Down
2 changes: 1 addition & 1 deletion application/vectorstore/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def search(self, *args, **kwargs):
def is_azure_configured(self):
return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME

def _get_docsearch(self, embeddings_name, embeddings_key=None):
def _get_embeddings(self, embeddings_name, embeddings_key=None):
embeddings_factory = {
"openai_text-embedding-ada-002": OpenAIEmbeddings,
"huggingface_sentence-transformers/all-mpnet-base-v2": HuggingFaceHubEmbeddings,
Expand Down
Loading