-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
111 lines (97 loc) · 4.33 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from flask import Flask, render_template, request, jsonify
import textwrap
import numpy as np
import pandas as pd
import google.generativeai as genai
from sklearn.metrics.pairwise import cosine_similarity
import json
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
app = Flask(__name__)
# Configuring API
api_key = os.getenv("API_KEY")
if not api_key:
raise ValueError("API key is not set in environment variables")
genai.configure(api_key=api_key)
# Load Documents from JSON
def load_documents(file_path):
"""Load documents from a JSON file."""
try:
with open(file_path, 'r') as file:
documents = json.load(file)
return documents
except FileNotFoundError:
raise FileNotFoundError(f"The file {file_path} does not exist")
except json.JSONDecodeError:
raise ValueError(f"The file {file_path} is not a valid JSON file")
# Data Indexing
def chunk_document(content, chunk_size=500):
"""Break down large documents into smaller chunks for better indexing and retrieval."""
words = content.split()
chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
return chunks
# Embedding Techniques
def embed_content(title, content, model='models/text-embedding-004'):
"""Embed content using a fine-tuned model for better semantic understanding."""
try:
return genai.embed_content(model=model, content=content, task_type='retrieval_document', title=title)['embedding']
except Exception as e:
raise RuntimeError(f"Error embedding content: {e}")
# Retrieval Algorithms
def find_best_score(query, dataframe, model='models/text-embedding-004'):
"""Compute the distances between the query and each document using cosine similarity for more accurate retrieval."""
try:
query_embedding = genai.embed_content(model=model, content=query, task_type="retrieval_query")['embedding']
similarities = cosine_similarity(np.stack(dataframe['Embeddings']), [query_embedding])
index = np.argmax(similarities)
return dataframe.iloc[index]['Content']
except Exception as e:
raise RuntimeError(f"Error finding best score: {e}")
# Contextual Understanding
def make_prompt(query, relevant_passage):
"""Generate a prompt for the generative model to ensure comprehensive and user-friendly responses."""
escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
strike a friendly and conversational tone. \
If the passage is irrelevant to the answer, tell that the answer is not in documents and just keep it the answer very brief.
QUESTION: '{query}'
PASSAGE: '{relevant_passage}'
ANSWER:
""").format(query=query, relevant_passage=escaped)
return prompt
# Load and Prepare Documents
try:
DOCUMENTS = load_documents('documents.json')
chunked_documents = []
for doc in DOCUMENTS:
chunks = chunk_document(doc['content'])
for chunk in chunks:
chunked_documents.append({"title": doc['title'], "content": chunk})
# Embed Documents
df = pd.DataFrame(chunked_documents)
df.columns = ['Title', 'Content']
df['Embeddings'] = df.apply(lambda x: embed_content(x['Title'], x['Content']), axis=1)
except Exception as e:
raise RuntimeError(f"Error loading and preparing documents: {e}")
@app.route('/')
def index():
"""Render the main HTML page."""
return render_template('index.html')
@app.route('/query', methods=['POST'])
def query():
"""Handle POST requests to process user queries and return responses."""
try:
user_query = request.form['query']
response = find_best_score(user_query, df)
prompt = make_prompt(user_query, response)
model = genai.GenerativeModel('gemini-1.5-pro-latest')
answer = model.generate_content(prompt)
return jsonify(answer=answer.text)
except Exception as e:
return jsonify(error=str(e)), 500
if __name__ == '__main__':
app.run(debug=True)