diff --git a/.gitignore b/.gitignore index 168d306..d0a939b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +code/notebook/.ipynb_checkpoints/ +code/__pycache__/ +code/.ipynb_checkpoints/ +.virtual_documents/ +.gpt_config.env .DS_Store manuscript/ # lambda-pipeline diff --git a/code/notebook/.ipynb_checkpoints/pinecone_retrieval-checkpoint.ipynb b/code/notebook/.ipynb_checkpoints/pinecone_retrieval-checkpoint.ipynb deleted file mode 100644 index 49550d2..0000000 --- a/code/notebook/.ipynb_checkpoints/pinecone_retrieval-checkpoint.ipynb +++ /dev/null @@ -1,191 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 14, - "id": "d9a114f4-c03a-49bf-a519-89db2c5bb3f1", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import Settings\n", - "from llama_index.embeddings.openai import OpenAIEmbedding\n", - "\n", - "from pinecone import Pinecone\n", - "from dotenv import load_dotenv, find_dotenv\n", - "import os\n", - "\n", - "import sys\n", - "sys.path.insert(0, '..')\n", - "from utility import get_GPT_response, client\n", - "\n", - "import json\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "2dc22ebd-63d5-4b4c-a2dc-eabb627dae82", - "metadata": {}, - "outputs": [], - "source": [ - "load_dotenv(os.path.join(os.path.expanduser('~'), '.openai_config.env'))\n", - "api_key = os.environ.get('HACKATHON_API_KEY')\n", - "\n", - "load_dotenv(os.path.join(os.path.expanduser('~'), '.pinecone_config.env'))\n", - "pinecone_api_key = os.environ.get('ANDREW_API_KEY')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a1db1e41-63a4-4b80-9e72-ccdef603d73f", - "metadata": {}, - "outputs": [], - "source": [ - "embedding_model = \"text-embedding-ada-002\"\n", - "\n", - "embed_model = OpenAIEmbedding(\n", - " model=embedding_model,\n", - " api_key=api_key,\n", - ")\n", - "\n", - "\n", - "Settings.embed_model = embed_model\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "cadb5fb8-2ed1-4cbe-a124-c0dfba19594b", - "metadata": {}, - "outputs": [], - "source": [ - "pc = Pinecone(api_key=pinecone_api_key)\n", - "\n", - "index_name = \"eds\"\n", - "\n", - "pinecone_index = pc.Index(index_name)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "ba3dd1db-8385-4e36-abcc-79aae7cafcd0", - "metadata": {}, - "outputs": [], - "source": [ - "query = \"what drugs should be avoided by eds patients?\"\n", - "top_k = 10\n", - "query_embedding = embed_model.get_text_embedding(\n", - " query\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "2988ef50-0906-40f0-9228-ba56e671f22e", - "metadata": {}, - "outputs": [], - "source": [ - "retrieved_doc = pinecone_index.query(\n", - " vector=query_embedding, \n", - " top_k=top_k, \n", - " include_metadata=True\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "cd338c5f-bc65-4131-9fa8-82934bec0811", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "extracted_context_summary = list(map(lambda x:json.loads(x.metadata['_node_content'])['metadata']['section_summary'], retrieved_doc.matches))\n", - "provenance = list(map(lambda x:x.metadata['c_document_id'], retrieved_doc.matches))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "2088a355-561c-4262-86eb-cdac2fe07978", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "context = ''\n", - "for i in range(top_k):\n", - " context += extracted_context_summary[i] + '(Ref: ' + provenance[i] + '. '\n" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "715753c0-75c9-42ed-8c08-b693d626c45c", - "metadata": {}, - "outputs": [], - "source": [ - "system_prompt = '''\n", - "You are an expert in the rare disease Ehlers-Danlos syndrome (EDS).\n", - "You are supposed to answer the question asked by the user. \n", - "For this, you need to check the given context and then answer the question by grounding your response on the provided context.\n", - "If you are using the provided context, always make sure to provide the citation. You can find the citation in the context marked as 'Ref' \n", - "'''\n", - "\n", - "prompt = 'Context: ' + context + 'Query: ' + query\n", - "\n", - "response = get_GPT_response(prompt, system_prompt, os.environ.get('MODEL_NAME'), temperature=0.3)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "92b21cf3-c01b-481a-9f2b-7499223a870c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Based on the provided context, individuals with Ehlers-Danlos Syndrome (EDS) should avoid long-term opioid use for pain management due to the associated risks and complications, such as tolerance, hyperalgesia, potential overdose, and the risks of opioid use disorder (Ref: 10.7759/cureus.45713). It is recommended to opt for alternative pain management strategies, including non-opioid medications like NSAIDs, acetaminophen, anticonvulsants, and antidepressants for long-term relief and improved function (Ref: 10.7759/cureus.45713). Additionally, caution is advised when prescribing opioids to EDS patients due to connective tissue vulnerabilities and potential complications, emphasizing the importance of personalized tapering plans for patients on long-term opioids to ensure their safety and well-being (Ref: 10.7759/cureus.45713).\n" - ] - } - ], - "source": [ - "print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9ef68ce-e3c7-450d-9aa1-453a33af73b9", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/code/notebook/pinecone_retrieval.ipynb b/code/notebook/pinecone_retrieval.ipynb index 149c7a8..ccebd20 100644 --- a/code/notebook/pinecone_retrieval.ipynb +++ b/code/notebook/pinecone_retrieval.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 51, + "execution_count": 1, "id": "d9a114f4-c03a-49bf-a519-89db2c5bb3f1", "metadata": {}, "outputs": [], @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 2, "id": "2dc22ebd-63d5-4b4c-a2dc-eabb627dae82", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 3, "id": "a1db1e41-63a4-4b80-9e72-ccdef603d73f", "metadata": {}, "outputs": [], @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 4, "id": "cadb5fb8-2ed1-4cbe-a124-c0dfba19594b", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 5, "id": "ba3dd1db-8385-4e36-abcc-79aae7cafcd0", "metadata": {}, "outputs": [], @@ -83,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 6, "id": "2988ef50-0906-40f0-9228-ba56e671f22e", "metadata": {}, "outputs": [], diff --git a/lambda-pipeline/app.py b/lambda-pipeline/app.py index f3b3019..7fa643d 100644 --- a/lambda-pipeline/app.py +++ b/lambda-pipeline/app.py @@ -32,7 +32,7 @@ def get_model_response(input_text, temperature: float = 0.7): logging.error(f"Failed to get model response: {str(e)}") return None -def get_rag_context(query, top_k=5): +def get_rag_context(query, top_k=2): try: embed_model = OpenAIEmbedding( model='text-embedding-ada-002', @@ -47,7 +47,8 @@ def get_rag_context(query, top_k=5): top_k=top_k, include_metadata=True ) - extracted_context_summary = list(map(lambda x: json.loads(x.metadata['_node_content'])['metadata']['section_summary'], retrieved_doc.matches)) + # extracted_context_summary = list(map(lambda x: json.loads(x.metadata['_node_content'])['metadata']['section_summary'], retrieved_doc.matches)) + extracted_context_summary = list(map(lambda x: json.loads(x.metadata['_node_content'])['metadata']['text'], retrieved_doc.matches)) provenance = list(map(lambda x: x.metadata['c_document_id'], retrieved_doc.matches)) context = '' for i in range(top_k):