forked from microsoft/generative-ai-for-beginners
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request microsoft#300 from carlotta94c/openai-support-ch8
Adding non-Azure Openai endpoints support to ch8
- Loading branch information
Showing
10 changed files
with
327 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
106 changes: 106 additions & 0 deletions
106
08-building-search-applications/python/oai-assignment.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%pip install openai python-dotenv" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"from openai import OpenAI\n", | ||
"from dotenv import load_dotenv\n", | ||
"import numpy as np\n", | ||
"load_dotenv()\n", | ||
"\n", | ||
"API_KEY = os.getenv(\"OPENAI_API_KEY\",\"\")\n", | ||
"assert API_KEY, \"ERROR: OpenAI Key is missing\"\n", | ||
"\n", | ||
"client = OpenAI(\n", | ||
" api_key=API_KEY\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Dependencies for embeddings_utils\n", | ||
"%pip install matplotlib plotly scikit-learn pandas" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def cosine_similarity(a, b):\n", | ||
" return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"text = 'the quick brown fox jumped over the lazy dog'\n", | ||
"model = 'text-embedding-ada-002'\n", | ||
"\n", | ||
"client.embeddings.create(input = [text], model=model).data[0].embedding" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# compare several words\n", | ||
"automobile_embedding = client.embeddings.create(input = 'automobile', model=model).data[0].embedding\n", | ||
"vehicle_embedding = client.embeddings.create(input = 'vehicle', model=model).data[0].embedding\n", | ||
"dinosaur_embedding = client.embeddings.create(input = 'dinosaur', model=model).data[0].embedding\n", | ||
"stick_embedding = client.embeddings.create(input = 'stick', model=model).data[0].embedding\n", | ||
"\n", | ||
"# Compare cosine similarities: automobile vs. automobile should be 1.0 (i.e. exactly the same), while automobile vs. dinosaur should be lower than 1.0 (i.e. not the same)\n", | ||
"print(cosine_similarity(automobile_embedding, automobile_embedding))\n", | ||
"print(cosine_similarity(automobile_embedding, vehicle_embedding))\n", | ||
"print(cosine_similarity(automobile_embedding, dinosaur_embedding))\n", | ||
"print(cosine_similarity(automobile_embedding, stick_embedding))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.13" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
200 changes: 200 additions & 0 deletions
200
08-building-search-applications/python/oai-solution.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"To run the following notebook, you first need to set your OpenAI API key in the `.env` file as `OPENAI_API_KEY`, if you haven't done so already." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"from openai import OpenAI\n", | ||
"from dotenv import load_dotenv\n", | ||
"\n", | ||
"load_dotenv()\n", | ||
"\n", | ||
"API_KEY = os.getenv(\"OPENAI_API_KEY\",\"\")\n", | ||
"assert API_KEY, \"ERROR: OpenAI Key is missing\"\n", | ||
"\n", | ||
"client = OpenAI(\n", | ||
" api_key=API_KEY\n", | ||
" )\n", | ||
"\n", | ||
"model = 'text-embedding-ada-002'\n", | ||
"\n", | ||
"SIMILARITIES_RESULTS_THRESHOLD = 0.75\n", | ||
"DATASET_NAME = \"../embedding_index_3m.json\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Next, we are going to load the Embedding Index into a Pandas Dataframe. The Embedding Index is stored in a JSON file called `embedding_index_3m.json`. The Embedding Index contains the Embeddings for each of the YouTube transcripts up until late Oct 2023." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def load_dataset(source: str) -> pd.core.frame.DataFrame:\n", | ||
" # Load the video session index\n", | ||
" pd_vectors = pd.read_json(source)\n", | ||
" return pd_vectors.drop(columns=[\"text\"], errors=\"ignore\").fillna(\"\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Next, we are going to create a function called `get_videos` that will search the Embedding Index for the query. The function will return the top 5 videos that are most similar to the query. The function works as follows:\n", | ||
"\n", | ||
"1. First, a copy of the Embedding Index is created.\n", | ||
"2. Next, the Embedding for the query is calculated using the OpenAI Embedding API.\n", | ||
"3. Then a new column is created in the Embedding Index called `similarity`. The `similarity` column contains the cosine similarity between the query Embedding and the Embedding for each video segment.\n", | ||
"4. Next, the Embedding Index is filtered by the `similarity` column. The Embedding Index is filtered to only include videos that have a cosine similarity greater than or equal to 0.75.\n", | ||
"5. Finally, the Embedding Index is sorted by the `similarity` column and the top 5 videos are returned." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def cosine_similarity(a, b):\n", | ||
" return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))\n", | ||
"\n", | ||
"def get_videos(\n", | ||
" query: str, dataset: pd.core.frame.DataFrame, rows: int\n", | ||
") -> pd.core.frame.DataFrame:\n", | ||
" # create a copy of the dataset\n", | ||
" video_vectors = dataset.copy()\n", | ||
"\n", | ||
" # get the embeddings for the query \n", | ||
" query_embeddings = client.embeddings.create(input=query, model=model).data[0].embedding\n", | ||
"\n", | ||
" # create a new column with the calculated similarity for each row\n", | ||
" video_vectors[\"similarity\"] = video_vectors[\"ada_v2\"].apply(\n", | ||
" lambda x: cosine_similarity(np.array(query_embeddings), np.array(x))\n", | ||
" )\n", | ||
"\n", | ||
" # filter the videos by similarity\n", | ||
" mask = video_vectors[\"similarity\"] >= SIMILARITIES_RESULTS_THRESHOLD\n", | ||
" video_vectors = video_vectors[mask].copy()\n", | ||
"\n", | ||
" # sort the videos by similarity\n", | ||
" video_vectors = video_vectors.sort_values(by=\"similarity\", ascending=False).head(\n", | ||
" rows\n", | ||
" )\n", | ||
"\n", | ||
" # return the top rows\n", | ||
" return video_vectors.head(rows)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The `display_results` function below is very simple: it just prints out the results of the search query." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def display_results(videos: pd.core.frame.DataFrame, query: str):\n", | ||
" def _gen_yt_url(video_id: str, seconds: int) -> str:\n", | ||
" \"\"\"build a YouTube URL that opens the video at the given offset in seconds\"\"\"\n", | ||
" return f\"https://youtu.be/{video_id}?t={seconds}\"\n", | ||
"\n", | ||
" print(f\"\\nVideos similar to '{query}':\")\n", | ||
" for _, row in videos.iterrows():\n", | ||
" youtube_url = _gen_yt_url(row[\"videoId\"], row[\"seconds\"])\n", | ||
" print(f\" - {row['title']}\")\n", | ||
" print(f\" Summary: {' '.join(row['summary'].split()[:15])}...\")\n", | ||
" print(f\" YouTube: {youtube_url}\")\n", | ||
" print(f\" Similarity: {row['similarity']}\")\n", | ||
" print(f\" Speakers: {row['speaker']}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"1. First, the Embedding Index is loaded into a Pandas Dataframe.\n", | ||
"2. Next, the user is prompted to enter a query.\n", | ||
"3. Then the `get_videos` function is called to search the Embedding Index for the query.\n", | ||
"4. Finally, the `display_results` function is called to display the results to the user.\n", | ||
"5. The user is then prompted to enter another query. This process continues until the user enters `exit`.\n", | ||
"\n", | ||
"![](media/notebook_search.png)\n", | ||
"\n", | ||
"You will be prompted to enter a query. Enter a query and press enter. The application will return a list of videos that are relevant to the query. The application will also return a link to the place in the video where the answer to the question is located.\n", | ||
"\n", | ||
"Here are some queries to try out:\n", | ||
"\n", | ||
"- What is Azure Machine Learning?\n", | ||
"- How do convolutional neural networks work?\n", | ||
"- What is a neural network?\n", | ||
"- Can I use Jupyter Notebooks with Azure Machine Learning?\n", | ||
"- What is ONNX?" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd_vectors = load_dataset(DATASET_NAME)\n", | ||
"\n", | ||
"# get user query from input\n", | ||
"while True:\n", | ||
" query = input(\"Enter a query: \")\n", | ||
" if query == \"exit\":\n", | ||
" break\n", | ||
" videos = get_videos(query, pd_vectors, 5)\n", | ||
" display_results(videos, query)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.