diff --git a/data/scraping_scripts/create_vector_stores.py b/data/scraping_scripts/create_vector_stores.py
index d282e6a..4504719 100644
--- a/data/scraping_scripts/create_vector_stores.py
+++ b/data/scraping_scripts/create_vector_stores.py
@@ -1,3 +1,29 @@
+"""
+Vector Store Creation Script
+
+Purpose:
+This script processes various data sources (e.g., transformers, peft, trl, llama_index, openai_cookbooks, langchain)
+to create vector stores using Chroma and LlamaIndex. It reads data from JSONL files, creates document embeddings,
+and stores them in persistent Chroma databases for efficient retrieval.
+
+Usage:
+python script_name.py ...
+
+Example:
+python script_name.py transformers peft llama_index
+
+The script accepts one or more source names as command-line arguments. Valid source names are:
+transformers, peft, trl, llama_index, openai_cookbooks, langchain
+
+For each specified source, the script will:
+1. Read data from the corresponding JSONL file
+2. Create document embeddings
+3. Store the embeddings in a Chroma vector database
+4. Save a dictionary of documents for future reference
+
+Note: Ensure that the input JSONL files are present in the 'data' directory.
+"""
+
 import argparse
 import json
 import os
@@ -27,6 +53,10 @@
         "input_file": "data/openai_cookbooks_data.jsonl",
         "db_name": "chroma-db-openai_cookbooks",
     },
+    "langchain": {
+        "input_file": "data/langchain_data.jsonl",
+        "db_name": "chroma-db-langchain",
+    },
 }
 
 
diff --git a/data/scraping_scripts/github_to_markdown_ai_docs.py b/data/scraping_scripts/github_to_markdown_ai_docs.py
index 0e46712..aabb39c 100644
--- a/data/scraping_scripts/github_to_markdown_ai_docs.py
+++ b/data/scraping_scripts/github_to_markdown_ai_docs.py
@@ -67,6 +67,11 @@
         "repo": "openai-cookbook",
         "path": "examples",
     },
+    "langchain": {
+        "owner": "langchain-ai",
+        "repo": "langchain",
+        "path": "docs/docs",
+    },
 }
 
 # GitHub Personal Access Token (replace with your own token)
diff --git a/data/scraping_scripts/process_md_files.py b/data/scraping_scripts/process_md_files.py
index 19b94e8..4bd9c20 100644
--- a/data/scraping_scripts/process_md_files.py
+++ b/data/scraping_scripts/process_md_files.py
@@ -110,6 +110,18 @@
         "included_root_files": [],
         "url_extension": ".ipynb",
     },
+    "langchain": {
+        "base_url": "https://python.langchain.com/v0.2/docs/",
+        "input_directory": "data/langchain_md_files",
+        "output_file": "data/langchain_data.jsonl",
+        "source_name": "langchain",
+        "use_include_list": True,
+        "included_dirs": ["how_to", "versions", "tutorials", "integrations"],
+        "excluded_dirs": [],
+        "excluded_root_files": [],
+        "included_root_files": ["security.md", "concepts.mdx", "introduction.mdx"],
+        "url_extension": "",
+    },
 }
 
 
diff --git a/scripts/main.py b/scripts/main.py
index 4864738..4ff6107 100644
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -10,6 +10,7 @@
     AVAILABLE_SOURCES,
     AVAILABLE_SOURCES_UI,
     CONCURRENCY_COUNT,
+    custom_retriever_langchain,
     custom_retriever_llama_index,
     custom_retriever_openai_cookbooks,
     custom_retriever_peft,
@@ -46,6 +47,11 @@ def update_query_engine_tools(selected_sources):
             "openai_cookbooks_info",
             """Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
         ),
+        "LangChain Docs": (
+            custom_retriever_langchain,
+            "langchain_info",
+            """Useful for questions asking about the LangChain framework. Covers the LangChain documentation, including info about building chains, agents, and tools, and using memory, prompts, callbacks, etc.""",
+        ),
     }
 
     for source in selected_sources:
diff --git a/scripts/setup.py b/scripts/setup.py
index fe7bbf0..5efa68d 100644
--- a/scripts/setup.py
+++ b/scripts/setup.py
@@ -77,6 +77,10 @@ def setup_database(db_collection, dict_file_name):
     "chroma-db-openai_cookbooks",
     "document_dict_openai_cookbooks.pkl",
 )
+custom_retriever_langchain = setup_database(
+    "chroma-db-langchain",
+    "document_dict_langchain.pkl",
+)
 
 # Constants
 CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
@@ -88,6 +92,7 @@ def setup_database(db_collection, dict_file_name):
     "TRL Docs",
     "LlamaIndex Docs",
     "OpenAI Cookbooks",
+    "LangChain Docs",
     # "Towards AI Blog",
     # "RAG Course",
 ]
@@ -98,6 +103,7 @@ def setup_database(db_collection, dict_file_name):
     "trl",
     "llama_index",
     "openai_cookbooks",
+    "langchain",
     # "towards_ai_blog",
     # "rag_course",
 ]
@@ -114,6 +120,7 @@ def setup_database(db_collection, dict_file_name):
     "custom_retriever_trl",
     "custom_retriever_llama_index",
     "custom_retriever_openai_cookbooks",
+    "custom_retriever_langchain",
     "CONCURRENCY_COUNT",
     "MONGODB_URI",
     "AVAILABLE_SOURCES_UI",
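For context, a minimal sketch of the processing the new "langchain" entry implies for create_vector_stores.py: read data/langchain_data.jsonl, embed the documents, persist them in the chroma-db-langchain collection, and pickle a document dictionary. The file and collection names are taken from the diff above; the JSONL field names, the storage path, the default embedding backend, and the exact LlamaIndex/Chroma calls are assumptions for illustration, not the script's actual implementation.

```python
# Illustrative sketch only -- not the repo's implementation.
import json
import pickle

import chromadb
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# File/collection names come from the diff; the JSONL field names ("content",
# "url") and the storage path are assumptions.
documents = []
with open("data/langchain_data.jsonl") as f:
    for line in f:
        record = json.loads(line)
        documents.append(
            Document(text=record["content"], metadata={"url": record.get("url", "")})
        )

# Persistent Chroma collection backing the retriever.
client = chromadb.PersistentClient(path="data/chroma-db-langchain")
collection = client.get_or_create_collection("langchain")
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Embeds the documents (requires an embedding backend, e.g. OPENAI_API_KEY
# for LlamaIndex's default) and writes the vectors into the Chroma collection.
VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# setup.py later loads a pickled document dictionary alongside the database.
with open("document_dict_langchain.pkl", "wb") as f:
    pickle.dump({doc.doc_id: doc for doc in documents}, f)
```

Once the JSONL file and the Chroma database exist, the new setup_database("chroma-db-langchain", "document_dict_langchain.pkl") call in scripts/setup.py wires the source into the UI and retriever lists shown in the diff.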