Skip to content

Commit

Permalink
add langchain documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
omar-sol committed Jul 28, 2024
1 parent 139a897 commit 680fe32
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 0 deletions.
30 changes: 30 additions & 0 deletions data/scraping_scripts/create_vector_stores.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,29 @@
"""
Vector Store Creation Script
Purpose:
This script processes various data sources (e.g., transformers, peft, trl, llama_index, openai_cookbooks, langchain)
to create vector stores using Chroma and LlamaIndex. It reads data from JSONL files, creates document embeddings,
and stores them in persistent Chroma databases for efficient retrieval.
Usage:
python data/scraping_scripts/create_vector_stores.py <source1> <source2> ...
Example:
python data/scraping_scripts/create_vector_stores.py transformers peft llama_index
The script accepts one or more source names as command-line arguments. Valid source names are:
transformers, peft, trl, llama_index, openai_cookbooks, langchain
For each specified source, the script will:
1. Read data from the corresponding JSONL file
2. Create document embeddings
3. Store the embeddings in a Chroma vector database
4. Save a dictionary of documents for future reference
Note: Ensure that the input JSONL files are present in the 'data' directory.
"""

import argparse
import json
import os
Expand Down Expand Up @@ -27,6 +53,10 @@
"input_file": "data/openai_cookbooks_data.jsonl",
"db_name": "chroma-db-openai_cookbooks",
},
"langchain": {
"input_file": "data/langchain_data.jsonl",
"db_name": "chroma-db-langchain",
},
}


Expand Down
5 changes: 5 additions & 0 deletions data/scraping_scripts/github_to_markdown_ai_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@
"repo": "openai-cookbook",
"path": "examples",
},
"langchain": {
"owner": "langchain-ai",
"repo": "langchain",
"path": "docs/docs",
},
}

# GitHub Personal Access Token (replace with your own token)
Expand Down
12 changes: 12 additions & 0 deletions data/scraping_scripts/process_md_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,18 @@
"included_root_files": [],
"url_extension": ".ipynb",
},
"langchain": {
"base_url": "https://python.langchain.com/v0.2/docs/",
"input_directory": "data/langchain_md_files",
"output_file": "data/langchain_data.jsonl",
"source_name": "langchain",
"use_include_list": True,
"included_dirs": ["how_to", "versions", "tutorials", "integrations"],
"excluded_dirs": [],
"excluded_root_files": [],
"included_root_files": ["security.md", "concepts.mdx", "introduction.mdx"],
"url_extension": "",
},
}


Expand Down
6 changes: 6 additions & 0 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
AVAILABLE_SOURCES,
AVAILABLE_SOURCES_UI,
CONCURRENCY_COUNT,
custom_retriever_langchain,
custom_retriever_llama_index,
custom_retriever_openai_cookbooks,
custom_retriever_peft,
Expand Down Expand Up @@ -46,6 +47,11 @@ def update_query_engine_tools(selected_sources):
"openai_cookbooks_info",
"""Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
),
"LangChain Docs": (
custom_retriever_langchain,
"langchain_info",
"""Useful for questions asking about the LangChain framework. It is the documentation of the LangChain framework, includes info about building chains, agents, and tools, using memory, prompts, callbacks, etc.""",
),
}

for source in selected_sources:
Expand Down
7 changes: 7 additions & 0 deletions scripts/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ def setup_database(db_collection, dict_file_name):
"chroma-db-openai_cookbooks",
"document_dict_openai_cookbooks.pkl",
)
custom_retriever_langchain = setup_database(
"chroma-db-langchain",
"document_dict_langchain.pkl",
)

# Constants
CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
Expand All @@ -88,6 +92,7 @@ def setup_database(db_collection, dict_file_name):
"TRL Docs",
"LlamaIndex Docs",
"OpenAI Cookbooks",
"LangChain Docs",
# "Towards AI Blog",
# "RAG Course",
]
Expand All @@ -98,6 +103,7 @@ def setup_database(db_collection, dict_file_name):
"trl",
"llama_index",
"openai_cookbooks",
"langchain",
# "towards_ai_blog",
# "rag_course",
]
Expand All @@ -114,6 +120,7 @@ def setup_database(db_collection, dict_file_name):
"custom_retriever_trl",
"custom_retriever_llama_index",
"custom_retriever_openai_cookbooks",
"custom_retriever_langchain",
"CONCURRENCY_COUNT",
"MONGODB_URI",
"AVAILABLE_SOURCES_UI",
Expand Down

0 comments on commit 680fe32

Please sign in to comment.