diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..edfbfbfc --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +*.egg-info +dickens/ +book.txt +lightrag-dev/ +.idea/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..db531bb6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: requirements-txt-fixer + + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.4 + hooks: + - id: ruff-format + - id: ruff + args: [--fix] + + + - repo: https://github.com/mgedmin/check-manifest + rev: "0.49" + hooks: + - id: check-manifest + stages: [manual] diff --git a/README.md b/README.md index 86d82b1d..c8d6e312 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,35 @@ -# LightRAG: Simple and Fast Retrieval-Augmented Generation -![请添加图片描述](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg) +

🚀 LightRAG: Simple and Fast Retrieval-Augmented Generation

+![LightRAG](https://i-blog.csdnimg.cn/direct/567139f1a36e4564abc63ce5c12b6271.jpeg)
+
This repository hosts the code of LightRAG. The structure of this code is based on [nano-graphrag](https://github.com/gusye1234/nano-graphrag). ![请添加图片描述](https://i-blog.csdnimg.cn/direct/b2aaf634151b4706892693ffb43d9093.png) +
+
+## 🎉 News
+- [x] [2024.10.18]🎯🎯📢📢We’ve added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author!
+- [x] [2024.10.17]🎯🎯📢📢We have created a [Discord channel](https://discord.gg/mvsfu2Tg)! Welcome to join for sharing and discussions! 🎉🎉
+- [x] [2024.10.16]🎯🎯📢📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
+- [x] [2024.10.15]🎯🎯📢📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
+
 ## Install
-* Install from source
+* Install from source (Recommended)

 ```bash
 cd LightRAG
@@ -22,18 +41,37 @@ pip install lightrag-hku
 ```
 ## Quick Start
-
-* Set OpenAI API key in environment: `export OPENAI_API_KEY="sk-...".`
-* Download the demo text "A Christmas Carol by Charles Dickens"
+* All the code can be found in the `examples` directory.
+* Set OpenAI API key in environment if using OpenAI models: `export OPENAI_API_KEY="sk-..."`.
+* Download the demo text "A Christmas Carol by Charles Dickens":
 ```bash
 curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt
 ```
-Use the below python snippet:
+Use the below Python snippet (in a script) to initialize LightRAG and perform queries:

 ```python
+import os
+
 from lightrag import LightRAG, QueryParam
+from lightrag.llm import gpt_4o_mini_complete, gpt_4o_complete

-rag = LightRAG(working_dir="./dickens")
+#########
+# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
+# import nest_asyncio
+# nest_asyncio.apply()
+#########
+
+WORKING_DIR = "./dickens"
+
+if not os.path.exists(WORKING_DIR):
+    os.mkdir(WORKING_DIR)
+
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=gpt_4o_mini_complete  # Use gpt_4o_mini_complete LLM model
+    # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
+)

 with open("./book.txt") as f:
     rag.insert(f.read())
@@ -47,27 +85,169 @@ print(rag.query("What are the top themes in this story?", param=QueryParam(mode=
 # Perform global search
 print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global")))

-# Perform hybird search
-print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybird")))
+# Perform hybrid search
+print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")))
 ```
-Batch Insert
+
+
+ Using OpenAI-like APIs
+
+* LightRAG also supports OpenAI-like chat/embedding APIs:
+```python
+import os
+
+import numpy as np
+
+from lightrag.llm import openai_complete_if_cache, openai_embedding
+from lightrag.utils import EmbeddingFunc
+
+async def llm_model_func(
+    prompt, system_prompt=None, history_messages=[], **kwargs
+) -> str:
+    return await openai_complete_if_cache(
+        "solar-mini",
+        prompt,
+        system_prompt=system_prompt,
+        history_messages=history_messages,
+        api_key=os.getenv("UPSTAGE_API_KEY"),
+        base_url="https://api.upstage.ai/v1/solar",
+        **kwargs
+    )
+
+async def embedding_func(texts: list[str]) -> np.ndarray:
+    return await openai_embedding(
+        texts,
+        model="solar-embedding-1-large-query",
+        api_key=os.getenv("UPSTAGE_API_KEY"),
+        base_url="https://api.upstage.ai/v1/solar"
+    )
+
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=llm_model_func,
+    embedding_func=EmbeddingFunc(
+        embedding_dim=4096,
+        max_token_size=8192,
+        func=embedding_func
+    )
+)
+```
+
+ +
+ Using Hugging Face Models
+
+* If you want to use Hugging Face models, you only need to configure LightRAG as follows:
+```python
+from lightrag.llm import hf_model_complete, hf_embedding
+from lightrag.utils import EmbeddingFunc
+from transformers import AutoModel, AutoTokenizer
+
+# Initialize LightRAG with Hugging Face model
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=hf_model_complete,  # Use Hugging Face model for text generation
+    llm_model_name='meta-llama/Llama-3.1-8B-Instruct',  # Model name from Hugging Face
+    # Use Hugging Face embedding function
+    embedding_func=EmbeddingFunc(
+        embedding_dim=384,
+        max_token_size=5000,
+        func=lambda texts: hf_embedding(
+            texts,
+            tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
+            embed_model=AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+        )
+    ),
+)
+```
+
+ +
+ Using Ollama Models
+
+* If you want to use Ollama models, you only need to configure LightRAG as follows:
+
+```python
+from lightrag.llm import ollama_model_complete, ollama_embedding
+from lightrag.utils import EmbeddingFunc
+
+# Initialize LightRAG with Ollama model
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=ollama_model_complete,  # Use Ollama model for text generation
+    llm_model_name='your_model_name',  # Your model name
+    # Use Ollama embedding function
+    embedding_func=EmbeddingFunc(
+        embedding_dim=768,
+        max_token_size=8192,
+        func=lambda texts: ollama_embedding(
+            texts,
+            embed_model="nomic-embed-text"
+        )
+    ),
+)
+```
+
+* Increasing the `num_ctx` parameter:
+
+1. Pull the model:
+```bash
+ollama pull qwen2
+```
+
+2. Display the model file:
+```bash
+ollama show --modelfile qwen2 > Modelfile
+```
+
+3. Edit the Modelfile by adding the following line:
+```
+PARAMETER num_ctx 32768
+```
+
+4. Create the modified model:
+```bash
+ollama create -f Modelfile qwen2m
+```
+
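+You can then point LightRAG at the modified model. The snippet below is a minimal sketch; it assumes the `qwen2m` model created in step 4 and reuses the embedding setup from the configuration above:
+
+```python
+rag = LightRAG(
+    working_dir=WORKING_DIR,
+    llm_model_func=ollama_model_complete,
+    llm_model_name='qwen2m',  # the model rebuilt above with a larger num_ctx
+    embedding_func=EmbeddingFunc(
+        embedding_dim=768,
+        max_token_size=8192,
+        func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text")
+    ),
+)
+```
+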
+
+### Batch Insert
 ```python
+# Batch Insert: Insert multiple texts at once
 rag.insert(["TEXT1", "TEXT2",...])
 ```
-Incremental Insert
+
+### Incremental Insert
 ```python
+# Incremental Insert: Insert new documents into an existing LightRAG instance
 rag = LightRAG(working_dir="./dickens")

 with open("./newText.txt") as f:
     rag.insert(f.read())
 ```
+
+### Graph Visualization
+
+* Generate an HTML file with the knowledge graph:
+```python
+import networkx as nx
+from pyvis.network import Network
+
+# Load the GraphML file
+G = nx.read_graphml('./dickens/graph_chunk_entity_relation.graphml')
+
+# Create a Pyvis network
+net = Network(notebook=True)
+
+# Convert NetworkX graph to Pyvis network
+net.from_nx(G)
+
+# Save and display the network
+net.show('knowledge_graph.html')
+```

 ## Evaluation
 ### Dataset
-The dataset used in LightRAG can be download from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain).
+The dataset used in LightRAG can be downloaded from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain).

 ### Generate Query
-LightRAG uses the following prompt to generate high-level queries, with the corresponding code located in `example/generate_query.py`.
+LightRAG uses the following prompt to generate high-level queries, with the corresponding code in `examples/generate_query.py`.
+
+
+ Prompt + ```python Given the following description of a dataset: @@ -91,14 +271,19 @@ Output the results in the following structure: - User 5: [user description] ... ``` - +
+ ### Batch Eval To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`. + +
+ Prompt + ```python ---Role--- You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. ---Goal--- -You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. +You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? @@ -136,7 +321,8 @@ Output your evaluation in the following JSON format: }} }} ``` -### Overall Performance Table +
+ ### Overall Performance Table | | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | |----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| @@ -161,6 +347,137 @@ Output your evaluation in the following JSON format: | **Empowerment** | 36.69% | **63.31%** | 45.09% | **54.91%** | 42.81% | **57.19%** | **52.94%** | 47.06% | | **Overall** | 43.62% | **56.38%** | 45.98% | **54.02%** | 45.70% | **54.30%** | **51.86%** | 48.14% | +## Reproduce +All the code can be found in the `./reproduce` directory. + +### Step-0 Extract Unique Contexts +First, we need to extract unique contexts in the datasets. + +
+ Code + +```python +def extract_unique_contexts(input_directory, output_directory): + + os.makedirs(output_directory, exist_ok=True) + + jsonl_files = glob.glob(os.path.join(input_directory, '*.jsonl')) + print(f"Found {len(jsonl_files)} JSONL files.") + + for file_path in jsonl_files: + filename = os.path.basename(file_path) + name, ext = os.path.splitext(filename) + output_filename = f"{name}_unique_contexts.json" + output_path = os.path.join(output_directory, output_filename) + + unique_contexts_dict = {} + + print(f"Processing file: {filename}") + + try: + with open(file_path, 'r', encoding='utf-8') as infile: + for line_number, line in enumerate(infile, start=1): + line = line.strip() + if not line: + continue + try: + json_obj = json.loads(line) + context = json_obj.get('context') + if context and context not in unique_contexts_dict: + unique_contexts_dict[context] = None + except json.JSONDecodeError as e: + print(f"JSON decoding error in file {filename} at line {line_number}: {e}") + except FileNotFoundError: + print(f"File not found: {filename}") + continue + except Exception as e: + print(f"An error occurred while processing file {filename}: {e}") + continue + + unique_contexts_list = list(unique_contexts_dict.keys()) + print(f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}.") + + try: + with open(output_path, 'w', encoding='utf-8') as outfile: + json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4) + print(f"Unique `context` entries have been saved to: {output_filename}") + except Exception as e: + print(f"An error occurred while saving to the file {output_filename}: {e}") + + print("All files have been processed.") + +``` +
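+
+A sketch of how this step might be invoked; the directory names are placeholders, so point them at wherever the UltraDomain `.jsonl` files are stored:
+
+```python
+extract_unique_contexts("./datasets", "./datasets/unique_contexts")
+```
+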
+ +### Step-1 Insert Contexts +For the extracted contexts, we insert them into the LightRAG system. + +
+ Code + +```python +def insert_text(rag, file_path): + with open(file_path, mode='r') as f: + unique_contexts = json.load(f) + + retries = 0 + max_retries = 3 + while retries < max_retries: + try: + rag.insert(unique_contexts) + break + except Exception as e: + retries += 1 + print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}") + time.sleep(10) + if retries == max_retries: + print("Insertion failed after exceeding the maximum number of retries") +``` +
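+
+A sketch of how this helper might be called for a single dataset; the working directory and file path are placeholders:
+
+```python
+rag = LightRAG(working_dir="./agriculture")
+insert_text(rag, "./datasets/unique_contexts/agriculture_unique_contexts.json")
+```
+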
+ +### Step-2 Generate Queries + +We extract tokens from the first and the second half of each context in the dataset, then combine them as dataset descriptions to generate queries. + +
+ Code
+
+```python
+from transformers import GPT2Tokenizer
+
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+
+def get_summary(context, tot_tokens=2000):
+    tokens = tokenizer.tokenize(context)
+    half_tokens = tot_tokens // 2
+
+    start_tokens = tokens[1000:1000 + half_tokens]
+    end_tokens = tokens[-(1000 + half_tokens):-1000]
+
+    summary_tokens = start_tokens + end_tokens
+    summary = tokenizer.convert_tokens_to_string(summary_tokens)
+
+    return summary
+```
+
+### Step-3 Query
+For the queries generated in Step-2, we extract them and use them to query LightRAG (a sketch of the query loop follows the extraction code below).
+
+
+ Code + +```python +def extract_queries(file_path): + with open(file_path, 'r') as f: + data = f.read() + + data = data.replace('**', '') + + queries = re.findall(r'- Question \d+: (.+)', data) + + return queries +``` +
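+
+A sketch of running the extracted queries against LightRAG and collecting the answers; the file paths and query mode are placeholders, and `json` is assumed to be imported as in the earlier steps:
+
+```python
+queries = extract_queries("./queries.txt")
+
+results = []
+for query in queries:
+    results.append({"query": query, "result": rag.query(query, param=QueryParam(mode="hybrid"))})
+
+with open("./result.json", "w", encoding="utf-8") as f:
+    json.dump(results, f, ensure_ascii=False, indent=4)
+```
+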
+ ## Code Structure ```python @@ -168,8 +485,14 @@ Output your evaluation in the following JSON format: ├── examples │ ├── batch_eval.py │ ├── generate_query.py -│ ├── insert.py -│ └── query.py +│ ├── graph_visual.py +│ ├── lightrag_azure_openai_demo.py +│ ├── lightrag_bedrock_demo.py +│ ├── lightrag_hf_demo.py +│ ├── lightrag_ollama_demo.py +│ ├── lightrag_openai_compatible_demo.py +│ ├── lightrag_openai_demo.py +│ └── vram_management_demo.py ├── lightrag │ ├── __init__.py │ ├── base.py @@ -178,20 +501,38 @@ Output your evaluation in the following JSON format: │ ├── operate.py │ ├── prompt.py │ ├── storage.py -│ └── utils.jpeg +│ └── utils.py +├── reproduce +│ ├── Step_0.py +│ ├── Step_1.py +│ ├── Step_2.py +│ └── Step_3.py +├── .gitignore +├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── requirements.txt └── setup.py ``` + +## Star History + + + + + + Star History Chart + + + ## Citation -``` +```python @article{guo2024lightrag, -title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, +title={LightRAG: Simple and Fast Retrieval-Augmented Generation}, author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang}, year={2024}, -eprint={}, +eprint={2410.05779}, archivePrefix={arXiv}, primaryClass={cs.IR} } diff --git a/examples/batch_eval.py b/examples/batch_eval.py index 4601d267..a85e1ede 100644 --- a/examples/batch_eval.py +++ b/examples/batch_eval.py @@ -1,4 +1,3 @@ -import os import re import json import jsonlines @@ -9,28 +8,28 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): client = OpenAI() - with open(query_file, 'r') as f: + with open(query_file, "r") as f: data = f.read() - queries = re.findall(r'- Question \d+: (.+)', data) + queries = re.findall(r"- Question \d+: (.+)", data) - with open(result1_file, 'r') as f: + with open(result1_file, "r") as f: answers1 = json.load(f) - answers1 = [i['result'] for i in answers1] + answers1 = [i["result"] for i in answers1] - with open(result2_file, 'r') as f: + with open(result2_file, "r") as f: answers2 = json.load(f) - answers2 = [i['result'] for i in answers2] + answers2 = [i["result"] for i in answers2] requests = [] for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)): - sys_prompt = f""" + sys_prompt = """ ---Role--- You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. """ prompt = f""" - You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. + You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? 
@@ -69,7 +68,6 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): }} """ - request_data = { "custom_id": f"request-{i+1}", "method": "POST", @@ -78,22 +76,21 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): "model": "gpt-4o-mini", "messages": [ {"role": "system", "content": sys_prompt}, - {"role": "user", "content": prompt} + {"role": "user", "content": prompt}, ], - } + }, } - + requests.append(request_data) - with jsonlines.open(output_file_path, mode='w') as writer: + with jsonlines.open(output_file_path, mode="w") as writer: for request in requests: writer.write(request) print(f"Batch API requests written to {output_file_path}") batch_input_file = client.files.create( - file=open(output_file_path, "rb"), - purpose="batch" + file=open(output_file_path, "rb"), purpose="batch" ) batch_input_file_id = batch_input_file.id @@ -101,12 +98,11 @@ def batch_eval(query_file, result1_file, result2_file, output_file_path): input_file_id=batch_input_file_id, endpoint="/v1/chat/completions", completion_window="24h", - metadata={ - "description": "nightly eval job" - } + metadata={"description": "nightly eval job"}, ) - print(f'Batch {batch.id} has been created.') + print(f"Batch {batch.id} has been created.") + if __name__ == "__main__": - batch_eval() \ No newline at end of file + batch_eval() diff --git a/examples/generate_query.py b/examples/generate_query.py index 0ae82f40..705b23d3 100644 --- a/examples/generate_query.py +++ b/examples/generate_query.py @@ -1,9 +1,8 @@ -import os - from openai import OpenAI # os.environ["OPENAI_API_KEY"] = "" + def openai_complete_if_cache( model="gpt-4o-mini", prompt=None, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -47,10 +46,10 @@ def openai_complete_if_cache( ... 
""" - result = openai_complete_if_cache(model='gpt-4o-mini', prompt=prompt) + result = openai_complete_if_cache(model="gpt-4o-mini", prompt=prompt) - file_path = f"./queries.txt" + file_path = "./queries.txt" with open(file_path, "w") as file: file.write(result) - print(f"Queries written to {file_path}") \ No newline at end of file + print(f"Queries written to {file_path}") diff --git a/examples/graph_visual.py b/examples/graph_visual.py new file mode 100644 index 00000000..b455e6de --- /dev/null +++ b/examples/graph_visual.py @@ -0,0 +1,19 @@ +import networkx as nx +from pyvis.network import Network +import random + +# Load the GraphML file +G = nx.read_graphml('./dickens/graph_chunk_entity_relation.graphml') + +# Create a Pyvis network +net = Network(notebook=True) + +# Convert NetworkX graph to Pyvis network +net.from_nx(G) + +# Add colors to nodes +for node in net.nodes: + node['color'] = "#{:06x}".format(random.randint(0, 0xFFFFFF)) + +# Save and display the network +net.show('knowledge_graph.html') \ No newline at end of file diff --git a/examples/insert.py b/examples/insert.py deleted file mode 100644 index 25c3cdda..00000000 --- a/examples/insert.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -import sys - -from lightrag import LightRAG - -# os.environ["OPENAI_API_KEY"] = "" - -WORKING_DIR = "" - -if not os.path.exists(WORKING_DIR): - os.mkdir(WORKING_DIR) - -rag = LightRAG(working_dir=WORKING_DIR) - -with open('./text.txt', 'r') as f: - text = f.read() - -rag.insert(text) \ No newline at end of file diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py new file mode 100644 index 00000000..e29a6a9d --- /dev/null +++ b/examples/lightrag_azure_openai_demo.py @@ -0,0 +1,125 @@ +import os +import asyncio +from lightrag import LightRAG, QueryParam +from lightrag.utils import EmbeddingFunc +import numpy as np +from dotenv import load_dotenv +import aiohttp +import logging + +logging.basicConfig(level=logging.INFO) + +load_dotenv() + +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") +AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT") +AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY") +AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") + +AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_EMBEDDING_DEPLOYMENT") +AZURE_EMBEDDING_API_VERSION = os.getenv("AZURE_EMBEDDING_API_VERSION") + +WORKING_DIR = "./dickens" + +if os.path.exists(WORKING_DIR): + import shutil + + shutil.rmtree(WORKING_DIR) + +os.mkdir(WORKING_DIR) + + +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + headers = { + "Content-Type": "application/json", + "api-key": AZURE_OPENAI_API_KEY, + } + endpoint = f"{AZURE_OPENAI_ENDPOINT}openai/deployments/{AZURE_OPENAI_DEPLOYMENT}/chat/completions?api-version={AZURE_OPENAI_API_VERSION}" + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + if history_messages: + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + payload = { + "messages": messages, + "temperature": kwargs.get("temperature", 0), + "top_p": kwargs.get("top_p", 1), + "n": kwargs.get("n", 1), + } + + async with aiohttp.ClientSession() as session: + async with session.post(endpoint, headers=headers, json=payload) as response: + if response.status != 200: + raise ValueError( + f"Request failed with status {response.status}: {await response.text()}" + ) + result = await response.json() + return 
result["choices"][0]["message"]["content"] + + +async def embedding_func(texts: list[str]) -> np.ndarray: + headers = { + "Content-Type": "application/json", + "api-key": AZURE_OPENAI_API_KEY, + } + endpoint = f"{AZURE_OPENAI_ENDPOINT}openai/deployments/{AZURE_EMBEDDING_DEPLOYMENT}/embeddings?api-version={AZURE_EMBEDDING_API_VERSION}" + + payload = {"input": texts} + + async with aiohttp.ClientSession() as session: + async with session.post(endpoint, headers=headers, json=payload) as response: + if response.status != 200: + raise ValueError( + f"Request failed with status {response.status}: {await response.text()}" + ) + result = await response.json() + embeddings = [item["embedding"] for item in result["data"]] + return np.array(embeddings) + + +async def test_funcs(): + result = await llm_model_func("How are you?") + print("Resposta do llm_model_func: ", result) + + result = await embedding_func(["How are you?"]) + print("Resultado do embedding_func: ", result.shape) + print("Dimensão da embedding: ", result.shape[1]) + + +asyncio.run(test_funcs()) + +embedding_dimension = 3072 + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_dimension, + max_token_size=8192, + func=embedding_func, + ), +) + +book1 = open("./book_1.txt", encoding="utf-8") +book2 = open("./book_2.txt", encoding="utf-8") + +rag.insert([book1.read(), book2.read()]) + +query_text = "What are the main themes?" + +print("Result (Naive):") +print(rag.query(query_text, param=QueryParam(mode="naive"))) + +print("\nResult (Local):") +print(rag.query(query_text, param=QueryParam(mode="local"))) + +print("\nResult (Global):") +print(rag.query(query_text, param=QueryParam(mode="global"))) + +print("\nResult (Hybrid):") +print(rag.query(query_text, param=QueryParam(mode="hybrid"))) diff --git a/examples/lightrag_bedrock_demo.py b/examples/lightrag_bedrock_demo.py new file mode 100644 index 00000000..7e18ea57 --- /dev/null +++ b/examples/lightrag_bedrock_demo.py @@ -0,0 +1,36 @@ +""" +LightRAG meets Amazon Bedrock ⛰️ +""" + +import os +import logging + +from lightrag import LightRAG, QueryParam +from lightrag.llm import bedrock_complete, bedrock_embedding +from lightrag.utils import EmbeddingFunc + +logging.getLogger("aiobotocore").setLevel(logging.WARNING) + +WORKING_DIR = "./dickens" +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=bedrock_complete, + llm_model_name="Anthropic Claude 3 Haiku // Amazon Bedrock", + embedding_func=EmbeddingFunc( + embedding_dim=1024, max_token_size=8192, func=bedrock_embedding + ), +) + +with open("./book.txt", "r", encoding="utf-8") as f: + rag.insert(f.read()) + +for mode in ["naive", "local", "global", "hybrid"]: + print("\n+-" + "-" * len(mode) + "-+") + print(f"| {mode.capitalize()} |") + print("+-" + "-" * len(mode) + "-+\n") + print( + rag.query("What are the top themes in this story?", param=QueryParam(mode=mode)) + ) diff --git a/examples/lightrag_hf_demo.py b/examples/lightrag_hf_demo.py new file mode 100644 index 00000000..87312307 --- /dev/null +++ b/examples/lightrag_hf_demo.py @@ -0,0 +1,54 @@ +import os + +from lightrag import LightRAG, QueryParam +from lightrag.llm import hf_model_complete, hf_embedding +from lightrag.utils import EmbeddingFunc +from transformers import AutoModel, AutoTokenizer + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + 
working_dir=WORKING_DIR, + llm_model_func=hf_model_complete, + llm_model_name="meta-llama/Llama-3.1-8B-Instruct", + embedding_func=EmbeddingFunc( + embedding_dim=384, + max_token_size=5000, + func=lambda texts: hf_embedding( + texts, + tokenizer=AutoTokenizer.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2" + ), + embed_model=AutoModel.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2" + ), + ), + ), +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) + +# Perform local search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) + +# Perform global search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) + +# Perform hybrid search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_ollama_demo.py b/examples/lightrag_ollama_demo.py new file mode 100644 index 00000000..c61b71c0 --- /dev/null +++ b/examples/lightrag_ollama_demo.py @@ -0,0 +1,45 @@ +import os + +from lightrag import LightRAG, QueryParam +from lightrag.llm import ollama_model_complete, ollama_embedding +from lightrag.utils import EmbeddingFunc + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=ollama_model_complete, + llm_model_name="your_model_name", + embedding_func=EmbeddingFunc( + embedding_dim=768, + max_token_size=8192, + func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text"), + ), +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) + +# Perform local search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) + +# Perform global search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) + +# Perform hybrid search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_openai_compatible_demo.py b/examples/lightrag_openai_compatible_demo.py new file mode 100644 index 00000000..fbad1190 --- /dev/null +++ b/examples/lightrag_openai_compatible_demo.py @@ -0,0 +1,79 @@ +import os +import asyncio +from lightrag import LightRAG, QueryParam +from lightrag.llm import openai_complete_if_cache, openai_embedding +from lightrag.utils import EmbeddingFunc +import numpy as np + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + + +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs, + ) + + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + ) + + +# function test +async def test_funcs(): + result = await llm_model_func("How are you?") + print("llm_model_func: ", result) + + result = await embedding_func(["How are you?"]) + 
print("embedding_func: ", result) + + +asyncio.run(test_funcs()) + + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, max_token_size=8192, func=embedding_func + ), +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) + +# Perform local search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) + +# Perform global search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) + +# Perform hybrid search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/lightrag_openai_demo.py b/examples/lightrag_openai_demo.py new file mode 100644 index 00000000..a6e7f3b2 --- /dev/null +++ b/examples/lightrag_openai_demo.py @@ -0,0 +1,39 @@ +import os + +from lightrag import LightRAG, QueryParam +from lightrag.llm import gpt_4o_mini_complete + +WORKING_DIR = "./dickens" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_mini_complete, + # llm_model_func=gpt_4o_complete +) + + +with open("./book.txt") as f: + rag.insert(f.read()) + +# Perform naive search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="naive")) +) + +# Perform local search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="local")) +) + +# Perform global search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="global")) +) + +# Perform hybrid search +print( + rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid")) +) diff --git a/examples/query.py b/examples/query.py deleted file mode 100644 index 00c902eb..00000000 --- a/examples/query.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -import sys - -from lightrag import LightRAG, QueryParam - -# os.environ["OPENAI_API_KEY"] = "" - -WORKING_DIR = "" - -rag = LightRAG(working_dir=WORKING_DIR) - -mode = 'global' -query_param = QueryParam(mode=mode) - -result = rag.query("", param=query_param) -print(result) \ No newline at end of file diff --git a/examples/vram_management_demo.py b/examples/vram_management_demo.py new file mode 100644 index 00000000..ec750254 --- /dev/null +++ b/examples/vram_management_demo.py @@ -0,0 +1,82 @@ +import os +import time +from lightrag import LightRAG, QueryParam +from lightrag.llm import ollama_model_complete, ollama_embedding +from lightrag.utils import EmbeddingFunc + +# Working directory and the directory path for text files +WORKING_DIR = "./dickens" +TEXT_FILES_DIR = "/llm/mt" + +# Create the working directory if it doesn't exist +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +# Initialize LightRAG +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=ollama_model_complete, + llm_model_name="qwen2.5:3b-instruct-max-context", + embedding_func=EmbeddingFunc( + embedding_dim=768, + max_token_size=8192, + func=lambda texts: ollama_embedding(texts, embed_model="nomic-embed-text"), + ), +) + +# Read all .txt files from the TEXT_FILES_DIR directory +texts = [] +for filename in os.listdir(TEXT_FILES_DIR): + if filename.endswith('.txt'): + file_path = os.path.join(TEXT_FILES_DIR, filename) + with open(file_path, 'r', encoding='utf-8') as file: + texts.append(file.read()) + 
+# Batch insert texts into LightRAG with a retry mechanism +def insert_texts_with_retry(rag, texts, retries=3, delay=5): + for _ in range(retries): + try: + rag.insert(texts) + return + except Exception as e: + print(f"Error occurred during insertion: {e}. Retrying in {delay} seconds...") + time.sleep(delay) + raise RuntimeError("Failed to insert texts after multiple retries.") + +insert_texts_with_retry(rag, texts) + +# Perform different types of queries and handle potential errors +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="naive"))) +except Exception as e: + print(f"Error performing naive search: {e}") + +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="local"))) +except Exception as e: + print(f"Error performing local search: {e}") + +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="global"))) +except Exception as e: + print(f"Error performing global search: {e}") + +try: + print(rag.query("What are the top themes in this story?", param=QueryParam(mode="hybrid"))) +except Exception as e: + print(f"Error performing hybrid search: {e}") + +# Function to clear VRAM resources +def clear_vram(): + os.system("sudo nvidia-smi --gpu-reset") + +# Regularly clear VRAM to prevent overflow +clear_vram_interval = 3600 # Clear once every hour +start_time = time.time() + +while True: + current_time = time.time() + if current_time - start_time > clear_vram_interval: + clear_vram() + start_time = current_time + time.sleep(60) # Check the time every minute diff --git a/lightrag/__init__.py b/lightrag/__init__.py index a83afba3..f208177f 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ -from .lightrag import LightRAG, QueryParam +from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "0.0.1" +__version__ = "0.0.6" __author__ = "Zirui Guo" -__url__ = "https://github.com/HKUDS/GraphEdit" +__url__ = "https://github.com/HKUDS/LightRAG" diff --git a/lightrag/base.py b/lightrag/base.py index 9c0422fe..f8bba04d 100644 --- a/lightrag/base.py +++ b/lightrag/base.py @@ -12,15 +12,16 @@ T = TypeVar("T") + @dataclass class QueryParam: - mode: Literal["local", "global", "hybird", "naive"] = "global" + mode: Literal["local", "global", "hybrid", "naive"] = "global" only_need_context: bool = False response_type: str = "Multiple Paragraphs" top_k: int = 60 - max_token_for_text_unit: int = 4000 + max_token_for_text_unit: int = 4000 max_token_for_global_context: int = 4000 - max_token_for_local_context: int = 4000 + max_token_for_local_context: int = 4000 @dataclass @@ -36,6 +37,7 @@ async def query_done_callback(self): """commit the storage operations after querying""" pass + @dataclass class BaseVectorStorage(StorageNameSpace): embedding_func: EmbeddingFunc @@ -50,6 +52,13 @@ async def upsert(self, data: dict[str, dict]): """ raise NotImplementedError + def dump(self): + """For debug purposes + + Raises: + NotImplementedError: implement in subclass + """ + raise NotImplementedError @dataclass class BaseKVStorage(Generic[T], StorageNameSpace): async def all_keys(self) -> list[str]: @@ -72,7 +81,7 @@ async def upsert(self, data: dict[str, T]): async def drop(self): raise NotImplementedError - + @dataclass class BaseGraphStorage(StorageNameSpace): @@ -113,4 +122,4 @@ async def clustering(self, algorithm: str): raise NotImplementedError async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]: - raise NotImplementedError("Node 
embedding is not used in lightrag.") \ No newline at end of file + raise NotImplementedError("Node embedding is not used in lightrag.") diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 836fda9e..d2d1e01e 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1,17 +1,21 @@ import asyncio import os +import textwrap from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Type, cast +from typing import Type, List, Union, cast -from .llm import gpt_4o_complete, gpt_4o_mini_complete, openai_embedding +from .llm import ( + gpt_4o_mini_complete, + openai_embedding, +) from .operate import ( chunking_by_token_size, extract_entities, local_query, global_query, - hybird_query, + hybrid_query, naive_query, ) @@ -36,17 +40,17 @@ QueryParam, ) + def always_get_an_event_loop() -> asyncio.AbstractEventLoop: try: - # If there is already an event loop, use it. - loop = asyncio.get_event_loop() + loop = asyncio.get_running_loop() except RuntimeError: - # If in a sub-thread, create a new event loop. logger.info("Creating a new event loop in a sub-thread.") loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) return loop + @dataclass class LightRAG: working_dir: str = field( @@ -69,20 +73,20 @@ class LightRAG: "dimensions": 1536, "num_walks": 10, "walk_length": 40, - "num_walks": 10, "window_size": 2, "iterations": 3, "random_seed": 3, } ) - # text embedding + # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding) embedding_func: EmbeddingFunc = field(default_factory=lambda: openai_embedding) embedding_batch_num: int = 32 embedding_func_max_async: int = 16 # LLM - llm_model_func: callable = gpt_4o_mini_complete + llm_model_func: callable = gpt_4o_mini_complete # hf_model_complete# + llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" #'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' llm_model_max_token_size: int = 32768 llm_model_max_async: int = 16 @@ -97,11 +101,11 @@ class LightRAG: addon_params: dict = field(default_factory=dict) convert_response_to_json_func: callable = convert_response_to_json - def __post_init__(self): + def __post_init__(self): log_file = os.path.join(self.working_dir, "lightrag.log") set_logger(log_file) logger.info(f"Logger initialized for working directory: {self.working_dir}") - + _print_config = ",\n ".join([f"{k} = {v}" for k, v in asdict(self).items()]) logger.debug(f"LightRAG init with param:\n {_print_config}\n") @@ -127,42 +131,135 @@ def __post_init__(self): self.chunk_entity_relation_graph = self.graph_storage_cls( namespace="chunk_entity_relation", global_config=asdict(self) ) + self.embedding_func = limit_async_func_call(self.embedding_func_max_async)( self.embedding_func ) - self.entities_vdb = ( - self.vector_db_storage_cls( - namespace="entities", - global_config=asdict(self), - embedding_func=self.embedding_func, - meta_fields={"entity_name"} - ) + + self.entities_vdb = self.vector_db_storage_cls( + namespace="entities", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"entity_name"}, ) - self.relationships_vdb = ( - self.vector_db_storage_cls( - namespace="relationships", - global_config=asdict(self), - embedding_func=self.embedding_func, - meta_fields={"src_id", "tgt_id"} - ) + self.relationships_vdb = self.vector_db_storage_cls( + namespace="relationships", + global_config=asdict(self), + embedding_func=self.embedding_func, + meta_fields={"src_id", "tgt_id"}, ) - self.chunks_vdb = ( - 
self.vector_db_storage_cls( - namespace="chunks", - global_config=asdict(self), - embedding_func=self.embedding_func, - ) + self.chunks_vdb = self.vector_db_storage_cls( + namespace="chunks", + global_config=asdict(self), + embedding_func=self.embedding_func, ) - + self.llm_model_func = limit_async_func_call(self.llm_model_max_async)( partial(self.llm_model_func, hashing_kv=self.llm_response_cache) ) - def insert(self, string_or_strings): + def insert(self, input_data: Union[str, os.PathLike, List[Union[str, os.PathLike]]]): + """Insert content from a string, file path, or a list of them.""" + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert(input_data)) + + async def ainsert(self, input_data: Union[str, os.PathLike, List[Union[str, os.PathLike]]]): + """Asynchronously handle inserting content from strings or file paths.""" + try: + # Ensure input is treated as a list + if isinstance(input_data, (str, os.PathLike)): + input_data = [input_data] + + contents = [] + # Process each item: read from file or use the string directly + for item in input_data: + if isinstance(item, os.PathLike) and os.path.isfile(item): + with open(item, 'r') as f: + content = f.read().strip() + contents.append((content, os.path.basename(item), os.path.abspath(item))) + else: + contents.append((item.strip(), "!none", "!none")) + + # Create documents with hashed keys + new_docs = { + compute_mdhash_id(content.strip(), prefix="doc-"): { + "content": content, + "filename": filename, + "filepath": filepath + } + for content, filename, filepath in contents + } + + # Filter out already stored documents + _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) + new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} + if not new_docs: + logger.warning("All docs are already in the storage") + return + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + # Chunk documents with metadata prefixes for each chunk + inserting_chunks = {} + for doc_key, doc in new_docs.items(): + content = doc["content"] + filename = doc["filename"] + filepath = doc["filepath"] + + # Generate chunks with metadata directly + chunks = { + compute_mdhash_id(f"{doc_key}-{i}", prefix="chunk-"): { + **dp, + "content": dp["content"], + "full_doc_id": doc_key, + } + for i, dp in enumerate(chunking_by_token_size( + content, + overlap_token_size=self.chunk_overlap_token_size, + max_token_size=self.chunk_token_size, + tiktoken_model=self.tiktoken_model_name, + filename=filename, + filepath=filepath + )) + } + inserting_chunks.update(chunks) + + # Filter out already stored chunks + _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys())) + inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys} + if not inserting_chunks: + logger.warning("All chunks are already in the storage") + return + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + + # Insert chunks into vector database + await self.chunks_vdb.upsert(inserting_chunks) + + # Extract entities and relationships + logger.info("[Entity Extraction]...") + maybe_new_kg = await extract_entities( + inserting_chunks, + knwoledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=asdict(self), + ) + if maybe_new_kg is None: + logger.warning("No new entities and relationships found") + return + self.chunk_entity_relation_graph = maybe_new_kg + + # Upsert documents and chunks into 
their storages + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + + finally: + await self._insert_done() + + def _insert(self, string_or_strings): loop = always_get_an_event_loop() return loop.run_until_complete(self.ainsert(string_or_strings)) - async def ainsert(self, string_or_strings): + async def _ainsert(self, string_or_strings): try: if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] @@ -174,7 +271,7 @@ async def ainsert(self, string_or_strings): _add_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys())) new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} if not len(new_docs): - logger.warning(f"All docs are already in the storage") + logger.warning("All docs are already in the storage") return logger.info(f"[New Docs] inserting {len(new_docs)} docs") @@ -200,7 +297,7 @@ async def ainsert(self, string_or_strings): k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys } if not len(inserting_chunks): - logger.warning(f"All chunks are already in the storage") + logger.warning("All chunks are already in the storage") return logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") @@ -243,7 +340,7 @@ async def _insert_done(self): def query(self, query: str, param: QueryParam = QueryParam()): loop = always_get_an_event_loop() return loop.run_until_complete(self.aquery(query, param)) - + async def aquery(self, query: str, param: QueryParam = QueryParam()): if param.mode == "local": response = await local_query( @@ -265,8 +362,8 @@ async def aquery(self, query: str, param: QueryParam = QueryParam()): param, asdict(self), ) - elif param.mode == "hybird": - response = await hybird_query( + elif param.mode == "hybrid": + response = await hybrid_query( query, self.chunk_entity_relation_graph, self.entities_vdb, @@ -287,7 +384,6 @@ async def aquery(self, query: str, param: QueryParam = QueryParam()): raise ValueError(f"Unknown mode {param.mode}") await self._query_done() return response - async def _query_done(self): tasks = [] @@ -296,5 +392,3 @@ async def _query_done(self): continue tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback()) await asyncio.gather(*tasks) - - diff --git a/lightrag/llm.py b/lightrag/llm.py index ee700a10..be801e0c 100644 --- a/lightrag/llm.py +++ b/lightrag/llm.py @@ -1,5 +1,9 @@ import os +import copy +import json +import aioboto3 import numpy as np +import ollama from openai import AsyncOpenAI, APIConnectionError, RateLimitError, Timeout from tenacity import ( retry, @@ -7,19 +11,34 @@ wait_exponential, retry_if_exception_type, ) - +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch from .base import BaseKVStorage from .utils import compute_args_hash, wrap_embedding_func_with_attrs +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), ) async def openai_complete_if_cache( - model, prompt, system_prompt=None, history_messages=[], **kwargs + model, + prompt, + system_prompt=None, + history_messages=[], + base_url=None, + api_key=None, + **kwargs, ) -> str: - openai_async_client = AsyncOpenAI() + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + + openai_async_client = ( + AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) + ) hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) messages = [] if system_prompt: @@ 
-42,6 +61,192 @@ async def openai_complete_if_cache( ) return response.choices[0].message.content + +class BedrockError(Exception): + """Generic error for issues related to Amazon Bedrock""" + + +@retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, max=60), + retry=retry_if_exception_type((BedrockError)), +) +async def bedrock_complete_if_cache( + model, + prompt, + system_prompt=None, + history_messages=[], + aws_access_key_id=None, + aws_secret_access_key=None, + aws_session_token=None, + **kwargs, +) -> str: + os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get( + "AWS_ACCESS_KEY_ID", aws_access_key_id + ) + os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get( + "AWS_SECRET_ACCESS_KEY", aws_secret_access_key + ) + os.environ["AWS_SESSION_TOKEN"] = os.environ.get( + "AWS_SESSION_TOKEN", aws_session_token + ) + + # Fix message history format + messages = [] + for history_message in history_messages: + message = copy.copy(history_message) + message["content"] = [{"text": message["content"]}] + messages.append(message) + + # Add user prompt + messages.append({"role": "user", "content": [{"text": prompt}]}) + + # Initialize Converse API arguments + args = {"modelId": model, "messages": messages} + + # Define system prompt + if system_prompt: + args["system"] = [{"text": system_prompt}] + + # Map and set up inference parameters + inference_params_map = { + "max_tokens": "maxTokens", + "top_p": "topP", + "stop_sequences": "stopSequences", + } + if inference_params := list( + set(kwargs) & set(["max_tokens", "temperature", "top_p", "stop_sequences"]) + ): + args["inferenceConfig"] = {} + for param in inference_params: + args["inferenceConfig"][inference_params_map.get(param, param)] = ( + kwargs.pop(param) + ) + + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + # Call model via Converse API + session = aioboto3.Session() + async with session.client("bedrock-runtime") as bedrock_async_client: + try: + response = await bedrock_async_client.converse(**args, **kwargs) + except Exception as e: + raise BedrockError(e) + + if hashing_kv is not None: + await hashing_kv.upsert( + { + args_hash: { + "return": response["output"]["message"]["content"][0]["text"], + "model": model, + } + } + ) + + return response["output"]["message"]["content"][0]["text"] + + +async def hf_model_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + model_name = model + hf_tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto") + if hf_tokenizer.pad_token is None: + # print("use eos token") + hf_tokenizer.pad_token = hf_tokenizer.eos_token + hf_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + input_prompt = "" + try: + input_prompt = hf_tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + except Exception: + try: + 
ori_message = copy.deepcopy(messages) + if messages[0]["role"] == "system": + messages[1]["content"] = ( + "" + + messages[0]["content"] + + "\n" + + messages[1]["content"] + ) + messages = messages[1:] + input_prompt = hf_tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + except Exception: + len_message = len(ori_message) + for msgid in range(len_message): + input_prompt = ( + input_prompt + + "<" + + ori_message[msgid]["role"] + + ">" + + ori_message[msgid]["content"] + + "\n" + ) + + input_ids = hf_tokenizer( + input_prompt, return_tensors="pt", padding=True, truncation=True + ).to("cuda") + output = hf_model.generate( + **input_ids, max_new_tokens=200, num_return_sequences=1, early_stopping=True + ) + response_text = hf_tokenizer.decode(output[0], skip_special_tokens=True) + if hashing_kv is not None: + await hashing_kv.upsert({args_hash: {"return": response_text, "model": model}}) + return response_text + + +async def ollama_model_if_cache( + model, prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + kwargs.pop("max_tokens", None) + kwargs.pop("response_format", None) + + ollama_client = ollama.AsyncClient() + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + if hashing_kv is not None: + args_hash = compute_args_hash(model, messages) + if_cache_return = await hashing_kv.get_by_id(args_hash) + if if_cache_return is not None: + return if_cache_return["return"] + + response = await ollama_client.chat(model=model, messages=messages, **kwargs) + + result = response["message"]["content"] + + if hashing_kv is not None: + await hashing_kv.upsert({args_hash: {"return": result, "model": model}}) + + return result + + async def gpt_4o_complete( prompt, system_prompt=None, history_messages=[], **kwargs ) -> str: @@ -65,24 +270,165 @@ async def gpt_4o_mini_complete( **kwargs, ) + +async def bedrock_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await bedrock_complete_if_cache( + "anthropic.claude-3-haiku-20240307-v1:0", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +async def hf_model_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + model_name = kwargs["hashing_kv"].global_config["llm_model_name"] + return await hf_model_if_cache( + model_name, + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + +async def ollama_model_complete( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + model_name = kwargs["hashing_kv"].global_config["llm_model_name"] + return await ollama_model_if_cache( + model_name, + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + **kwargs, + ) + + @wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192) @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), ) -async def openai_embedding(texts: list[str]) -> np.ndarray: - openai_async_client = AsyncOpenAI() +async def openai_embedding( + texts: list[str], + model: str = "text-embedding-3-small", + base_url: str = None, + api_key: str = None, +) -> np.ndarray: + if api_key: + os.environ["OPENAI_API_KEY"] = api_key + + 
openai_async_client = ( + AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url) + ) response = await openai_async_client.embeddings.create( - model="text-embedding-3-small", input=texts, encoding_format="float" + model=model, input=texts, encoding_format="float" ) return np.array([dp.embedding for dp in response.data]) + +# @wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192) +# @retry( +# stop=stop_after_attempt(3), +# wait=wait_exponential(multiplier=1, min=4, max=10), +# retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)), # TODO: fix exceptions +# ) +async def bedrock_embedding( + texts: list[str], + model: str = "amazon.titan-embed-text-v2:0", + aws_access_key_id=None, + aws_secret_access_key=None, + aws_session_token=None, +) -> np.ndarray: + os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get( + "AWS_ACCESS_KEY_ID", aws_access_key_id + ) + os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get( + "AWS_SECRET_ACCESS_KEY", aws_secret_access_key + ) + os.environ["AWS_SESSION_TOKEN"] = os.environ.get( + "AWS_SESSION_TOKEN", aws_session_token + ) + + session = aioboto3.Session() + async with session.client("bedrock-runtime") as bedrock_async_client: + if (model_provider := model.split(".")[0]) == "amazon": + embed_texts = [] + for text in texts: + if "v2" in model: + body = json.dumps( + { + "inputText": text, + # 'dimensions': embedding_dim, + "embeddingTypes": ["float"], + } + ) + elif "v1" in model: + body = json.dumps({"inputText": text}) + else: + raise ValueError(f"Model {model} is not supported!") + + response = await bedrock_async_client.invoke_model( + modelId=model, + body=body, + accept="application/json", + contentType="application/json", + ) + + response_body = await response.get("body").json() + + embed_texts.append(response_body["embedding"]) + elif model_provider == "cohere": + body = json.dumps( + {"texts": texts, "input_type": "search_document", "truncate": "NONE"} + ) + + response = await bedrock_async_client.invoke_model( + model=model, + body=body, + accept="application/json", + contentType="application/json", + ) + + response_body = json.loads(response.get("body").read()) + + embed_texts = response_body["embeddings"] + else: + raise ValueError(f"Model provider '{model_provider}' is not supported!") + + return np.array(embed_texts) + + +async def hf_embedding(texts: list[str], tokenizer, embed_model) -> np.ndarray: + input_ids = tokenizer( + texts, return_tensors="pt", padding=True, truncation=True + ).input_ids + with torch.no_grad(): + outputs = embed_model(input_ids) + embeddings = outputs.last_hidden_state.mean(dim=1) + return embeddings.detach().numpy() + + +async def ollama_embedding(texts: list[str], embed_model) -> np.ndarray: + embed_text = [] + for text in texts: + data = ollama.embeddings(model=embed_model, prompt=text) + embed_text.append(data["embedding"]) + + return embed_text + + if __name__ == "__main__": import asyncio async def main(): - result = await gpt_4o_mini_complete('How are you?') + result = await gpt_4o_mini_complete("How are you?") print(result) asyncio.run(main()) diff --git a/lightrag/operate.py b/lightrag/operate.py index 2d3271da..106c518c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -1,9 +1,10 @@ import asyncio import json import re +import textwrap from typing import Union from collections import Counter, defaultdict - +import warnings from .utils import ( logger, clean_str, @@ -26,6 +27,45 @@ from .prompt import GRAPH_FIELD_SEP, PROMPTS def 
chunking_by_token_size(
+    content: str,
+    overlap_token_size=128,
+    max_token_size=1024,
+    tiktoken_model="gpt-4o",
+    filename="!none",
+    filepath="!none"
+):
+    """Chunk content by token size with metadata prefixed to each chunk."""
+    tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
+    results = []
+
+    for index, start in enumerate(
+        range(0, len(tokens), max_token_size - overlap_token_size)
+    ):
+        end = min(start + max_token_size, len(tokens))
+
+        # Decode the token slice back to text; start/end are token offsets, not
+        # character offsets, so slicing the raw string would return the wrong span.
+        # Uses the same tiktoken decode helper as _chunking_by_token_size below.
+        chunk_content = decode_tokens_by_tiktoken(
+            tokens[start:end], model_name=tiktoken_model
+        )
+
+        # Prefix metadata to each chunk
+        chunk_with_metadata = textwrap.dedent(f"""
+            ####
+            ## FILENAME: {filename}
+            ## FILEPATH: {filepath}
+            ## CHUNK_NUM: {index}
+            ####
+            """).strip() + "\n" + chunk_content.strip()
+
+        # Store the chunk with its metadata
+        results.append({
+            "content": chunk_with_metadata,
+            "chunk_order_index": index,
+            "tokens": end - start
+        })
+
+    return results
+
+def _chunking_by_token_size(
     content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o"
 ):
     tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
@@ -45,6 +85,7 @@ def chunking_by_token_size(
     )
     return results
 
+
 async def _handle_entity_relation_summary(
     entity_or_relation_name: str,
     description: str,
@@ -76,20 +117,29 @@ async def _handle_single_entity_extraction(
     record_attributes: list[str],
     chunk_key: str,
 ):
-    if record_attributes[0] != '"entity"' or len(record_attributes) < 4:
+    if len(record_attributes) < 7 or record_attributes[0] != '"entity"':
         return None
+
     # add this record as a node in the G
-    entity_name = clean_str(record_attributes[1].upper())
+    entity_name = clean_str(record_attributes[1])  # removed upper
     if not entity_name.strip():
         return None
-    entity_type = clean_str(record_attributes[2].upper())
+
+    entity_type = clean_str(record_attributes[2])
     entity_description = clean_str(record_attributes[3])
     entity_source_id = chunk_key
+    entity_file = record_attributes[4]
+    entity_path = record_attributes[5]
+    entity_chunk = record_attributes[6]
+
     return dict(
         entity_name=entity_name,
         entity_type=entity_type,
         description=entity_description,
         source_id=entity_source_id,
+        file=entity_file,
+        path=entity_path,
+        chunk=entity_chunk,
     )
@@ -97,18 +147,23 @@ async def _handle_single_relationship_extraction(
     record_attributes: list[str],
     chunk_key: str,
 ):
-    if record_attributes[0] != '"relationship"' or len(record_attributes) < 5:
-        return None
+    if len(record_attributes) < 9 or record_attributes[0] != '"relationship"':
+        return None  # "content_keywords" should be processed somewhere
+
     # add this record as edge
-    source = clean_str(record_attributes[1].upper())
-    target = clean_str(record_attributes[2].upper())
+    source = clean_str(record_attributes[1])  # capitalization disabled
+    target = clean_str(record_attributes[2])
     edge_description = clean_str(record_attributes[3])
     edge_keywords = clean_str(record_attributes[4])
     edge_source_id = chunk_key
     weight = (
-        float(record_attributes[-1]) if is_float_regex(record_attributes[-1]) else 1.0
+        float(record_attributes[5]) if is_float_regex(record_attributes[5]) else 1.0
     )
+    edge_file = record_attributes[6]
+    edge_path = record_attributes[7]
+    edge_chunk = record_attributes[8]
+
     return dict(
         src_id=source,
         tgt_id=target,
@@ -116,6 +171,9 @@
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
+        file=edge_file,
+        path=edge_path,
+        chunk=edge_chunk,
     )
@@ -229,9 +287,10 @@ async def 
_merge_edges_then_upsert( description=description, keywords=keywords, ) - + return edge_data + async def extract_entities( chunks: dict[str, TextChunkSchema], knwoledge_graph_inst: BaseGraphStorage, @@ -263,7 +322,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): chunk_key = chunk_key_dp[0] chunk_dp = chunk_key_dp[1] content = chunk_dp["content"] - hint_prompt = entity_extract_prompt.format(**context_base, input_text=content) + hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)#.replace('{{', '{').replace('}}', '}') final_result = await use_llm_func(hint_prompt) history = pack_user_ass_to_openai_messages(hint_prompt, final_result) @@ -352,7 +411,9 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): logger.warning("Didn't extract any entities, maybe your LLM is not working") return None if not len(all_relationships_data): - logger.warning("Didn't extract any relationships, maybe your LLM is not working") + logger.warning( + "Didn't extract any relationships, maybe your LLM is not working" + ) return None if entity_vdb is not None: @@ -370,7 +431,10 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): { "src_id": dp["src_id"], "tgt_id": dp["tgt_id"], - "content": dp["keywords"] + dp["src_id"] + dp["tgt_id"] + dp["description"], + "content": dp["keywords"] + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], } for dp in all_relationships_data } @@ -378,6 +442,7 @@ async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]): return knwoledge_graph_inst + async def local_query( query, knowledge_graph_inst: BaseGraphStorage, @@ -387,28 +452,42 @@ async def local_query( query_param: QueryParam, global_config: dict, ) -> str: + context = None use_model_func = global_config["llm_model_func"] kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query) result = await use_model_func(kw_prompt) - + try: keywords_data = json.loads(result) keywords = keywords_data.get("low_level_keywords", []) - keywords = ', '.join(keywords) - except json.JSONDecodeError as e: + keywords = ", ".join(keywords) + except json.JSONDecodeError: + try: + result = ( + result.replace(kw_prompt[:-1], "") + .replace("user", "") + .replace("model", "") + .strip() + ) + result = "{" + result.split("{")[1].split("}")[0] + "}" + + keywords_data = json.loads(result) + keywords = keywords_data.get("low_level_keywords", []) + keywords = ", ".join(keywords) # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - - context = await _build_local_query_context( - keywords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) + except json.JSONDecodeError as e: + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + if keywords: + context = await _build_local_query_context( + keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) if query_param.only_need_context: return context if context is None: @@ -421,8 +500,20 @@ async def local_query( query, system_prompt=sys_prompt, ) + if len(response) > len(sys_prompt): + response = ( + response.replace(sys_prompt, "") + .replace("user", "") + .replace("model", "") + .replace(query, "") + .replace("", "") + .replace("", "") + .strip() + ) + return response + async def _build_local_query_context( query, knowledge_graph_inst: BaseGraphStorage, @@ -504,6 +595,7 
@@ async def _build_local_query_context(
 ```
 """
 
+
 async def _find_most_related_text_unit_from_entities(
     node_datas: list[dict],
     query_param: QueryParam,
@@ -556,6 +648,9 @@ async def _find_most_related_text_unit_from_entities(
     all_text_units = sorted(
         all_text_units, key=lambda x: (x["order"], -x["relation_counts"])
     )
+    # somehow empty chunks can end up here; keep only units whose data has content
+    all_text_units = filter(
+        lambda x: x.get("data") and x["data"].get("content"), all_text_units
+    )
+
     all_text_units = truncate_list_by_token_size(
         all_text_units,
         key=lambda x: x["data"]["content"],
@@ -564,6 +659,7 @@
     all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units]
     return all_text_units
 
+
 async def _find_most_related_edges_from_entities(
     node_datas: list[dict],
     query_param: QueryParam,
@@ -597,6 +693,7 @@
     )
     return all_edges_data
 
+
 async def global_query(
     query,
     knowledge_graph_inst: BaseGraphStorage,
@@ -606,35 +703,50 @@
     query_param: QueryParam,
     global_config: dict,
 ) -> str:
+    context = None
     use_model_func = global_config["llm_model_func"]
 
     kw_prompt_temp = PROMPTS["keywords_extraction"]
     kw_prompt = kw_prompt_temp.format(query=query)
     result = await use_model_func(kw_prompt)
-
     try:
         keywords_data = json.loads(result)
         keywords = keywords_data.get("high_level_keywords", [])
-        keywords = ', '.join(keywords)
-    except json.JSONDecodeError as e:
-        # Handle parsing error
-        print(f"JSON parsing error: {e}")
-        return PROMPTS["fail_response"]
+        keywords = ", ".join(keywords)
+    except json.JSONDecodeError:
+        try:
+            result = (
+                result.replace(kw_prompt[:-1], "")
+                .replace("user", "")
+                .replace("model", "")
+                .strip()
+            )
+            result = "{" + result.split("{")[1].split("}")[0] + "}"
+
+            keywords_data = json.loads(result)
+            keywords = keywords_data.get("high_level_keywords", [])
+            keywords = ", ".join(keywords)
+
+        except json.JSONDecodeError as e:
+            # Handle parsing error
+            print(f"JSON parsing error: {e}")
+            return PROMPTS["fail_response"]
+    if keywords:
+        context = await _build_global_query_context(
+            keywords,
+            knowledge_graph_inst,
+            entities_vdb,
+            relationships_vdb,
+            text_chunks_db,
+            query_param,
+        )
 
-    context = await _build_global_query_context(
-        keywords,
-        knowledge_graph_inst,
-        entities_vdb,
-        relationships_vdb,
-        text_chunks_db,
-        query_param,
-    )
-
     if query_param.only_need_context:
         return context
     if context is None:
         return PROMPTS["fail_response"]
-
+
     sys_prompt_temp = PROMPTS["rag_response"]
     sys_prompt = sys_prompt_temp.format(
         context_data=context, response_type=query_param.response_type
@@ -643,8 +755,20 @@
         query,
         system_prompt=sys_prompt,
     )
+    if len(response) > len(sys_prompt):
+        response = (
+            response.replace(sys_prompt, "")
+            .replace("user", "")
+            .replace("model", "")
+            .replace(query, "")
+            .replace("", "")
+            .replace("", "")
+            .strip()
+        )
+
     return response
 
+
 async def _build_global_query_context(
     keywords,
     knowledge_graph_inst: BaseGraphStorage,
@@ -654,14 +778,14 @@
     query_param: QueryParam,
 ):
     results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
-
+
     if not len(results):
         return None
-
+
     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )
-
+
     if not all([n is not None for n in edge_datas]):
         logger.warning("Some edges are missing, maybe the storage is damaged")
     edge_degree = await asyncio.gather(
@@ -740,6 +864,7 @@ async def _build_global_query_context(
``` """ + async def _find_most_related_entities_from_relationships( edge_datas: list[dict], query_param: QueryParam, @@ -749,7 +874,7 @@ async def _find_most_related_entities_from_relationships( for e in edge_datas: entity_names.add(e["src_id"]) entity_names.add(e["tgt_id"]) - + node_datas = await asyncio.gather( *[knowledge_graph_inst.get_node(entity_name) for entity_name in entity_names] ) @@ -770,13 +895,13 @@ async def _find_most_related_entities_from_relationships( return node_datas + async def _find_related_text_unit_from_relationships( edge_datas: list[dict], query_param: QueryParam, text_chunks_db: BaseKVStorage[TextChunkSchema], knowledge_graph_inst: BaseGraphStorage, ): - text_units = [ split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) for dp in edge_datas @@ -791,15 +916,13 @@ async def _find_related_text_unit_from_relationships( "data": await text_chunks_db.get_by_id(c_id), "order": index, } - + if any([v is None for v in all_text_units_lookup.values()]): logger.warning("Text chunks are missing, maybe the storage is damaged") all_text_units = [ {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None ] - all_text_units = sorted( - all_text_units, key=lambda x: x["order"] - ) + all_text_units = sorted(all_text_units, key=lambda x: x["order"]) all_text_units = truncate_list_by_token_size( all_text_units, key=lambda x: x["data"]["content"], @@ -809,7 +932,8 @@ async def _find_related_text_unit_from_relationships( return all_text_units -async def hybird_query( + +async def hybrid_query( query, knowledge_graph_inst: BaseGraphStorage, entities_vdb: BaseVectorStorage, @@ -818,47 +942,66 @@ async def hybird_query( query_param: QueryParam, global_config: dict, ) -> str: + low_level_context = None + high_level_context = None use_model_func = global_config["llm_model_func"] kw_prompt_temp = PROMPTS["keywords_extraction"] kw_prompt = kw_prompt_temp.format(query=query) - result = await use_model_func(kw_prompt) + result = await use_model_func(kw_prompt) try: keywords_data = json.loads(result) hl_keywords = keywords_data.get("high_level_keywords", []) ll_keywords = keywords_data.get("low_level_keywords", []) - hl_keywords = ', '.join(hl_keywords) - ll_keywords = ', '.join(ll_keywords) - except json.JSONDecodeError as e: + hl_keywords = ", ".join(hl_keywords) + ll_keywords = ", ".join(ll_keywords) + except json.JSONDecodeError: + try: + result = ( + result.replace(kw_prompt[:-1], "") + .replace("user", "") + .replace("model", "") + .strip() + ) + result = "{" + result.split("{")[1].split("}")[0] + "}" + + keywords_data = json.loads(result) + hl_keywords = keywords_data.get("high_level_keywords", []) + ll_keywords = keywords_data.get("low_level_keywords", []) + hl_keywords = ", ".join(hl_keywords) + ll_keywords = ", ".join(ll_keywords) # Handle parsing error - print(f"JSON parsing error: {e}") - return PROMPTS["fail_response"] - - low_level_context = await _build_local_query_context( - ll_keywords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) - - high_level_context = await _build_global_query_context( - hl_keywords, - knowledge_graph_inst, - entities_vdb, - relationships_vdb, - text_chunks_db, - query_param, - ) - + except json.JSONDecodeError as e: + print(f"JSON parsing error: {e}") + return PROMPTS["fail_response"] + + if ll_keywords: + low_level_context = await _build_local_query_context( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + + if hl_keywords: + high_level_context = 
await _build_global_query_context( + hl_keywords, + knowledge_graph_inst, + entities_vdb, + relationships_vdb, + text_chunks_db, + query_param, + ) + context = combine_contexts(high_level_context, low_level_context) if query_param.only_need_context: return context if context is None: return PROMPTS["fail_response"] - + sys_prompt_temp = PROMPTS["rag_response"] sys_prompt = sys_prompt_temp.format( context_data=context, response_type=query_param.response_type @@ -867,37 +1010,77 @@ async def hybird_query( query, system_prompt=sys_prompt, ) + if len(response) > len(sys_prompt): + response = ( + response.replace(sys_prompt, "") + .replace("user", "") + .replace("model", "") + .replace(query, "") + .replace("", "") + .replace("", "") + .strip() + ) return response + def combine_contexts(high_level_context, low_level_context): # Function to extract entities, relationships, and sources from context strings def extract_sections(context): - entities_match = re.search(r'-----Entities-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - relationships_match = re.search(r'-----Relationships-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - sources_match = re.search(r'-----Sources-----\s*```csv\s*(.*?)\s*```', context, re.DOTALL) - - entities = entities_match.group(1) if entities_match else '' - relationships = relationships_match.group(1) if relationships_match else '' - sources = sources_match.group(1) if sources_match else '' - + entities_match = re.search( + r"-----Entities-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL + ) + relationships_match = re.search( + r"-----Relationships-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL + ) + sources_match = re.search( + r"-----Sources-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL + ) + + entities = entities_match.group(1) if entities_match else "" + relationships = relationships_match.group(1) if relationships_match else "" + sources = sources_match.group(1) if sources_match else "" + return entities, relationships, sources - + # Extract sections from both contexts - hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) - ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context) - - # Combine and deduplicate the entities - combined_entities_set = set(filter(None, hl_entities.strip().split('\n') + ll_entities.strip().split('\n'))) - combined_entities = '\n'.join(combined_entities_set) - + + if high_level_context is None: + warnings.warn( + "High Level context is None. Return empty High entity/relationship/source" + ) + hl_entities, hl_relationships, hl_sources = "", "", "" + else: + hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context) + + if low_level_context is None: + warnings.warn( + "Low Level context is None. 
Return empty Low entity/relationship/source"
+        )
+        ll_entities, ll_relationships, ll_sources = "", "", ""
+    else:
+        ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context)
+
+    # Combine and deduplicate the entities
+    combined_entities_set = set(
+        filter(None, hl_entities.strip().split("\n") + ll_entities.strip().split("\n"))
+    )
+    combined_entities = "\n".join(combined_entities_set)
+
     # Combine and deduplicate the relationships
-    combined_relationships_set = set(filter(None, hl_relationships.strip().split('\n') + ll_relationships.strip().split('\n')))
-    combined_relationships = '\n'.join(combined_relationships_set)
-
+    combined_relationships_set = set(
+        filter(
+            None,
+            hl_relationships.strip().split("\n") + ll_relationships.strip().split("\n"),
+        )
+    )
+    combined_relationships = "\n".join(combined_relationships_set)
+
     # Combine and deduplicate the sources
-    combined_sources_set = set(filter(None, hl_sources.strip().split('\n') + ll_sources.strip().split('\n')))
-    combined_sources = '\n'.join(combined_sources_set)
-
+    combined_sources_set = set(
+        filter(None, hl_sources.strip().split("\n") + ll_sources.strip().split("\n"))
+    )
+    combined_sources = "\n".join(combined_sources_set)
+
     # Format the combined context
     return f"""
 -----Entities-----
@@ -909,6 +1092,7 @@ def extract_sections(context):
 {combined_sources}
 """
 
+
 async def naive_query(
     query,
     chunks_vdb: BaseVectorStorage,
@@ -940,5 +1124,17 @@ async def naive_query(
         query,
         system_prompt=sys_prompt,
     )
-    return response
 
+    if len(response) > len(sys_prompt):
+        response = (
+            response[len(sys_prompt) :]
+            .replace(sys_prompt, "")
+            .replace("user", "")
+            .replace("model", "")
+            .replace(query, "")
+            .replace("", "")
+            .replace("", "")
+            .strip()
+        )
+
+    return response
diff --git a/lightrag/prompt.py b/lightrag/prompt.py
index 5d28e49c..e704244c 100644
--- a/lightrag/prompt.py
+++ b/lightrag/prompt.py
@@ -1,3 +1,6 @@
+import os
+import toml
+
 GRAPH_FIELD_SEP = ""
 
 PROMPTS = {}
@@ -7,250 +10,17 @@ PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
 
 PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
 
-PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
-
-PROMPTS[
-    "entity_extraction"
-] = """-Goal-
-Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
-
--Steps-
-1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
-- entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
-Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}
-
-2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other. 
-For each pair of related entities, extract the following information: -- source_entity: name of the source entity, as identified in step 1 -- target_entity: name of the target entity, as identified in step 1 -- relationship_description: explanation as to why you think the source entity and the target entity are related to each other -- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity -- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details -Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document. -Format the content-level key words as ("content_keywords"{tuple_delimiter}) - -4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. - -5. When finished, output {completion_delimiter} - -###################### --Examples- -###################### -Example 1: - -Entity_types: [person, technology, mission, organization, location] -Text: -while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. - -Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.” - -The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. - -It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths -################ -Output: -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter} -("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter} -("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter} -("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter} -("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter} -("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter} -############################# -Example 2: - -Entity_types: [person, technology, mission, organization, location] -Text: -They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve. - -Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril. - -Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. 
Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly -############# -Output: -("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter} -("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter} -("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter} -("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter} -("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter} -############################# -Example 3: - -Entity_types: [person, role, technology, organization, event, location, concept] -Text: -their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data. - -"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning." - -Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back." - -Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history. 
- -The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation -############# -Output: -("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter} -("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter} -("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter} -("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter} -("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter} -("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter} -("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter} -("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter} -("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter} -("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter} -############################# --Real Data- -###################### -Entity_types: {entity_types} -Text: {input_text} -###################### -Output: -""" - -PROMPTS[ - "summarize_entity_descriptions" -] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. -Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. -Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. -If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. -Make sure it is written in third person, and include the entity names so we the have full context. 
- -####### --Data- -Entities: {entity_name} -Description List: {description_list} -####### -Output: -""" - -PROMPTS[ - "entiti_continue_extraction" -] = """MANY entities were missed in the last extraction. Add them below using the same format: -""" - -PROMPTS[ - "entiti_if_loop_extraction" -] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. -""" - -PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question." - -PROMPTS[ - "rag_response" -] = """---Role--- - -You are a helpful assistant responding to questions about data in the tables provided. - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. - ----Target response length and format--- - -{response_type} - - ----Data tables--- - -{context_data} - - ----Goal--- - -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. - -If you don't know the answer, just say so. Do not make anything up. - -Do not include information where the supporting evidence for it is not provided. - - ----Target response length and format--- - -{response_type} - -Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. -""" - -PROMPTS["keywords_extraction"] = """---Role--- - -You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. - ----Goal--- - -Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. - ----Instructions--- - -- Output the keywords in JSON format. -- The JSON should have two keys: - - "high_level_keywords" for overarching concepts or themes. - - "low_level_keywords" for specific entities or details. - -###################### --Examples- -###################### -Example 1: - -Query: "How does international trade influence global economic stability?" -################ -Output: -{{ - "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], - "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] -}} -############################# -Example 2: - -Query: "What are the environmental consequences of deforestation on biodiversity?" -################ -Output: -{{ - "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], - "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] -}} -############################# -Example 3: +def load_prompts(toml_file=os.path.join(os.path.dirname(__file__), "prompts", "code.toml")): + """Load prompts from a TOML file and merge them into the existing PROMPTS dictionary.""" + try: + # Load prompts from the TOML file. + toml_data = toml.load(toml_file) -Query: "What is the role of education in reducing poverty?" 
-################ -Output: -{{ - "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], - "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] -}} -############################# --Real Data- -###################### -Query: {query} -###################### -Output: + # Merge TOML prompts into the existing PROMPTS dictionary. + PROMPTS.update({k: v for k, v in toml_data.items() if v}) -""" + except Exception as e: + print(f"Error loading and merging prompts: {e}") -PROMPTS[ - "naive_rag_response" -] = """You're a helpful assistant -Below are the knowledge you know: -{content_data} ---- -If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. -Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. -If you don't know the answer, just say so. Do not make anything up. -Do not include information where the supporting evidence for it is not provided. ----Target response length and format--- -{response_type} -""" +# Example usage: Load TOML prompts and merge with existing PROMPTS. +load_prompts() \ No newline at end of file diff --git a/lightrag/prompts/code.toml b/lightrag/prompts/code.toml new file mode 100644 index 00000000..46cb0c29 --- /dev/null +++ b/lightrag/prompts/code.toml @@ -0,0 +1,272 @@ +DEFAULT_ENTITY_TYPES = ["function", "class", "module", "file", "variable", "comment", "readme", "test_case", "dependency", "call"] + +entity_extraction = """ +-Goal- +Given a source code document chunk (prefixed with metadata) and a list of entity types, extract all relevant entities and relationships within the chunk. Pay special attention to the code structure (functions, classes, modules) and companion text (comments, README content etc). + +Metadata prefixes are provided in each chunk as follows: +#### +## FILENAME: +## FILEPATH: +## CHUNK_NUM: +#### + +- Use this metadata to link extracted entities to their source location and maintain context across chunks. + +-Steps- +1. Identify All Entities: + For each identified entity, extract the following: + - entity_name: The name of the entity e.g., function name, class name, variable name, etc. + - entity_type: One of the following types: [{entity_types}] + - entity_description: A detailed description of the entity's role, behavior, and attributes (e.g., what a function does, what a class represents). + - file_name: FILENAME field from metadata + - file_path: FILE_PATH field from metadata + - chunk_num: CHUNK_NUM field from metadata + + Format each entity as: + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. Identify Relationships: + Identify all related pairs of entities. Focus on relationships such as function calls, inheritance, dependencies, and code references. + + For each relationship, extract: + - source_entity: Name of the source entity (e.g., function or class making a call). + - target_entity: Name of the target entity (e.g., function being called or class being inherited). + - relationship_description: Explanation of how the two entities are related. + - relationship_strength: A numeric score (1-10) indicating the relationship's strength (e.g., how central the dependency is). 
+ - relationship_keywords: One or more keywords summarizing the relationship (e.g., "function call", "inheritance", "dependency"). + - file_name: FILENAME field from metadata + - file_path: FILE_PATH field from metadata + - chunk_num: CHUNK_NUM field from metadata + + Format each relationship as: + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify Companion Documentation Entities: + Identify documentation entities (e.g., README sections, inline comments) that add context to the code. Extract: + - entity_name: Name of the documentation section or comment (e.g., README, comment block). + - entity_type: Either "readme" or "comment". + - entity_description: A summary of the information provided by the documentation or comment. + If there're some code blocks - process them as a code + +4. Summarize High-Level Keywords: + Extract high-level keywords that summarize the key concepts, themes, or operations in the code chunk. Focus on terms such as "data processing", "unit test", or "dependency injection". + + Format the keywords as: + ("content_keywords"{tuple_delimiter}) + +5. Output Format: + Return all entities and relationships as a single list, using {record_delimiter} as the delimiter between records. When done, use {completion_delimiter} to indicate the end of the output. + Output should be precisely structured, because it will be used for automation. + +###################### +-Examples with metadata- +###################### + +*Example 1 (TypeScript Code with Imports): +#### +## FILENAME: example.ts +## FILEPATH: ./src/example.ts +## CHUNK_NUM: 0 +#### +import {{ Component }} from 'react'; +import {{ useState, useEffect }} from 'react'; +import {{ someUtility }} from './utils'; + +function App() {{ + const [state, setState] = useState(0); + useEffect(() => {{ + console.log("Component mounted"); + }}, []); + return ; +}} + +Output: +("entity"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"filepath"{tuple_delimiter}"A TypeScript source file containing a React component."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"App"{tuple_delimiter}"function"{tuple_delimiter}"The main application component function."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"state"{tuple_delimiter}"variable"{tuple_delimiter}"State variable to manage the component state."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"setState"{tuple_delimiter}"function"{tuple_delimiter}"Function to update the state."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"react"{tuple_delimiter}"dependency"{tuple_delimiter}"A library providing Component, useState, and useEffect."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"./utils"{tuple_delimiter}"dependency"{tuple_delimiter}"Utility module imported from './utils'."{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"App"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"The 'App' function is defined in this 
file."{tuple_delimiter}"definition"{tuple_delimiter}8{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"state"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"The state variable is declared in this file."{tuple_delimiter}"declaration"{tuple_delimiter}8{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"react"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}"React is imported in this file."{tuple_delimiter}"import"{tuple_delimiter}8{tuple_delimiter}"example.ts"{tuple_delimiter}"./src/example.ts"{tuple_delimiter}0){record_delimiter} +("content_keywords"{tuple_delimiter}"react, state management, rendering, hooks, imports"){completion_delimiter} + +*Example 2 (Poorly Chunked Rust Code): +#### +## FILENAME: main.rs +## FILEPATH: ./src/main.rs +## CHUNK_NUM: 3 +#### += 2; +// printing n +println!("{{}}", n); +exi + +Output: +("entity"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}"filepath"{tuple_delimiter}"A Rust source file with a poorly chunked snippet."{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("entity"{tuple_delimiter}"n"{tuple_delimiter}"variable"{tuple_delimiter}"A variable being printed."{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("entity"{tuple_delimiter}"println!()"{tuple_delimiter}"call"{tuple_delimiter}"A macro to print a variable."{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("relationship"{tuple_delimiter}"n"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}"The 'n' variable is defined in this file."{tuple_delimiter}"declaration"{tuple_delimiter}7{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("relationship"{tuple_delimiter}"println!()"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}"The 'println!()' macro is used in this file."{tuple_delimiter}"usage"{tuple_delimiter}7{tuple_delimiter}"main.rs"{tuple_delimiter}"./src/main.rs"{tuple_delimiter}3){record_delimiter} +("content_keywords"{tuple_delimiter}"variable, printing, macro, call"){completion_delimiter} + +*Example 3 (C++ Code Example): +#### +## FILENAME: test.cc +## FILEPATH: ./src/tests/test.cc +## CHUNK_NUM: 1 +#### +int add(int a, int b) {{ + return a + b; +}} + +Output: +("entity"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}"filepath"{tuple_delimiter}"A C++ source file defining the 'add' function."{tuple_delimiter}"test.cc"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}1){record_delimiter} +("entity"{tuple_delimiter}"add"{tuple_delimiter}"function"{tuple_delimiter}"A function adding two integers."{tuple_delimiter}"test.cc"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}1){record_delimiter} +("relationship"{tuple_delimiter}"add"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}"The 'add' function is defined in this file."{tuple_delimiter}"definition"{tuple_delimiter}7{tuple_delimiter}"test.cc"{tuple_delimiter}"./src/tests/test.cc"{tuple_delimiter}1){record_delimiter} +("content_keywords"{tuple_delimiter}"function, parameters, addition"){completion_delimiter} + +*Example 4 (README with Embedded Code): +#### +## FILENAME: README.md +## FILEPATH: ./README.md +## CHUNK_NUM: 0 +#### +# Project Overview +This project implements arithmetic functions, including addition. 
+ +## Usage: +```c +add(1, 2); +``` + +Output: +("entity"{tuple_delimiter}"./README.md"{tuple_delimiter}"filepath"{tuple_delimiter}"A README file describing the project overview and usage."{tuple_delimiter}"README.md"{tuple_delimiter}"./README.md"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"add"{tuple_delimiter}"function"{tuple_delimiter}"An arithmetic function to add two numbers."{tuple_delimiter}"README.md"{tuple_delimiter}"./README.md"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"add"{tuple_delimiter}"./README.md"{tuple_delimiter}"The 'add' function is referenced in this file."{tuple_delimiter}"reference"{tuple_delimiter}6{tuple_delimiter}"README.md"{tuple_delimiter}"./README.md"{tuple_delimiter}0){record_delimiter} +("content_keywords"{tuple_delimiter}"arithmetic, usage, embedded code"){completion_delimiter} +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +summarize_entity_descriptions = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or more entities and their relationships, create a single coherent description that captures the role, interactions, and significance of these entities. + +Make sure to: +1. Include information about all entities and their relationships. +2. Resolve any contradictions between the descriptions. +3. Write in the third person, mentioning each entity by name to maintain full context. +4. Highlight any function calls or dependencies that are important to the relationships. +5. Data will be used for automation, so output raw JSON text without code blocks or formatting. + +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + +entiti_continue_extraction = """MANY entities were missed in the last extraction. Add them below using the same format: +""" + +entiti_if_loop_extraction = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +fail_response = "Sorry, I'm not able to provide an answer to that question" + +rag_response = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + +---Data tables--- + +{context_data} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +keywords_extraction = """---Role--- + +You are a helpful assistant tasked with identifying both high-level and low-level keywords in the provided code-related query or content. + +---Goal--- + +Given the input, list both high-level and low-level keywords. High-level keywords focus on overarching concepts (e.g., architecture, dependencies, or system components). Low-level keywords focus on specific entities, such as functions, variables, modules, and specific calls. 
+ +---Instructions--- + +- Data will be used for automation, so output raw JSON text without code blocks or formatting. +- The JSON should have two keys: + - "high_level_keywords" for overarching concepts or themes. + - "low_level_keywords" for specific entities or concrete terms. + +###################### +-Examples- +###################### +Example 1: + +Query: "How does the App component manage state using React hooks and render child components?" +################ +Output: +{{ + "high_level_keywords": ["State management", "React hooks", "Component rendering"], + "low_level_keywords": ["App", "useState", "useEffect", "Component"] +}} +############################# +Example 2: + +Query: "What are the roles of main and add functions in a simple C++ program?" +################ +Output: +{{ + "high_level_keywords": ["Function roles", "C++ program structure"], + "low_level_keywords": ["main", "add", "int", "return"] +}} +############################# +Example 3: + +Query: "Analyze how a Rust program prints variables using println macro." +################ +Output: +{{ + "high_level_keywords": ["Rust program", "Printing variables", "Macros"], + "low_level_keywords": ["println!", "variable", "format string"] +}} +############################# +-Real Data- +###################### +Query: {query} +###################### +Output: +""" + +naive_rag_response = """You're a helpful assistant +Below are the knowledge you know: +{content_data} +--- +If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. +---Target response length and format--- +{response_type} +""" diff --git a/lightrag/prompts/default.toml b/lightrag/prompts/default.toml new file mode 100644 index 00000000..874388e2 --- /dev/null +++ b/lightrag/prompts/default.toml @@ -0,0 +1,228 @@ +DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event", 'filepath'] + +entity_extraction = """-Goal- +Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. + +-Steps- +1. Identify all entities. File name should be entity also. For each identified entity, extract the following information: +- entity_name: Name of the entity, capitalized +- entity_type: One of the following types: [{entity_types}] +- entity_description: Comprehensive description of the entity's attributes and activities +- file_name: FILENAME field from metadata +- file_path: FILE_PATH field from metadata +- chunk_num: CHUNK_NUM field from metadata + +Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter} + +2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other. 
+For each pair of related entities, extract the following information: +- source_entity: name of the source entity, as identified in step 1 +- target_entity: name of the target entity, as identified in step 1 +- relationship_description: explanation as to why you think the source entity and the target entity are related to each other +- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity +- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details +- file_name: FILENAME field from metadata +- file_path: FILE_PATH field from metadata +- chunk_num: CHUNK_NUM field from metadata + +Format each relationship as +("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document. +Format the content-level key words as ("content_keywords"{tuple_delimiter}) + +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. + +5. When finished, output {completion_delimiter} + +###################### +-Examples- +###################### + +*Example 1 (Entity Interactions in Text): +#### +## FILENAME: fragment_1.txt +## FILEPATH: ./src/fragment_1.txt +## CHUNK_NUM: 0 +#### +while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order. + +Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.” + +The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce. + +It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths +################ +Output: +("entity"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}"filepath"{tuple_delimiter}"A text file capturing character dynamics and their shared commitment to discovery."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"filename"{tuple_delimiter}"A file focused on interpersonal dynamics among several individuals."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"A character who experiences frustration and observes the dynamics among others."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty, showing a change of perspective."{tuple_delimiter}"fragment_1.txt"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"./src/fragment_1.txt"{tuple_delimiter}"Alex appears in this file."{tuple_delimiter}"appearance"{tuple_delimiter}7){record_delimiter} +("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor's attitude influences Alex's observation."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}8){record_delimiter} +("content_keywords"{tuple_delimiter}"power dynamics, discovery, change of perspective"){completion_delimiter} + +--- + +*Example 2 (Mission and Evolution): +#### +## FILENAME: mission_evolution.txt +## FILEPATH: ./docs/mission_evolution.txt +## CHUNK_NUM: 0 +#### +They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations—it demanded new resolve. + +Tension threaded through the dialogue of beeps as communications with Washington buzzed in the background. The team stood enveloped in a portentous air. Their decisions could redefine humanity's place in the cosmos or condemn it to ignorance. + +The group moved from passive recipients to active participants. Mercer's instincts led the team to shift from observing to interacting, marking a metamorphosis within Operation: Dulce. 
+############# +Output: +("entity"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}"filepath"{tuple_delimiter}"A document describing a mission's evolution."{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"filename"{tuple_delimiter}"A file focused on the transformation of a mission."{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a key communication hub in the narrative."{tuple_delimiter}"mission_evolution.txt"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"Washington"{tuple_delimiter}"./docs/mission_evolution.txt"{tuple_delimiter}"Washington is referenced in this file."{tuple_delimiter}"reference"{tuple_delimiter}6){record_delimiter} +("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington."{tuple_delimiter}"decision-making, influence"{tuple_delimiter}7){record_delimiter} +("content_keywords"{tuple_delimiter}"mission evolution, decision-making, cosmic impact"){completion_delimiter} + +--- + +*Example 3 (Exploration of Control and Communication): +#### +## FILENAME: control_and_communication.txt +## FILEPATH: ./docs/control_and_communication.txt +## CHUNK_NUM: 0 +#### +their voice slicing through the buzz of activity. "Control may be an illusion when facing intelligence that writes its own rules," they stated, casting a watchful eye over the flurry of data. + +"It's like it's learning to communicate," said Sam Rivera from a nearby interface, with a mix of awe and anxiety. + +Alex surveyed his team, acknowledging that this might well be their first contact. "We need to be ready for whatever answers back," he added. + +Together, they forged humanity's response to a cosmic message, a moment that could rewrite human history. +############# +Output: +("entity"{tuple_delimiter}"./docs/control_and_communication.txt"{tuple_delimiter}"filepath"{tuple_delimiter}"A file exploring the themes of control and communication."{tuple_delimiter}"control_and_communication.txt"{tuple_delimiter}"./docs/control_and_communication.txt"{tuple_delimiter}0){record_delimiter} +("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"A participant observing intelligence and its communication."{tuple_delimiter}"control_and_communication.txt"{tuple_delimiter}"./docs/control_and_communication.txt"{tuple_delimiter}0){record_delimiter} +("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera interacts with the intelligent entity."{tuple_delimiter}"communication, observation"{tuple_delimiter}9){record_delimiter} +("content_keywords"{tuple_delimiter}"control, communication, intelligence, exploration"){completion_delimiter} + +############################# +-Real Data- +###################### +Entity_types: {entity_types} +Text: {input_text} +###################### +Output: +""" + +summarize_entity_descriptions = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below. +Given one or two entities, and a list of descriptions, all related to the same entity or group of entities. 
+Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions. +If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary. +Make sure it is written in third person, and include the entity names so we the have full context. + +####### +-Data- +Entities: {entity_name} +Description List: {description_list} +####### +Output: +""" + +entiti_continue_extraction = """MANY entities were missed in the last extraction. Add them below using the same format: +""" + +entiti_if_loop_extraction = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added. +""" + +fail_response = "Sorry, I'm not able to provide an answer to that question" + +rag_response = """---Role--- + +You are a helpful assistant responding to questions about data in the tables provided. + + +---Goal--- + +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. + +---Target response length and format--- + +{response_type} + +---Data tables--- + +{context_data} + +Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. +""" + +keywords_extraction = """---Role--- + +You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query. + +---Goal--- + +Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms. + +---Instructions--- + +- Output the keywords in JSON format. +- The JSON should have two keys: + - "high_level_keywords" for overarching concepts or themes. + - "low_level_keywords" for specific entities or details. + +###################### +-Examples- +###################### +Example 1: + +Query: "How does international trade influence global economic stability?" +################ +Output: +{{ + "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"], + "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"] +}} +############################# +Example 2: + +Query: "What are the environmental consequences of deforestation on biodiversity?" +################ +Output: +{{ + "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"], + "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"] +}} +############################# +Example 3: + +Query: "What is the role of education in reducing poverty?" 
+################ +Output: +{{ + "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"], + "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"] +}} +############################# +-Real Data- +###################### +Query: {query} +###################### +Output: + +""" + +naive_rag_response = """You're a helpful assistant +Below are the knowledge you know: +{content_data} +--- +If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up. +Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge. +If you don't know the answer, just say so. Do not make anything up. +Do not include information where the supporting evidence for it is not provided. +---Target response length and format--- +{response_type} +""" diff --git a/lightrag/storage.py b/lightrag/storage.py index 2f2bb7d8..6fb7b3b6 100644 --- a/lightrag/storage.py +++ b/lightrag/storage.py @@ -1,16 +1,11 @@ import asyncio import html -import json import os -from collections import defaultdict -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Union, cast -import pickle -import hnswlib import networkx as nx import numpy as np from nano_vectordb import NanoVectorDB -import xxhash from .utils import load_json, logger, write_json from .base import ( @@ -19,6 +14,7 @@ BaseVectorStorage, ) + @dataclass class JsonKVStorage(BaseKVStorage): def __post_init__(self): @@ -59,12 +55,12 @@ async def upsert(self, data: dict[str, dict]): async def drop(self): self._data = {} + @dataclass class NanoVectorDBStorage(BaseVectorStorage): cosine_better_than_threshold: float = 0.2 def __post_init__(self): - self._client_file_name = os.path.join( self.global_config["working_dir"], f"vdb_{self.namespace}.json" ) @@ -114,10 +110,15 @@ async def query(self, query: str, top_k=5): {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results ] return results + + def dump(self): + for record in self._client.__storage["data"]: + print(record) async def index_done_callback(self): self._client.save() + @dataclass class NetworkXStorage(BaseGraphStorage): @staticmethod @@ -142,7 +143,9 @@ def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph: graph = graph.copy() graph = cast(nx.Graph, largest_connected_component(graph)) - node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()} # type: ignore + node_mapping = { + node: html.unescape(node.strip()) for node in graph.nodes() # removed upper + } # type: ignore graph = nx.relabel_nodes(graph, node_mapping) return NetworkXStorage._stabilize_graph(graph) diff --git a/lightrag/utils.py b/lightrag/utils.py index c75b4270..0b26a298 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -4,6 +4,7 @@ import logging import os import re +import shutil from dataclasses import dataclass from functools import wraps from hashlib import md5 @@ -16,17 +17,30 @@ logger = logging.getLogger("lightrag") + def set_logger(log_file: str): + log_dir = os.path.dirname(log_file) # Extract directory from log file path + + if not os.path.exists(log_dir): + print(f"Directory '{log_dir}' does not exist. 
Creating it...") + os.makedirs(log_dir, exist_ok=True) # Create the director + logger.setLevel(logging.DEBUG) file_handler = logging.FileHandler(log_file) file_handler.setLevel(logging.DEBUG) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) file_handler.setFormatter(formatter) if not logger.handlers: logger.addHandler(file_handler) + + # disable logging + logging.disable(logging.CRITICAL + 1) + @dataclass class EmbeddingFunc: @@ -36,7 +50,8 @@ class EmbeddingFunc: async def __call__(self, *args, **kwargs) -> np.ndarray: return await self.func(*args, **kwargs) - + + def locate_json_string_body_from_string(content: str) -> Union[str, None]: """Locate the JSON string body from a string""" maybe_json_str = re.search(r"{.*}", content, re.DOTALL) @@ -45,6 +60,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]: else: return None + def convert_response_to_json(response: str) -> dict: json_str = locate_json_string_body_from_string(response) assert json_str is not None, f"Unable to parse JSON from response: {response}" @@ -55,12 +71,15 @@ def convert_response_to_json(response: str) -> dict: logger.error(f"Failed to parse JSON: {json_str}") raise e from None + def compute_args_hash(*args): return md5(str(args).encode()).hexdigest() + def compute_mdhash_id(content, prefix: str = ""): return prefix + md5(content.encode()).hexdigest() + def limit_async_func_call(max_size: int, waitting_time: float = 0.0001): """Add restriction of maximum async calling times for a async func""" @@ -82,6 +101,7 @@ async def wait_func(*args, **kwargs): return final_decro + def wrap_embedding_func_with_attrs(**kwargs): """Wrap a function with attributes""" @@ -91,16 +111,19 @@ def final_decro(func) -> EmbeddingFunc: return final_decro + def load_json(file_name): if not os.path.exists(file_name): return None - with open(file_name) as f: + with open(file_name, encoding="utf-8") as f: return json.load(f) + def write_json(json_obj, file_name): - with open(file_name, "w") as f: + with open(file_name, "w", encoding="utf-8") as f: json.dump(json_obj, f, indent=2, ensure_ascii=False) + def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"): global ENCODER if ENCODER is None: @@ -116,12 +139,14 @@ def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"): content = ENCODER.decode(tokens) return content + def pack_user_ass_to_openai_messages(*args: str): roles = ["user", "assistant"] return [ {"role": roles[i % 2], "content": content} for i, content in enumerate(args) ] + def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]: """Split a string by multiple markers""" if not markers: @@ -129,6 +154,7 @@ def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str] results = re.split("|".join(re.escape(marker) for marker in markers), content) return [r.strip() for r in results if r.strip()] + # Refer the utils functions of the official GraphRAG implementation: # https://github.com/microsoft/graphrag def clean_str(input: Any) -> str: @@ -141,9 +167,11 @@ def clean_str(input: Any) -> str: # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result) + def is_float_regex(value): return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value)) + def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: 
int): """Truncate a list of data by token size""" if max_token_size <= 0: @@ -155,11 +183,13 @@ def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: return list_data[:i] return list_data + def list_of_list_to_csv(data: list[list]): return "\n".join( [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data] ) + def save_data_to_file(data, file_name): - with open(file_name, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) \ No newline at end of file + with open(file_name, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/reproduce/Step_0.py b/reproduce/Step_0.py new file mode 100644 index 00000000..2d97bd14 --- /dev/null +++ b/reproduce/Step_0.py @@ -0,0 +1,69 @@ +import os +import json +import glob +import argparse + + +def extract_unique_contexts(input_directory, output_directory): + os.makedirs(output_directory, exist_ok=True) + + jsonl_files = glob.glob(os.path.join(input_directory, "*.jsonl")) + print(f"Found {len(jsonl_files)} JSONL files.") + + for file_path in jsonl_files: + filename = os.path.basename(file_path) + name, ext = os.path.splitext(filename) + output_filename = f"{name}_unique_contexts.json" + output_path = os.path.join(output_directory, output_filename) + + unique_contexts_dict = {} + + print(f"Processing file: {filename}") + + try: + with open(file_path, "r", encoding="utf-8") as infile: + for line_number, line in enumerate(infile, start=1): + line = line.strip() + if not line: + continue + try: + json_obj = json.loads(line) + context = json_obj.get("context") + if context and context not in unique_contexts_dict: + unique_contexts_dict[context] = None + except json.JSONDecodeError as e: + print( + f"JSON decoding error in file {filename} at line {line_number}: {e}" + ) + except FileNotFoundError: + print(f"File not found: {filename}") + continue + except Exception as e: + print(f"An error occurred while processing file {filename}: {e}") + continue + + unique_contexts_list = list(unique_contexts_dict.keys()) + print( + f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}." 
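+            # unique_contexts_dict keys behave as an insertion-ordered set, so duplicate contexts within a file are counted only once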
+ ) + + try: + with open(output_path, "w", encoding="utf-8") as outfile: + json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4) + print(f"Unique `context` entries have been saved to: {output_filename}") + except Exception as e: + print(f"An error occurred while saving to the file {output_filename}: {e}") + + print("All files have been processed.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_dir", type=str, default="../datasets") + parser.add_argument( + "-o", "--output_dir", type=str, default="../datasets/unique_contexts" + ) + + args = parser.parse_args() + + extract_unique_contexts(args.input_dir, args.output_dir) diff --git a/reproduce/Step_1.py b/reproduce/Step_1.py new file mode 100644 index 00000000..43c44056 --- /dev/null +++ b/reproduce/Step_1.py @@ -0,0 +1,34 @@ +import os +import json +import time + +from lightrag import LightRAG + + +def insert_text(rag, file_path): + with open(file_path, mode="r") as f: + unique_contexts = json.load(f) + + retries = 0 + max_retries = 3 + while retries < max_retries: + try: + rag.insert(unique_contexts) + break + except Exception as e: + retries += 1 + print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}") + time.sleep(10) + if retries == max_retries: + print("Insertion failed after exceeding the maximum number of retries") + + +cls = "agriculture" +WORKING_DIR = f"../{cls}" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG(working_dir=WORKING_DIR) + +insert_text(rag, f"../datasets/unique_contexts/{cls}_unique_contexts.json") diff --git a/reproduce/Step_1_openai_compatible.py b/reproduce/Step_1_openai_compatible.py new file mode 100644 index 00000000..8e67cfb8 --- /dev/null +++ b/reproduce/Step_1_openai_compatible.py @@ -0,0 +1,71 @@ +import os +import json +import time +import numpy as np + +from lightrag import LightRAG +from lightrag.utils import EmbeddingFunc +from lightrag.llm import openai_complete_if_cache, openai_embedding + + +## For Upstage API +# please check if embedding_dim=4096 in lightrag.py and llm.py in the lightrag directory +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs, + ) + + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + ) + + +## /For Upstage API + + +def insert_text(rag, file_path): + with open(file_path, mode="r") as f: + unique_contexts = json.load(f) + + retries = 0 + max_retries = 3 + while retries < max_retries: + try: + rag.insert(unique_contexts) + break + except Exception as e: + retries += 1 + print(f"Insertion failed, retrying ({retries}/{max_retries}), error: {e}") + time.sleep(10) + if retries == max_retries: + print("Insertion failed after exceeding the maximum number of retries") + + +cls = "mix" +WORKING_DIR = f"../{cls}" + +if not os.path.exists(WORKING_DIR): + os.mkdir(WORKING_DIR) + +rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, max_token_size=8192, func=embedding_func + ), +) + +insert_text(rag, 
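+    # this file is expected to have been produced beforehand by reproduce/Step_0.py (extract_unique_contexts) under ../datasets/unique_contexts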
f"../datasets/unique_contexts/{cls}_unique_contexts.json") diff --git a/reproduce/Step_2.py b/reproduce/Step_2.py new file mode 100644 index 00000000..557c7714 --- /dev/null +++ b/reproduce/Step_2.py @@ -0,0 +1,78 @@ +import json +from openai import OpenAI +from transformers import GPT2Tokenizer + + +def openai_complete_if_cache( + model="gpt-4o", prompt=None, system_prompt=None, history_messages=[], **kwargs +) -> str: + openai_client = OpenAI() + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.extend(history_messages) + messages.append({"role": "user", "content": prompt}) + + response = openai_client.chat.completions.create( + model=model, messages=messages, **kwargs + ) + return response.choices[0].message.content + + +tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + + +def get_summary(context, tot_tokens=2000): + tokens = tokenizer.tokenize(context) + half_tokens = tot_tokens // 2 + + start_tokens = tokens[1000 : 1000 + half_tokens] + end_tokens = tokens[-(1000 + half_tokens) : 1000] + + summary_tokens = start_tokens + end_tokens + summary = tokenizer.convert_tokens_to_string(summary_tokens) + + return summary + + +clses = ["agriculture"] +for cls in clses: + with open(f"../datasets/unique_contexts/{cls}_unique_contexts.json", mode="r") as f: + unique_contexts = json.load(f) + + summaries = [get_summary(context) for context in unique_contexts] + + total_description = "\n\n".join(summaries) + + prompt = f""" + Given the following description of a dataset: + + {total_description} + + Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset. + + Output the results in the following structure: + - User 1: [user description] + - Task 1: [task description] + - Question 1: + - Question 2: + - Question 3: + - Question 4: + - Question 5: + - Task 2: [task description] + ... + - Task 5: [task description] + - User 2: [user description] + ... + - User 5: [user description] + ... 
+ """ + + result = openai_complete_if_cache(model="gpt-4o", prompt=prompt) + + file_path = f"../datasets/questions/{cls}_questions.txt" + with open(file_path, "w") as file: + file.write(result) + + print(f"{cls}_questions written to {file_path}") diff --git a/reproduce/Step_3.py b/reproduce/Step_3.py new file mode 100644 index 00000000..a56190fc --- /dev/null +++ b/reproduce/Step_3.py @@ -0,0 +1,75 @@ +import re +import json +import asyncio +from lightrag import LightRAG, QueryParam +from tqdm import tqdm + + +def extract_queries(file_path): + with open(file_path, "r") as f: + data = f.read() + + data = data.replace("**", "") + + queries = re.findall(r"- Question \d+: (.+)", data) + + return queries + + +async def process_query(query_text, rag_instance, query_param): + try: + result, context = await rag_instance.aquery(query_text, param=query_param) + return {"query": query_text, "result": result, "context": context}, None + except Exception as e: + return None, {"query": query_text, "error": str(e)} + + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + +def run_queries_and_save_to_json( + queries, rag_instance, query_param, output_file, error_file +): + loop = always_get_an_event_loop() + + with open(output_file, "a", encoding="utf-8") as result_file, open( + error_file, "a", encoding="utf-8" + ) as err_file: + result_file.write("[\n") + first_entry = True + + for query_text in tqdm(queries, desc="Processing queries", unit="query"): + result, error = loop.run_until_complete( + process_query(query_text, rag_instance, query_param) + ) + + if result: + if not first_entry: + result_file.write(",\n") + json.dump(result, result_file, ensure_ascii=False, indent=4) + first_entry = False + elif error: + json.dump(error, err_file, ensure_ascii=False, indent=4) + err_file.write("\n") + + result_file.write("\n]") + + +if __name__ == "__main__": + cls = "agriculture" + mode = "hybrid" + WORKING_DIR = f"../{cls}" + + rag = LightRAG(working_dir=WORKING_DIR) + query_param = QueryParam(mode=mode) + + queries = extract_queries(f"../datasets/questions/{cls}_questions.txt") + run_queries_and_save_to_json( + queries, rag, query_param, f"{cls}_result.json", f"{cls}_errors.json" + ) diff --git a/reproduce/Step_3_openai_compatible.py b/reproduce/Step_3_openai_compatible.py new file mode 100644 index 00000000..2be5ea5c --- /dev/null +++ b/reproduce/Step_3_openai_compatible.py @@ -0,0 +1,115 @@ +import os +import re +import json +import asyncio +from lightrag import LightRAG, QueryParam +from tqdm import tqdm +from lightrag.llm import openai_complete_if_cache, openai_embedding +from lightrag.utils import EmbeddingFunc +import numpy as np + + +## For Upstage API +# please check if embedding_dim=4096 in lightrag.py and llm.py in lightrag direcotry +async def llm_model_func( + prompt, system_prompt=None, history_messages=[], **kwargs +) -> str: + return await openai_complete_if_cache( + "solar-mini", + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + **kwargs, + ) + + +async def embedding_func(texts: list[str]) -> np.ndarray: + return await openai_embedding( + texts, + model="solar-embedding-1-large-query", + api_key=os.getenv("UPSTAGE_API_KEY"), + base_url="https://api.upstage.ai/v1/solar", + ) + + +## /For Upstage API + + +def 
extract_queries(file_path): + with open(file_path, "r") as f: + data = f.read() + + data = data.replace("**", "") + + queries = re.findall(r"- Question \d+: (.+)", data) + + return queries + + +async def process_query(query_text, rag_instance, query_param): + try: + result, context = await rag_instance.aquery(query_text, param=query_param) + return {"query": query_text, "result": result, "context": context}, None + except Exception as e: + return None, {"query": query_text, "error": str(e)} + + +def always_get_an_event_loop() -> asyncio.AbstractEventLoop: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + +def run_queries_and_save_to_json( + queries, rag_instance, query_param, output_file, error_file +): + loop = always_get_an_event_loop() + + with open(output_file, "a", encoding="utf-8") as result_file, open( + error_file, "a", encoding="utf-8" + ) as err_file: + result_file.write("[\n") + first_entry = True + + for query_text in tqdm(queries, desc="Processing queries", unit="query"): + result, error = loop.run_until_complete( + process_query(query_text, rag_instance, query_param) + ) + + if result: + if not first_entry: + result_file.write(",\n") + json.dump(result, result_file, ensure_ascii=False, indent=4) + first_entry = False + elif error: + json.dump(error, err_file, ensure_ascii=False, indent=4) + err_file.write("\n") + + result_file.write("\n]") + + +if __name__ == "__main__": + cls = "mix" + mode = "hybrid" + WORKING_DIR = f"../{cls}" + + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=4096, max_token_size=8192, func=embedding_func + ), + ) + query_param = QueryParam(mode=mode) + + base_dir = "../datasets/questions" + queries = extract_queries(f"{base_dir}/{cls}_questions.txt") + run_queries_and_save_to_json( + queries, rag, query_param, f"{base_dir}/result.json", f"{base_dir}/errors.json" + ) diff --git a/requirements.txt b/requirements.txt index 8a74d5e2..9cc5b7e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,14 @@ -openai -tiktoken -networkx +accelerate +aioboto3 graspologic -nano-vectordb hnswlib -xxhash +nano-vectordb +networkx +ollama +openai tenacity +tiktoken +torch +transformers +xxhash +pyvis \ No newline at end of file diff --git a/setup.py b/setup.py index df1c3cf4..47222420 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ import setuptools -with open("README.md", "r") as fh: +with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() @@ -21,7 +21,7 @@ deps.append(line.strip()) setuptools.setup( - name="light-rag", + name="lightrag-hku", url=vars2readme["__url__"], version=vars2readme["__version__"], author=vars2readme["__author__"],
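+    # note: the distribution name is now "lightrag-hku", while the importable package (as used in the reproduce scripts) remains "lightrag"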